// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};
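
/*
 * Illustrative sketch (added for clarity, not used by the code below):
 * callers index btrfs_raid_array by a BTRFS_RAID_* constant to read a
 * profile's constraints, e.g. how many copies RAID1 keeps of each block
 * and how many device failures it tolerates:
 *
 *	const struct btrfs_raid_attr *attr = &btrfs_raid_array[BTRFS_RAID_RAID1];
 *	int copies = attr->ncopies;			// 2
 *	int max_failures = attr->tolerated_failures;	// 1
 */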

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * volume_mutex
 * ------------
 * coarse lock owned by a mounted filesystem; used to exclude some operations
 * that cannot run in parallel and affect the higher-level properties of the
 * filesystem like: device add/deleting/resize/replace, or balance
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 */
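
/*
 * Illustrative sketch of the nesting above (for documentation only; the
 * fs_devices/fs_info variables are placeholders, and volume_mutex would
 * nest between uuid_mutex and device_list_mutex when it is needed):
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	... manipulate devices and chunks ...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 */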

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

static void free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		free_device(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
		u64 devid, const u8 *uuid)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 *  Search and remove all stale (devices which are not mounted) devices.
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided it will release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_dev)
{
	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
	struct btrfs_device *dev, *tmp_dev;

	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {

		if (fs_devs->opened)
			continue;

		list_for_each_entry_safe(dev, tmp_dev,
					 &fs_devs->devices, dev_list) {
			int not_found = 0;

			if (skip_dev && skip_dev == dev)
				continue;
			if (path && !dev->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(dev->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
				break;
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				free_device(dev);
			}
		}
	}
}
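
/*
 * For clarity (illustrative only): the two common call shapes are
 * releasing every stale device, and releasing only the entries that
 * shadow a path we just (re)registered, as done from device_list_add():
 *
 *	btrfs_free_stale_devices(NULL, NULL);	// drop all unmounted entries
 *	btrfs_free_stale_devices(path, device);	// drop duplicates of @path
 */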

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return ERR_PTR(-EBUSY);

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			free_device(device);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		btrfs_free_stale_devices(path, device);

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the
			 * one with the larger generation number or the
			 * last-in if the generations are equal.
			 */
			return ERR_PTR(-EEXIST);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return ERR_PTR(-ENOMEM);
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() is using the device_list_mutex, and
	 * sometimes a call to blkdev_put() leads the vfs back into
	 * this function. So do the blkdev_put() outside of the
	 * device_list_mutex, for now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct page *page;
	int ret = 0;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
		ret = -EINVAL;
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super);
	if (IS_ERR(device))
		ret = PTR_ERR(device);
	else
		*fs_devices_ret = device->fs_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}
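
/*
 * Illustrative call shape (a sketch, not a new code path in this file):
 * a device-registration caller such as the SCAN_DEV ioctl hands a
 * user-supplied device path to this function. The path string and the
 * holder token below are placeholders.
 *
 *	struct btrfs_fs_devices *fs_devices;
 *	int err = btrfs_scan_one_device("/dev/sdb", FMODE_READ,
 *					holder, &fs_devices);
 */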

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
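
/*
 * For clarity (illustrative only; the variables are placeholders): a caller
 * that wants to know how many bytes of a device are covered by dev extents
 * in a given byte range does something like:
 *
 *	u64 used = 0;
 *	ret = btrfs_account_dev_extents_size(device, 0,
 *					     device->total_bytes - 1, &used);
 */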

static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}


/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

1554 1555 1556 1557 1558 1559
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
1560
					  num_bytes, 0, start, len);
1561 1562
}
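
/*
 * Illustrative call (a sketch; @trans and @device are placeholders): ask
 * for a 1GiB hole and receive its start offset, or -ENOSPC with the start
 * and size of the largest hole that was found:
 *
 *	u64 dev_offset, hole_len;
 *	ret = find_free_dev_extent(trans, device, SZ_1G,
 *				   &dev_offset, &hole_len);
 */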

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info,
			    struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

1795
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1796 1797
			     struct btrfs_device *device)
{
1798
	struct btrfs_root *root = fs_info->chunk_root;
1799 1800 1801 1802 1803 1804 1805 1806 1807
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints of the whole
 * filesystem. It's up to the caller to adjust that number, e.g. to account for
 * an ongoing device replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_group[i]))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_mindev_error[i];

			if (ret)
				return ret;
		}
	}

	return 0;
}
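
/*
 * Example (illustrative): on a two-device RAID1 filesystem, devs_min for
 * RAID1 is 2, so btrfs_check_raid_min_devices(fs_info, 1) returns
 * BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET and the device removal is refused.
 */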

static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * (or this_dev) available.
 */
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
		struct btrfs_device *device, struct btrfs_device *this_dev)
{
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
								device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

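/*
 * Remove a device from a mounted filesystem: shrink it to zero, delete its
 * dev item and dev extents, unlink it from the device list and decrement
 * the device count in the superblock.
 */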
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&uuid_mutex);

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * The device list mutex makes sure that we don't change the device
	 * list while someone else is writing out all the device supers.
	 * Whoever is writing all supers should lock the device list mutex
	 * before getting the number of devices in the super block
	 * (super_copy). Conversely, whoever updates the number of devices
	 * in the super block (super_copy) should hold the device list mutex.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		device->fs_devices->missing_devices--;

	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		device->fs_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * At this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		__btrfs_close_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	mutex_unlock(&fs_info->volume_mutex);
	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_info->fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);

	/*
	 * In case of an fs with no seed, srcdev->fs_devices will point to
	 * the fs_devices of fs_info. However, when the device being replaced
	 * is a seed device, it will point to the seed's local fs_devices.
	 * In short, srcdev will have its correct fs_devices in both cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device_rcu);

	/* If there are no devices left, delete the fs_devices as well */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * devices left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
	mutex_lock(&uuid_mutex);
	WARN_ON(!tgtdev);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);

	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);

	if (tgtdev->bdev)
		fs_info->fs_devices->open_devices--;

	fs_info->fs_devices->num_devices--;

	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks() may lead
	 * to a call to btrfs_show_devname() which will try to hold
	 * device_list_mutex. Here this device is already out of the device
	 * list, so we don't have to hold device_list_mutex ourselves.
	 */
	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	call_rcu(&tgtdev->rcu, free_device_rcu);
}

static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
				     const char *device_path,
				     struct btrfs_device **device)
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct buffer_head *bh;

	*device = NULL;
	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				    fs_info->bdev_holder, 0, &bdev, &bh);
	if (ret)
		return ret;
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
	brelse(bh);
	if (!*device)
		ret = -ENOENT;
	blkdev_put(bdev, FMODE_READ);
	return ret;
}

int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
					 const char *device_path,
					 struct btrfs_device **device)
{
	*device = NULL;
	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		devices = &fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held by the caller.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&tmp->dev_state) && !tmp->bdev) {
				*device = tmp;
				break;
			}
		}

		if (!*device)
			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;

		return 0;
	} else {
		return btrfs_find_device_by_path(fs_info, device_path, device);
	}
}

/*
 * Lookup a device given by device id, or the path if the id is 0.
 */
int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
				 const char *devpath,
				 struct btrfs_device **device)
{
	int ret;

	if (devid) {
		ret = 0;
		*device = btrfs_find_device(fs_info, devid, NULL, NULL);
		if (!*device)
			ret = -ENOENT;
	} else {
		if (!devpath || !devpath[0])
			return -EINVAL;

		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
							   device);
	}
	return ret;
}

/*
 * Does all the dirty work required for changing file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = alloc_fs_devices(NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	mutex_unlock(&fs_info->chunk_mutex);

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * Store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

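/*
 * Add a new device to a mounted filesystem. If the mounted filesystem is a
 * seed filesystem, this also sprouts it: the fs gets a fresh fsid and the
 * original devices continue underneath as a read-only seed.
 */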
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	u64 tmp;
	int seeding_dev = 0;
	int ret = 0;
	bool unlocked = false;

	if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			goto error;
		}
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_device;
	}

	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		sb->s_flags &= ~SB_RDONLY;
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_info->fs_devices;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	fs_info->fs_devices->rw_devices++;
	fs_info->fs_devices->total_devices++;
	fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!blk_queue_nonrot(q))
		fs_info->fs_devices->rotating = 1;

	tmp = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(tmp + device->total_bytes, fs_info->sectorsize));

	tmp = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);

	/* add sysfs device entry */
	btrfs_sysfs_add_device_link(fs_info->fs_devices, device);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans, fs_info);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, fs_info, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];

		ret = btrfs_finish_sprout(trans, fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * Sprouting would change the fsid of the mounted root,
		 * so rename the fsid on sysfs.
		 */
		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
						fs_info->fsid);
		if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
			btrfs_warn(fs_info,
				   "sysfs: failed to create fsid for sprout");
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		unlocked = true;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/* Update ctime/mtime for libblkid */
	update_dev_time(device_path);
	return ret;

error_sysfs:
	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
error_trans:
	if (seeding_dev)
		sb->s_flags |= SB_RDONLY;
	if (trans)
		btrfs_end_transaction(trans);
error_free_device:
	free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (seeding_dev && !unlocked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}

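/*
 * Prepare the target device of a device replace: it inherits the source
 * device's sizes and is flagged BTRFS_DEV_STATE_REPLACE_TGT for the
 * duration of the copy.
 */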
int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				  const char *device_path,
				  struct btrfs_device *srcdev,
				  struct btrfs_device **device_out)
{
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct rcu_string *name;
	u64 devid = BTRFS_DEV_REPLACE_DEVID;
	int ret = 0;

	*device_out = NULL;
	if (fs_info->fs_devices->seeding) {
		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
		return -EINVAL;
	}

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev)) {
		btrfs_err(fs_info, "target device %s is invalid!", device_path);
		return PTR_ERR(bdev);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			btrfs_err(fs_info,
				  "target device is in the filesystem!");
			ret = -EEXIST;
			goto error;
		}
	}

	if (i_size_read(bdev->bd_inode) <
	    btrfs_device_get_total_bytes(srcdev)) {
		btrfs_err(fs_info,
			  "target device is smaller than source device!");
		ret = -EINVAL;
		goto error;
	}


	device = btrfs_alloc_device(NULL, &devid, NULL);
	if (IS_ERR(device)) {
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		free_device(device);
		ret = -ENOMEM;
		goto error;
	}
	rcu_assign_pointer(device->name, name);

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = 0;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
	device->commit_total_bytes = srcdev->commit_total_bytes;
	device->commit_bytes_used = device->bytes_used;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
	device->fs_devices = fs_info->fs_devices;
	list_add(&device->dev_list, &fs_info->fs_devices->devices);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	*device_out = device;
	return ret;

error:
	blkdev_put(bdev, FMODE_EXCL);
	return ret;
}

static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->fs_info->chunk_root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

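/*
 * Grow @device to @new_size (rounded down to a sector boundary): the
 * in-core superblock total and the device item are updated, and the device
 * is queued on the resized list so the change is committed with the
 * transaction.
 */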
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_fs_devices *fs_devices;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	fs_devices = fs_info->fs_devices;

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_devices->resized_devices);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	mutex_lock(&fs_info->chunk_mutex);
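	/*
	 * The sys_chunk_array in the superblock is a packed sequence of
	 * (btrfs_disk_key, btrfs_chunk + stripes) pairs; find the entry for
	 * @chunk_offset and memmove() the tail over it.
	 */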
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}

static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
					u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* callers are responsible for dropping em's ref. */
	return em;
}

int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, fs_info, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}

static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}

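/*
 * Walk the chunk tree back to front and relocate every SYSTEM chunk;
 * chunks that fail with -ENOSPC are counted and retried one more time
 * before giving up.
 */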
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * Return 1 if a data chunk was allocated successfully,
 * return <0 on errors during allocation of a data chunk,
 * return 0 if there is no need to allocate a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group_cache *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
		spin_lock(&fs_info->data_sinfo->lock);
		bytes_used = fs_info->data_sinfo->bytes_used;
		spin_unlock(&fs_info->data_sinfo->lock);

		if (!bytes_used) {
			struct btrfs_trans_handle *trans;
			int ret;

			trans =	btrfs_join_transaction(fs_info->tree_root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = btrfs_force_chunk_alloc(trans, fs_info,
						      BTRFS_BLOCK_GROUP_DATA);
			btrfs_end_transaction(trans);
			if (ret < 0)
				return ret;

			btrfs_add_raid_kobjects(fs_info);

			return 1;
		}
	}
	return 0;
}

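/*
 * Persist the balance control item (BTRFS_BALANCE_OBJECTID with
 * BTRFS_TEMPORARY_ITEM_KEY) in the tree root, so an interrupted balance
 * can be resumed after the next mount.
 */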
static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

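/* Remove the balance item, e.g. once balance completes or is canceled. */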
static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Should be called with both balance and volume mutexes held to
 * serialize other volume operations (add_dev/rm_dev/resize) with
 * restriper.  Same goes for unset_balance_control.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;

	BUG_ON(fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
}

static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}
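
/*
 * Example (illustrative, userspace syntax assumed): "btrfs balance start
 * -dprofiles=raid0" sets BTRFS_BALANCE_ARGS_PROFILES with a mask of
 * BTRFS_BLOCK_GROUP_RAID0, so any data chunk whose extended profile is not
 * RAID0 is filtered out above (return 1 means "skip this chunk").
 */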

static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->key.offset,
					bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->key.offset;
	else
		user_thresh_max = div_factor_fine(cache->key.offset,
					bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}
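
/*
 * Example (illustrative): with usage_min=5 and usage_max=50 on a 1GiB
 * chunk, only chunks whose used bytes fall in [~51MiB, 512MiB) remain
 * candidates for balancing; everything else is filtered out (ret == 1).
 */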

static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;
	else
		user_thresh = div_factor_fine(cache->key.offset,
					      bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
		factor = num_stripes / 2;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
		factor = num_stripes - 1;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
		factor = num_stripes - 2;
	} else {
		factor = num_stripes;
	}
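	/*
	 * Example (illustrative): a RAID6 chunk striped over 6 devices keeps
	 * 2 stripes of parity, so factor = num_stripes - 2 = 4 and each
	 * device holds chunk_length / 4 bytes of this chunk.
	 */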

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
			&& num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

I
Ilya Dryomov 已提交
3501 3502 3503 3504
	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
I
Ilya Dryomov 已提交
3505 3506 3507 3508
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3509
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
I
Ilya Dryomov 已提交
3510
		return 0;
3511
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3512
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3513
		return 0;
I
Ilya Dryomov 已提交
3514 3515 3516 3517 3518 3519
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct list_head *devices;
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and the min/max limits use the same bytes
	 * in the balance args, so save the originals before the counting pass.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	/* step one: make some room on all the devices */
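	/*
	 * Shrinking each writable device by up to 1M (at most a tenth of
	 * its size) and immediately growing it back relocates whatever sat
	 * in the reclaimed tail, leaving the balance a little guaranteed
	 * free space on every device.
	 */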
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		old_size = btrfs_device_get_total_bytes(device);
		size_to_free = div_factor(old_size, 1);
		size_to_free = min_t(u64, size_to_free, SZ_1M);
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
		    btrfs_device_get_total_bytes(device) -
		    btrfs_device_get_bytes_used(device) > size_to_free ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
		if (ret == -ENOSPC)
			break;
		if (ret) {
			/* btrfs_shrink_device never returns ret > 0 */
			WARN_ON(ret > 0);
			goto error;
		}

		trans = btrfs_start_transaction(dev_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		ret = btrfs_grow_device(trans, device, old_size);
		if (ret) {
			btrfs_end_transaction(trans);
			/* btrfs_grow_device never returns ret > 0 */
			WARN_ON(ret > 0);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		btrfs_end_transaction(trans);
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and the min/max limits use the same
		 * bytes in the balance args, so restore the saved values for
		 * the relocation pass.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(fs_info, leaf, chunk,
					   found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (counting) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply the limit_min filter; there is no need to check
		 * whether the LIMITS filter is used, since limit_min is 0
		 * by default.
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially lose the data's raid
			 * profile, so let's allocate an empty one in advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto error;
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}

/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

	/* true if exactly one bit set */
	return (flags & (flags - 1)) == 0;
}

static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
}

static void __cancel_balance(struct btrfs_fs_info *fs_info)
{
	int ret;

	unset_balance_control(fs_info);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);

	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}

/* Non-zero return value signifies invalidity */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
		u64 allowed)
{
	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		(!alloc_profile_is_valid(bctl_arg->target, 1) ||
		 (bctl_arg->target & ~allowed)));
}

/*
 * Should be called with both balance and volume mutexes held
 */
int btrfs_balance(struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    atomic_read(&fs_info->balance_cancel_req)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
				  "with mixed groups data and metadata balance options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		BUG_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
	if (num_devices > 1)
		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
	if (num_devices > 2)
		allowed |= BTRFS_BLOCK_GROUP_RAID5;
	if (num_devices > 3)
		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
			    BTRFS_BLOCK_GROUP_RAID6);
	if (validate_convert_profile(&bctl->data, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target data profile %llu",
			  bctl->data.target);
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->meta, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target metadata profile %llu",
			  bctl->meta.target);
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->sys, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target system profile %llu",
			  bctl->sys.target);
		ret = -EINVAL;
		goto out;
	}

	/* allow to reduce meta or sys integrity only if force set */
	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
			BTRFS_BLOCK_GROUP_RAID10 |
			BTRFS_BLOCK_GROUP_RAID5 |
			BTRFS_BLOCK_GROUP_RAID6;
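	/*
	 * The seqlock retry loop below guards against the avail_*_alloc_bits
	 * changing under us while we check whether the requested conversion
	 * would reduce metadata or system redundancy.
	 */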
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed))) {
			if (bctl->flags & BTRFS_BALANCE_FORCE) {
				btrfs_info(fs_info,
					   "force reducing metadata integrity");
			} else {
				btrfs_err(fs_info,
					  "balance will reduce metadata integrity, use force if you want this");
				ret = -EINVAL;
				goto out;
			}
		}
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	/* if we're not converting, the target field is uninitialized */
	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->meta.target : fs_info->avail_metadata_alloc_bits;
	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->data.target : fs_info->avail_data_alloc_bits;
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
		btrfs_warn(fs_info,
			   "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
			   meta_target, data_target);
	}

	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		set_balance_control(bctl);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	atomic_inc(&fs_info->balance_running);
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	atomic_dec(&fs_info->balance_running);

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		update_ioctl_balance_args(fs_info, 0, bargs);
	}

	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		__cancel_balance(fs_info);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		__cancel_balance(fs_info);
	else {
		kfree(bctl);
		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	}
	return ret;
}

static int balance_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	if (fs_info->balance_ctl) {
		btrfs_info(fs_info, "continuing balance");
		ret = btrfs_balance(fs_info->balance_ctl, NULL);
	}

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);

	return ret;
}

int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	spin_lock(&fs_info->balance_lock);
	if (!fs_info->balance_ctl) {
		spin_unlock(&fs_info->balance_lock);
		return 0;
	}
	spin_unlock(&fs_info->balance_lock);

	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "force skipping balance");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who paused it (currently the system or the user),
	 * so set the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}

int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->fs_info = fs_info;
	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	set_balance_control(bctl);

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (atomic_read(&fs_info->balance_running)) {
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(atomic_read(&fs_info->balance_running));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	if (sb_rdonly(fs_info->sb))
		return -EROFS;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * If balance is running, just wait for it to finish and return;
	 * the balance item is deleted in btrfs_balance() in that case.
	 */
	if (atomic_read(&fs_info->balance_running)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);
		mutex_lock(&fs_info->balance_mutex);
	} else {
		/* __cancel_balance needs volume_mutex */
		mutex_unlock(&fs_info->balance_mutex);
		mutex_lock(&fs_info->volume_mutex);
		mutex_lock(&fs_info->balance_mutex);

		if (fs_info->balance_ctl)
			__cancel_balance(fs_info);

		mutex_unlock(&fs_info->volume_mutex);
	}

	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

static int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
	struct btrfs_trans_handle *trans = NULL;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &key, path,
				BTRFS_OLDEST_GENERATION);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}

		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
			continue;
		} else {
			goto skip;
		}
update_tree:
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
			ret = btrfs_uuid_tree_add(trans, fs_info,
						  root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4279
			ret = btrfs_uuid_tree_add(trans, fs_info,
S
Stefan Behrens 已提交
4280 4281 4282 4283
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
4284
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
S
Stefan Behrens 已提交
4285 4286 4287 4288 4289
					ret);
				break;
			}
		}

skip:
		if (trans) {
			ret = btrfs_end_transaction(trans);
			trans = NULL;
			if (ret)
				break;
		}

		btrfs_release_path(path);
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (ret)
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
	else
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}

/*
 * Callback for btrfs_uuid_tree_iterate().
 * returns:
 * 0	check succeeded, the entry is not outdated.
 * < 0	if an error occurred.
 * > 0	if the check failed, which means the caller shall remove the entry.
 */
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				       u8 *uuid, u8 type, u64 subid)
{
	struct btrfs_key key;
	int ret = 0;
	struct btrfs_root *subvol_root;

	if (type != BTRFS_UUID_KEY_SUBVOL &&
	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
		goto out;

	key.objectid = subid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(subvol_root)) {
		ret = PTR_ERR(subvol_root);
		if (ret == -ENOENT)
			ret = 1;
		goto out;
	}

	switch (type) {
	case BTRFS_UUID_KEY_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
			ret = 1;
		break;
	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.received_uuid,
			   BTRFS_UUID_SIZE))
			ret = 1;
		break;
	}

out:
	return ret;
}

static int btrfs_uuid_rescan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
	int ret;

	/*
	 * 1st step is to iterate through the existing UUID tree and
	 * to delete all entries that contain outdated data.
	 * 2nd step is to add all missing entries to the UUID tree.
	 */
	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
	if (ret < 0) {
		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
		up(&fs_info->uuid_tree_rescan_sem);
		return ret;
	}
	return btrfs_uuid_scan_kthread(data);
}

int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *uuid_root;
	struct task_struct *task;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	uuid_root = btrfs_create_tree(trans, fs_info,
				      BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(uuid_root)) {
		ret = PTR_ERR(uuid_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = uuid_root;

	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_rescan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	bool checked_pending_chunks = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;

	new_size = round_down(new_size, fs_info->sectorsize);
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;

	mutex_lock(&fs_info->chunk_mutex);

	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}
	mutex_unlock(&fs_info->chunk_mutex);

again:
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto done;
		if (ret) {
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially lose the data's raid profile,
		 * so let's allocate an empty one in advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto done;
		if (ret == -ENOSPC)
			failed++;
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * We checked in the above loop all device extents that were already in
	 * the device tree. However before we have updated the device's
	 * total_bytes to the new size, we might have had chunk allocations that
	 * have not completed yet (new block groups attached to transaction
	 * handles), and therefore their device extents were not yet in the
	 * device tree and we missed them in the loop above. So if we have any
	 * pending chunk using a device extent that overlaps the device range
	 * we can no longer use, commit the current transaction and repeat the
	 * search on the device tree - this way we guarantee we will not have
	 * chunks using device extents that end beyond 'new_size'.
	 */
	if (!checked_pending_chunks) {
		u64 start = new_size;
		u64 len = old_size - new_size;

		if (contains_pending_extent(trans->transaction, device,
					    &start, len)) {
			mutex_unlock(&fs_info->chunk_mutex);
			checked_pending_chunks = true;
			failed = 0;
			retried = false;
			ret = btrfs_commit_transaction(trans);
			if (ret)
				goto done;
			goto again;
		}
	}

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_info->fs_devices->resized_devices);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_end_transaction(trans);
done:
	btrfs_free_path(path);
	if (ret) {
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size + sizeof(disk_key)
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EFBIG;
	}

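	/*
	 * Entries in sys_chunk_array are packed back to back as
	 * (disk key, chunk item) pairs; append the new pair at the
	 * current end of the array.
	 */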
	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
	mutex_unlock(&fs_info->chunk_mutex);

	return 0;
}

/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;

	if (di_a->max_avail > di_b->max_avail)
		return -1;
	if (di_a->max_avail < di_b->max_avail)
		return 1;
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
}

static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
		return;

	btrfs_set_fs_incompat(info, RAID56);
}

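/*
 * Both limits below count how many btrfs_stripe entries fit in the space
 * available for a chunk item: the free space of a tree leaf for regular
 * chunks, or the superblock's sys_chunk_array (which must hold two keys
 * and two chunk items) for system chunks. The '+ 1' accounts for the
 * stripe already embedded in struct btrfs_chunk.
 */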
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)	\
			- sizeof(struct btrfs_chunk))		\
			/ sizeof(struct btrfs_stripe) + 1)

#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
				- 2 * sizeof(struct btrfs_disk_key)	\
				- 2 * sizeof(struct btrfs_chunk))	\
				/ sizeof(struct btrfs_stripe) + 1)

static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device *device;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_device_info *devices_info = NULL;
	u64 total_avail;
	int num_stripes;	/* total number of stripes to allocate */
	int data_stripes;	/* number of stripes that count for
				   block group size */
	int sub_stripes;	/* sub_stripes info for map */
	int dev_stripes;	/* stripes per dev */
	int devs_max;		/* max devs to use */
	int devs_min;		/* min devs needed */
	int devs_increment;	/* ndevs has to be a multiple of this */
	int ncopies;		/* how many copies to data has */
	int ret;
	u64 max_stripe_size;
	u64 max_chunk_size;
	u64 stripe_size;
	u64 num_bytes;
	int ndevs;
	int i;
	int j;
	int index;

	BUG_ON(!alloc_profile_is_valid(type, 0));

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return -ENOSPC;
	}

	index = btrfs_bg_flags_to_raid_index(type);

	sub_stripes = btrfs_raid_array[index].sub_stripes;
	dev_stripes = btrfs_raid_array[index].dev_stripes;
	devs_max = btrfs_raid_array[index].devs_max;
	devs_min = btrfs_raid_array[index].devs_min;
	devs_increment = btrfs_raid_array[index].devs_increment;
	ncopies = btrfs_raid_array[index].ncopies;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_stripe_size = SZ_1G;
		max_chunk_size = 10 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* for larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			max_stripe_size = SZ_1G;
		else
			max_stripe_size = SZ_256M;
		max_chunk_size = max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		max_stripe_size = SZ_32M;
		max_chunk_size = 2 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
	} else {
		btrfs_err(info, "invalid chunk type 0x%llx requested",
		       type);
		BUG_ON(1);
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return -ENOMEM;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	ndevs = 0;
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 max_avail;
		u64 dev_offset;

		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail == 0)
			continue;

		ret = find_free_dev_extent(trans, device,
					   max_stripe_size * dev_stripes,
					   &dev_offset, &max_avail);
		if (ret && ret != -ENOSPC)
			goto error;

		if (ret == 0)
			max_avail = max_stripe_size * dev_stripes;

		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%u",
					    __func__, device->devid, max_avail,
					    BTRFS_STRIPE_LEN * dev_stripes);
			continue;
		}

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
4833 4834 4835 4836 4837 4838
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	/* round down to number of usable stripes */
	ndevs = round_down(ndevs, devs_increment);

	if (ndevs < devs_min) {
		ret = -ENOSPC;
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ndevs, devs_min);
		}
		goto error;
	}

	ndevs = min(ndevs, devs_max);

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, and
	 * max_avail is the total size, so we have to adjust.
	 */
	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
	num_stripes = ndevs * dev_stripes;

	/*
	 * this will have to be fixed for RAID1 and RAID10 over
	 * more drives
	 */
	data_stripes = num_stripes / ncopies;

	if (type & BTRFS_BLOCK_GROUP_RAID5)
		data_stripes = num_stripes - 1;

	if (type & BTRFS_BLOCK_GROUP_RAID6)
		data_stripes = num_stripes - 2;

	/*
	 * Use the number of data stripes to figure out how big this chunk
	 * is really going to be in terms of logical address space,
	 * and compare that answer with the max chunk size
	 */
	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = div_u64(max_chunk_size, data_stripes);

		/* bump the answer up to a 16MB boundary */
		stripe_size = round_up(stripe_size, SZ_16M);

		/*
		 * But don't go higher than the limits we found while searching
		 * for free extents
		 */
		stripe_size = min(devices_info[ndevs - 1].max_avail,
				  stripe_size);
	}
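	/*
	 * Worked example: six devices in RAID6 give num_stripes = 6 and
	 * data_stripes = 4, so a 1G stripe_size would map 4G of logical
	 * address space; the block above scales stripe_size back down
	 * whenever that product exceeds max_chunk_size.
	 */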

	/* align to BTRFS_STRIPE_LEN */
	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		ret = -ENOMEM;
		goto error;
	}
	map->num_stripes = num_stripes;

	for (i = 0; i < ndevs; ++i) {
		for (j = 0; j < dev_stripes; ++j) {
			int s = i * dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * stripe_size;
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = sub_stripes;

	num_bytes = stripe_size * data_stripes;

	trace_btrfs_chunk_alloc(info, map, start, num_bytes);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		ret = -ENOMEM;
		goto error;
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = num_bytes;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = stripe_size;

	em_tree = &info->mapping_tree.map_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		free_extent_map(em);
		goto error;
	}

	list_add_tail(&em->list, &trans->transaction->pending_chunks);
	refcount_inc(&em->refs);
	write_unlock(&em_tree->lock);

	ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
	if (ret)
		goto error_del_extent;

	for (i = 0; i < map->num_stripes; i++) {
		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
	}

	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);

	free_extent_map(em);
	check_raid56_incompat_flag(info, type);

	kfree(devices_info);
	return 0;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);
	/* One for the pending_chunks list reference */
	free_extent_map(em);
error:
	kfree(devices_info);
	return ret;
}

int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				u64 chunk_offset, u64 chunk_size)
{
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_device *device;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	u64 dev_offset;
	u64 stripe_size;
	int i = 0;
	int ret = 0;

	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);
	stripe_size = em->orig_block_len;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with the map's stripes, because the device object's id can change
	 * at any time during that final phase of the device replace operation
	 * (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		ret = btrfs_update_device(trans, device);
		if (ret)
			break;
		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
					     dev_offset, stripe_size);
		if (ret)
			break;
	}
	if (ret) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		goto out;
	}

	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;
		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		/*
		 * TODO: Cleanup of inserted chunk root in case of
		 * failure.
		 */
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}

/*
 * Chunk allocation falls into two parts. The first part does the work
 * that makes the newly allocated chunk usable, but does not do any
 * operation that modifies the chunk tree. The second part does the work
 * that requires modifying the chunk tree. This division is important for
 * the bootstrap process of adding storage to a seed btrfs.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info, u64 type)
{
	u64 chunk_offset;

	lockdep_assert_held(&fs_info->chunk_mutex);
	chunk_offset = find_next_chunk(fs_info);
	return __btrfs_alloc_chunk(trans, chunk_offset, type);
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 alloc_profile;
	int ret;

	chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
	if (ret)
		return ret;

	sys_chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_system_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
	return ret;
}

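/*
 * How many stripe failures a chunk of the given profile can absorb:
 * one for the mirrored and single-parity profiles, two for RAID6,
 * and zero for the unreplicated ones.
 */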
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
	int max_errors;

	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
			 BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID5 |
			 BTRFS_BLOCK_GROUP_DUP)) {
		max_errors = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
		max_errors = 2;
	} else {
		max_errors = 0;
	}

	return max_errors;
}

int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	int readonly = 0;
	int miss_ndevs = 0;
	int i;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return 1;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; i++) {
		if (test_bit(BTRFS_DEV_STATE_MISSING,
					&map->stripes[i].dev->dev_state)) {
			miss_ndevs++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&map->stripes[i].dev->dev_state)) {
			readonly = 1;
			goto end;
		}
	}

	/*
	 * If the number of missing devices is larger than max errors,
	 * we cannot write the data into that chunk successfully, so
	 * set it readonly.
	 */
	if (miss_ndevs > btrfs_chunk_max_errors(map))
		readonly = 1;
end:
	free_extent_map(em);
	return readonly;
}

void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
	extent_map_tree_init(&tree->map_tree);
}

void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
	struct extent_map *em;

	while (1) {
		write_lock(&tree->map_tree.lock);
		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(&tree->map_tree, em);
		write_unlock(&tree->map_tree.lock);
		if (!em)
			break;
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

	em = get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably end up doing the same thing anyway:
		 * nothing else and exit. So return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;

	map = em->map_lookup;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		/*
		 * There could be two corrupted data stripes, so we need
		 * to retry in a loop to rebuild the correct data.
		 *
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
	else
		ret = 1;
	free_extent_map(em);

	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
	    fs_info->dev_replace.tgtdev)
		ret++;
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	return ret;
}

unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
	unsigned long len = fs_info->sectorsize;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = map->stripe_len * nr_data_stripes(map);
		free_extent_map(em);
	}
	return len;
}

5268
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret = 0;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
	return ret;
}

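/*
 * Pick one live stripe out of a RAID1/RAID10 stripe set for reading.  The
 * preferred mirror is derived from the caller's pid; the dev-replace source
 * device is avoided unless it is the only stripe left with a bdev.
 */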
static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	preferred_mirror = first + current->pid % num_stripes;

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}

static inline int parity_smaller(u64 a, u64 b)
{
	return a > b;
}

/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
	struct btrfs_bio_stripe s;
	int i;
	u64 l;
	int again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < num_stripes - 1; i++) {
			if (parity_smaller(bbio->raid_map[i],
					   bbio->raid_map[i+1])) {
				s = bbio->stripes[i];
				l = bbio->raid_map[i];
				bbio->stripes[i] = bbio->stripes[i+1];
				bbio->raid_map[i] = bbio->raid_map[i+1];
				bbio->stripes[i+1] = s;
				bbio->raid_map[i+1] = l;

				again = 1;
			}
		}
	}
}

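/*
 * Allocate a btrfs_bio together with its trailing stripe array, target
 * device index array and raid_map, all in one allocation.  With
 * __GFP_NOFAIL the allocation cannot fail.
 */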
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
	struct btrfs_bio *bbio = kzalloc(
		 /* the size of the btrfs_bio */
		sizeof(struct btrfs_bio) +
		/* plus the variable array for the stripes */
		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
		/* plus the variable array for the tgt dev */
		sizeof(int) * (real_stripes) +
		/*
		 * plus the raid_map, which includes both the tgt dev
		 * and the stripes
		 */
		sizeof(u64) * (total_stripes),
		GFP_NOFS|__GFP_NOFAIL);

	atomic_set(&bbio->error, 0);
	refcount_set(&bbio->refs, 1);

	return bbio;
}

void btrfs_get_bbio(struct btrfs_bio *bbio)
{
	WARN_ON(!refcount_read(&bbio->refs));
	refcount_inc(&bbio->refs);
}

void btrfs_put_bbio(struct btrfs_bio *bbio)
{
	if (!bbio)
		return;
	if (refcount_dec_and_test(&bbio->refs))
		kfree(bbio);
}

/*
 * Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE?
 *
 * Please note that discard won't be sent to the target device of a
 * running device replace.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 struct btrfs_bio **bbio_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_bio *bbio;
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u64 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret = 0;
	int i;

	/* discard always returns a bbio */
	ASSERT(bbio_ret);

	em = get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	offset = logical - em->start;
	length = min_t(u64, em->len - offset, length);

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;

	stripe_nr_end = round_up(offset + length, map->stripe_len);
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = map->num_stripes;
	} else {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	bbio = alloc_btrfs_bio(num_stripes, 0);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			bbio->stripes[i].length = stripes_per_dev *
				map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				bbio->stripes[i].length +=
					map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				bbio->stripes[i].length -=
					stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				bbio->stripes[i].length -=
					stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			bbio->stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}

/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_bio *bbio = NULL;
	int num_stripes;
	int index_srcdev = 0;
	int found = 0;
	u64 physical_of_found = 0;
	int i;
	int ret = 0;

	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bbio, 0, 0);
	if (ret) {
		ASSERT(bbio == NULL);
		return ret;
	}

	num_stripes = bbio->num_stripes;
	if (*mirror_num > num_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < num_stripes; i++) {
		if (bbio->stripes[i].dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (found &&
		    physical_of_found <= bbio->stripes[i].physical)
			continue;

		index_srcdev = i;
		found = 1;
		physical_of_found = bbio->stripes[i].physical;
	}

	btrfs_put_bbio(bbio);

	ASSERT(found);
	if (!found)
		return -EIO;

	*mirror_num = index_srcdev + 1;
	*physical = physical_of_found;
	return ret;
}

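/*
 * Adjust a freshly built stripe list for a running device replace: writes to
 * the source device are duplicated to the target device, and for
 * BTRFS_MAP_GET_READ_MIRRORS the target can be appended as an extra read
 * mirror for the area that has already been copied.
 */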
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_bio **bbio_ret,
				      struct btrfs_dev_replace *dev_replace,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_bio *bbio = *bbio_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int tgtdev_indexes = 0;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int index_where_to_add;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_bio_stripe *new =
					bbio->stripes + index_where_to_add;
				struct btrfs_bio_stripe *old =
					bbio->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				bbio->tgtdev_map[i] = index_where_to_add;
				index_where_to_add++;
				max_errors++;
				tgtdev_indexes++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it simple,
				 * only add the mirror with the lowest physical
				 * address
				 */
				if (found &&
				    physical_of_found <=
				     bbio->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bbio->stripes[i].physical;
			}
		}
		if (found) {
			struct btrfs_bio_stripe *tgtdev_stripe =
				bbio->stripes + num_stripes;

			tgtdev_stripe->physical = physical_of_found;
			tgtdev_stripe->length =
				bbio->stripes[index_srcdev].length;
			tgtdev_stripe->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[index_srcdev] = num_stripes;

			tgtdev_indexes++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bbio->num_tgtdevs = tgtdev_indexes;
	*bbio_ret = bbio;
}

static bool need_full_stripe(enum btrfs_map_op op)
{
	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}

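/*
 * Core logical-to-physical mapping routine.  Look up the chunk covering
 * @logical, compute the stripe number/offset for its RAID profile, clamp
 * *length so a bio never straddles a stripe (or a full stripe for RAID5/6
 * writes), and return a btrfs_bio describing every device and physical
 * offset involved, including extra stripes for a running device replace.
 */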
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u32 stripe_index;
	int i;
	int ret = 0;
	int num_stripes;
	int max_errors = 0;
	int tgtdev_indexes = 0;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	int num_alloc_stripes;
	int patch_the_first_stripe_for_dev_replace = 0;
	u64 physical_to_patch_in_first_stripe = 0;
	u64 raid56_full_stripe_start = (u64)-1;

	if (op == BTRFS_MAP_DISCARD)
		return __btrfs_map_block_for_discard(fs_info, logical,
						     *length, bbio_ret);

	em = get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	offset = logical - em->start;

	stripe_len = map->stripe_len;
	stripe_nr = offset;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(stripe_nr, stripe_len);

	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
			   "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
			   stripe_offset, offset, em->start, logical,
			   stripe_len);
		free_extent_map(em);
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;

	/* if we're here for raid56, we need to know the stripe aligned start */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
		raid56_full_stripe_start = offset;

		/* allow a write of a full stripe, but make sure we don't
		 * allow straddling of stripes
		 */
		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
				full_stripe_len);
		raid56_full_stripe_start *= full_stripe_len;
	}

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len;
		/* For writes to RAID[56], allow a full stripeset across all disks.
		   For other RAID types and for RAID[56] reads, just allow a single
		   stripe (on a single disk). */
		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
		    (op == BTRFS_MAP_WRITE)) {
			max_len = stripe_len * nr_data_stripes(map) -
				(offset - raid56_full_stripe_start);
		} else {
			/* we limit the length of each bio to what fits in a stripe */
			max_len = stripe_len - stripe_offset;
		}
		*length = min_t(u64, em->len - offset, max_len);
	} else {
		*length = em->len - offset;
	}

	/* This is for when we're called from btrfs_merge_bio_hook() and all
	   it cares about is the length */
	if (!bbio_ret)
		goto out;

	btrfs_dev_replace_read_lock(dev_replace);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (!dev_replace_is_ongoing)
		btrfs_dev_replace_read_unlock(dev_replace);
	else
		btrfs_dev_replace_set_lock_blocking(dev_replace);

	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
						    dev_replace->srcdev->devid,
						    &mirror_num,
					    &physical_to_patch_in_first_stripe);
		if (ret)
			goto out;
		else
			patch_the_first_stripe_for_dev_replace = 1;
	} else if (mirror_num > map->num_stripes) {
		mirror_num = 0;
	}

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		if (!need_full_stripe(op))
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		if (need_full_stripe(op))
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
		else {
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (need_full_stripe(op)) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= map->sub_stripes;

		if (need_full_stripe(op))
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
			/* push stripe_nr back to the start of the full stripe */
			stripe_nr = div64_u64(raid56_full_stripe_start,
					stripe_len * nr_data_stripes(map));

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = nr_parity_stripes(map);

			*length = map->stripe_len;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_nr = div_u64_rem(stripe_nr,
					nr_data_stripes(map), &stripe_index);
			if (mirror_num > 1)
				stripe_index = nr_data_stripes(map) +
						mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
					&stripe_index);
			if (!need_full_stripe(op) && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * after this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		mirror_num = stripe_index + 1;
	}
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
		if (op == BTRFS_MAP_WRITE)
			num_alloc_stripes <<= 1;
		if (op == BTRFS_MAP_GET_READ_MIRRORS)
			num_alloc_stripes++;
		tgtdev_indexes = num_stripes;
	}

	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);

	/* build raid_map */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
		u64 tmp;
		unsigned rot;

		bbio->raid_map = (u64 *)((void *)bbio->stripes +
				 sizeof(struct btrfs_bio_stripe) *
				 num_alloc_stripes +
				 sizeof(int) * tgtdev_indexes);

		/* Work out the disk rotation on this stripe-set */
		div_u64_rem(stripe_nr, num_stripes, &rot);

		/* Fill in the logical address of each stripe */
		tmp = stripe_nr * nr_data_stripes(map);
		for (i = 0; i < nr_data_stripes(map); i++)
			bbio->raid_map[(i+rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bbio->raid_map[(i+rot+1) % num_stripes] =
				RAID6_Q_STRIPE;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset +
			stripe_nr * map->stripe_len;
		bbio->stripes[i].dev =
			map->stripes[stripe_index].dev;
		stripe_index++;
	}

	if (need_full_stripe(op))
		max_errors = btrfs_chunk_max_errors(map);

	if (bbio->raid_map)
		sort_parity_stripes(bbio, num_stripes);

	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    need_full_stripe(op)) {
		handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
					  &max_errors);
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
out:
	if (dev_replace_is_ongoing) {
		btrfs_dev_replace_clear_lock_blocking(dev_replace);
		btrfs_dev_replace_read_unlock(dev_replace);
	}
	free_extent_map(em);
	return ret;
}

int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		      u64 logical, u64 *length,
		      struct btrfs_bio **bbio_ret, int mirror_num)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				 mirror_num, 0);
}

/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		     u64 logical, u64 *length,
		     struct btrfs_bio **bbio_ret)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}

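/*
 * Reverse mapping: given a chunk start and a physical offset (optionally
 * restricted to one devid), collect the logical addresses that map to it.
 */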
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	u64 rmap_len;
	int i, j, nr = 0;

	em = get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	length = em->len;
	rmap_len = map->stripe_len;

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		length = div_u64(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		length = div_u64(length, map->num_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		length = div_u64(length, nr_data_stripes(map));
		rmap_len = map->stripe_len * nr_data_stripes(map);
	}

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	BUG_ON(!buf); /* -ENOMEM */

	for (i = 0; i < map->num_stripes; i++) {
		if (devid && map->stripes[i].dev->devid != devid)
			continue;
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		} /* else if RAID[56], multiply by nr_data_stripes().
		   * Alternatively, just use rmap_len below instead of
		   * map->stripe_len */

		bytenr = chunk_start + stripe_nr * rmap_len;
		WARN_ON(nr >= map->num_stripes);
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = rmap_len;

	free_extent_map(em);
	return 0;
}

static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
	bio->bi_private = bbio->private;
	bio->bi_end_io = bbio->end_io;
	bio_endio(bio);

	btrfs_put_bbio(bbio);
}

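/*
 * Completion handler for the per-stripe bios.  Errors are counted in the
 * shared btrfs_bio; once the last stripe completes, the original bio is
 * ended and reports success as long as no more than max_errors stripes
 * failed.
 */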
static void btrfs_end_bio(struct bio *bio)
{
	struct btrfs_bio *bbio = bio->bi_private;
	int is_orig_bio = 0;

	if (bio->bi_status) {
		atomic_inc(&bbio->error);
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
			unsigned int stripe_index =
				btrfs_io_bio(bio)->stripe_index;
			struct btrfs_device *dev;

			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
			if (dev->bdev) {
				if (bio_op(bio) == REQ_OP_WRITE)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_WRITE_ERRS);
				else
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_READ_ERRS);
				if (bio->bi_opf & REQ_PREFLUSH)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_FLUSH_ERRS);
			}
		}
	}

	if (bio == bbio->orig_bio)
		is_orig_bio = 1;

	btrfs_bio_counter_dec(bbio->fs_info);

	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = bbio->orig_bio;
		}

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		/* only send an error to the higher layers if it is
		 * beyond the tolerance of the btrfs bio
		 */
		if (atomic_read(&bbio->error) > bbio->max_errors) {
			bio->bi_status = BLK_STS_IOERR;
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			bio->bi_status = BLK_STS_OK;
		}

		btrfs_end_bbio(bbio, bio);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}

/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
static noinline void btrfs_schedule_bio(struct btrfs_device *device,
					struct bio *bio)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	int should_queue = 1;
	struct btrfs_pending_bios *pending_bios;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
	    !device->bdev) {
		bio_io_error(bio);
		return;
	}

	/* don't bother with additional async steps for reads, right now */
	if (bio_op(bio) == REQ_OP_READ) {
		btrfsic_submit_bio(bio);
		return;
	}

	WARN_ON(bio->bi_next);
	bio->bi_next = NULL;

	spin_lock(&device->io_lock);
	if (op_is_sync(bio->bi_opf))
		pending_bios = &device->pending_sync_bios;
	else
		pending_bios = &device->pending_bios;

	if (pending_bios->tail)
		pending_bios->tail->bi_next = bio;

	pending_bios->tail = bio;
	if (!pending_bios->head)
		pending_bios->head = bio;
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
		btrfs_queue_work(fs_info->submit_workers, &device->work);
}

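/*
 * Point a bio at one stripe's device and physical sector and submit it,
 * either directly or through the async submission machinery above.
 */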
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
			      u64 physical, int dev_nr, int async)
{
	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	bio->bi_private = bbio;
	btrfs_io_bio(bio)->stripe_index = dev_nr;
	bio->bi_end_io = btrfs_end_bio;
	bio->bi_iter.bi_sector = physical >> 9;
#ifdef DEBUG
	{
		struct rcu_string *name;

		rcu_read_lock();
		name = rcu_dereference(dev->name);
		btrfs_debug(fs_info,
			"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
			bio_op(bio), bio->bi_opf,
			(u64)bio->bi_iter.bi_sector,
			(u_long)dev->bdev->bd_dev, name->str, dev->devid,
			bio->bi_iter.bi_size);
		rcu_read_unlock();
	}
#endif
	bio_set_dev(bio, dev->bdev);

	btrfs_bio_counter_inc_noblocked(fs_info);

	if (async)
		btrfs_schedule_bio(dev, bio);
	else
		btrfsic_submit_bio(bio);
}

static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		/* Should be the original bio. */
		WARN_ON(bio != bbio->orig_bio);

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		bio->bi_iter.bi_sector = logical >> 9;
		if (atomic_read(&bbio->error) > bbio->max_errors)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_status = BLK_STS_OK;
		btrfs_end_bbio(bbio, bio);
	}
}

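/*
 * Top-level bio submission: map the bio's logical range to its stripes,
 * then clone and submit one bio per stripe.  RAID5/6 full-stripe writes and
 * recovery reads are handed off to the raid56 code instead.
 */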
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
			   int mirror_num, int async_submit)
{
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	int dev_nr;
	int total_devs;
	struct btrfs_bio *bbio = NULL;

	length = bio->bi_iter.bi_size;
	map_length = length;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				&map_length, &bbio, mirror_num, 1);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
	bbio->fs_info = fs_info;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
	    ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
		if (bio_op(bio) == REQ_OP_WRITE) {
			ret = raid56_parity_write(fs_info, bio, bbio,
						  map_length);
		} else {
			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
		}

		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
		dev = bbio->stripes[dev_nr].dev;
		if (!dev || !dev->bdev ||
		    (bio_op(first_bio) == REQ_OP_WRITE &&
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
			bbio_error(bbio, first_bio, logical);
			continue;
		}

		if (dev_nr < total_devs - 1)
			bio = btrfs_bio_clone(first_bio);
		else
			bio = first_bio;

		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				  dev_nr, async_submit);
	}
	btrfs_bio_counter_dec(fs_info);
	return BLK_STS_OK;
}

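/*
 * Look up a device by devid and optional uuid/fsid, searching the mounted
 * fs_devices and any seed device lists chained behind it.
 */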
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;

	cur_devices = fs_info->fs_devices;
	while (cur_devices) {
		if (!fsid ||
		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
			device = find_device(cur_devices, devid, uuid);
			if (device)
				return device;
		}
		cur_devices = cur_devices->seed;
	}
	return NULL;
}

static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;

	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}

/**
 * btrfs_alloc_device - allocate struct btrfs_device
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and must be
 * destroyed with free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = __alloc_device();
	if (IS_ERR(dev))
		return dev;

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	btrfs_init_work(&dev->work, btrfs_submit_helper,
			pending_bios_fn, NULL, NULL);

	return dev;
}

/* Return -EIO if any error, otherwise return 0. */
static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf,
				   struct btrfs_chunk *chunk, u64 logical)
{
	u64 length;
	u64 stripe_len;
	u16 num_stripes;
	u16 sub_stripes;
	u64 type;

	length = btrfs_chunk_length(leaf, chunk);
	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);

	if (!num_stripes) {
		btrfs_err(fs_info, "invalid chunk num_stripes: %u",
			  num_stripes);
		return -EIO;
	}
	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk logical %llu", logical);
		return -EIO;
	}
	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
		btrfs_err(fs_info, "invalid chunk sectorsize %u",
			  btrfs_chunk_sector_size(leaf, chunk));
		return -EIO;
	}
	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk length %llu", length);
		return -EIO;
	}
	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
		btrfs_err(fs_info, "invalid chunk stripe length: %llu",
			  stripe_len);
		return -EIO;
	}
	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
	    type) {
		btrfs_err(fs_info, "unrecognized chunk type: %llu",
			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
			  btrfs_chunk_type(leaf, chunk));
		return -EIO;
	}
	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
	     num_stripes != 1)) {
		btrfs_err(fs_info,
			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
			num_stripes, sub_stripes,
			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
		return -EIO;
	}

	return 0;
}

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}

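/*
 * Read one chunk item and insert the corresponding mapping into the
 * in-memory chunk mapping tree.  Stripes on devices that cannot be found
 * get a placeholder device on degraded mounts, otherwise -ENOENT.
 */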
static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
	if (ret)
		return ret;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
	read_unlock(&map_tree->map_tree.lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(fs_info, devid,
							uuid, NULL);
		if (!map->stripes[i].dev &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			free_extent_map(em);
			btrfs_report_missing_device(fs_info, devid, uuid, true);
			return -ENOENT;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
			}
			btrfs_report_missing_device(fs_info, devid, uuid, false);
		}
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));

	}

	write_lock(&map_tree->map_tree.lock);
	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
	write_unlock(&map_tree->map_tree.lock);
	BUG_ON(ret); /* Tree corruption */
	free_extent_map(em);

	return 0;
}

static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

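/*
 * Find the fs_devices of the seed filesystem that a device item refers to,
 * opening and cloning the seed device list on first use and chaining it
 * behind the mounted filesystem's fs_devices.
 */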
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	fs_devices = fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;

		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = 1;
		fs_devices->opened = 1;
		return fs_devices;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				   fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(ret);
		goto out;
	}

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(-EINVAL);
		goto out;
	}

	fs_devices->seed = fs_info->fs_devices->seed;
	fs_info->fs_devices->seed = fs_devices;
out:
	return fs_devices;
}

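/*
 * Read one device item from the chunk tree and match it to an entry in the
 * device list built during scanning, handling seed devices and missing
 * devices on degraded mounts.
 */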
static int read_one_dev(struct btrfs_fs_info *fs_info,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);

	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}

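/*
 * Parse the sys_chunk_array embedded in the superblock.  It holds the
 * SYSTEM chunks that must be mapped before the chunk tree itself can be
 * read.
 */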
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	set_extent_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the system array.
	 * The set_extent_buffer_uptodate() call does not properly mark all its
	 * pages up-to-date when the page is larger: extent does not cover the
	 * whole page and consequently check_page_uptodate does not find all
	 * the page's extents up-to-date (the hole beyond sb),
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
	 * but sb spans only this function. Add an explicit SetPageUptodate call
	 * to silence the warning eg. on PowerPC 64.
	 */
	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_array_offset;
			/*
			 * At least one btrfs_chunk with one stripe must be
			 * present, exact stripe count check comes afterwards
			 */
			len = btrfs_chunk_item_size(1);
			if (cur_offset + len > array_size)
				goto out_short_read;

			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			if (!num_stripes) {
				btrfs_err(fs_info,
					"invalid number of stripes %u in sys_array at offset %u",
					num_stripes, cur_offset);
				ret = -EIO;
				break;
			}

			type = btrfs_chunk_type(sb, chunk);
			if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
				btrfs_err(fs_info,
			    "invalid chunk type %llu in sys_array at offset %u",
					type, cur_offset);
				ret = -EIO;
				break;
			}

			len = btrfs_chunk_item_size(num_stripes);
			if (cur_offset + len > array_size)
				goto out_short_read;

			ret = read_one_chunk(fs_info, &key, sb, chunk);
			if (ret)
				break;
		} else {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
			len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}

/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
	read_unlock(&map_tree->map_tree.lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

6962 6963
			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->map_tree.lock);
		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->map_tree.lock);
	}
out:
	return ret;
}
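
/*
 * Illustrative use only (not a caller in this file): a mount-time check
 * for a degraded read-write mount could look like
 *
 *	if (btrfs_test_opt(fs_info, DEGRADED) &&
 *	    !btrfs_check_rw_degradable(fs_info, NULL))
 *		fall back to a read-only mount;
 */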

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

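	/*
	 * uuid_mutex keeps the fs_devices list stable against concurrent
	 * device scans while devices are looked up; chunk_mutex serializes
	 * against concurrent changes to the chunk mapping tree.
	 */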
	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(fs_info, leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
 * After loading the chunk tree, we have all device information; do
 * another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

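	/*
	 * Walk the main fs_devices and every seed fs_devices chained off
	 * ->seed, so devices inherited from a seeding filesystem also get
	 * their fs_info pointer set.
	 */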
	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->fs_info = fs_info;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}

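/*
 * Device statistics are persisted in the device tree, one item per
 * device with key (BTRFS_DEV_STATS_OBJECTID, BTRFS_PERSISTENT_ITEM_KEY,
 * devid) and an array of __le64 counters as payload.  A missing item
 * simply leaves the in-memory counters at zero, so filesystems that
 * never recorded stats remain fully usable.
 */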
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = BTRFS_DEV_STATS_OBJECTID;
		key.type = BTRFS_PERSISTENT_ITEM_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

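		/*
		 * Stats items written by older kernels may carry fewer
		 * than BTRFS_DEV_STAT_VALUES_MAX counters; read what is
		 * present and reset the rest.
		 */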
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction(). Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

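	/*
	 * dev_stats_ccnt counts stat updates since the last commit.  The
	 * value is snapshotted before the item is written and subtracted
	 * afterwards, so errors that arrive in between leave the counter
	 * non-zero and the device is written out again on the next commit.
	 */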
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic() in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, fs_info, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
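
/*
 * Illustrative userspace sketch (assumed, not part of this file): the
 * BTRFS_IOC_GET_DEV_STATS ioctl behind "btrfs device stats" fills the
 * same structure this helper consumes:
 *
 *	struct btrfs_ioctl_get_dev_stats s = {
 *		.devid = devid,
 *		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,
 *		.flags = reset ? BTRFS_DEV_STATS_RESET : 0,
 *	};
 *	ioctl(fs_fd, BTRFS_IOC_GET_DEV_STATS, &s);
 */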

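/*
 * Only the magic of each superblock copy is zeroed: that is enough for
 * the kernel and libblkid to stop recognizing the device as btrfs,
 * while leaving the rest of the superblock and all data intact.
 */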
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
		copy_num++) {

		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
		brelse(bh);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

/*
 * Update the commit size of all resized devices; these values are used
 * when writing out the super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);
}

/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

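	/*
	 * Only chunks allocated in the current transaction sit on
	 * trans->pending_chunks; snapshot bytes_used of every stripe
	 * device into commit_bytes_used for the upcoming super block
	 * write.
	 */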
	if (list_empty(&trans->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(em, &trans->pending_chunks, list) {
		map = em->map_lookup;

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
}

void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	while (fs_devices) {
		fs_devices->fs_info = fs_info;
		fs_devices = fs_devices->seed;
	}
}

void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	while (fs_devices) {
		fs_devices->fs_info = NULL;
		fs_devices = fs_devices->seed;
	}
}