// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

const char *get_raid_name(enum btrfs_raid_types type)
{
	if (type >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[type].raid_name;
}

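/*
 * Illustrative sketch only, compiled out and not part of this file: one way
 * a caller could consult btrfs_raid_array above, e.g. to check whether a
 * profile tolerates a device failure. The helper name is hypothetical.
 */
#if 0
static bool example_profile_tolerates_failure(enum btrfs_raid_types type)
{
	if (type >= BTRFS_NR_RAID_TYPES)
		return false;
	/* raid1/raid10/raid5 tolerate 1 failure, raid6 tolerates 2 */
	return btrfs_raid_array[type].tolerated_failures > 0;
}
#endif
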
static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */
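
/*
 * Illustrative sketch only, compiled out: taking the locks in the nesting
 * order documented above while guarding an exclusive operation with the
 * BTRFS_FS_EXCL_OP bit. The helper is hypothetical and just shows the shape;
 * real callers take only the locks they need.
 */
#if 0
static int example_exclusive_op(struct btrfs_fs_info *fs_info)
{
	/* only one exclusive operation may run at a time */
	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;

	/* respect the documented order: uuid -> device_list -> chunk */
	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	/* ... manipulate devices and chunks ... */
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);

	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	return 0;
}
#endif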

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
		u64 devid, const u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 *  Search and remove all stale (devices which are not mounted) devices.
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_dev)
{
	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
	struct btrfs_device *dev, *tmp_dev;

	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {

		if (fs_devs->opened)
			continue;

		list_for_each_entry_safe(dev, tmp_dev,
					 &fs_devs->devices, dev_list) {
			int not_found = 0;

			if (skip_dev && skip_dev == dev)
				continue;
			if (path && !dev->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(dev->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->fs_list);
				free_fs_devices(fs_devs);
				break;
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				btrfs_free_device(dev);
			}
		}
	}
}

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return ERR_PTR(-EBUSY);

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		btrfs_free_stale_devices(path, device);

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all times.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			return ERR_PTR(-EEXIST);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return ERR_PTR(-ENOMEM);
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() takes the device_list_mutex, and a call to
	 * blkdev_put() can lead the VFS back into this function. So, for
	 * now, do the put outside of device_list_mutex.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_devices->device_list_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);

	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct page *page;
	int ret = 0;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
		ret = -EINVAL;
		goto error_bdev_put;
	}

	mutex_lock(&uuid_mutex);
	device = device_list_add(path, disk_super);
	if (IS_ERR(device))
		ret = PTR_ERR(device);
	else
		*fs_devices_ret = device->fs_devices;
	mutex_unlock(&uuid_mutex);

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);

	return ret;
}

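/*
 * Illustrative sketch only, compiled out: how a caller might pair
 * btrfs_scan_one_device() with btrfs_open_devices() to register a device
 * and then open the whole set. The wrapper itself is hypothetical.
 */
#if 0
static int example_scan_and_open(const char *path, fmode_t flags, void *holder)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	/* register the device and find the fs_devices it belongs to */
	ret = btrfs_scan_one_device(path, flags, holder, &fs_devices);
	if (ret)
		return ret;

	/* open every known device of that filesystem */
	return btrfs_open_devices(fs_devices, flags, holder);
}
#endif
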
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info,
			    struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number, e.g. to account
 * for an ongoing device replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_array[i].mindev_error;

			if (ret)
				return ret;
		}
	}

	return 0;
}

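/*
 * Return any device in @fs_devs other than @device that has a bdev and is
 * not marked missing, or NULL if no such device exists.
 */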
static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * (or this_dev) available.
 */
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
		struct btrfs_device *device, struct btrfs_device *this_dev)
{
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
								device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

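/*
 * Remove a device from a mounted filesystem: shrink it to zero bytes,
 * delete its dev item, unlink it from the in-memory device lists and wipe
 * the superblock copies on the old disk.
 */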
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases the cur_devices == fs_devices. But in case
	 * of deleting a seed device, the cur_devices should point to
	 * its own fs_devices listed under the fs_devices->seed.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	if (cur_devices->open_devices == 0) {
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		close_fs_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

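/*
 * Unlink the source device of a device replace from the in-memory lists
 * and fix up the fs_devices counters; device_list_mutex must be held
 * (asserted below).
 */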
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);

	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device_rcu);

	/* if there are no devs left, delete the fs_devices */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * devices left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	WARN_ON(!tgtdev);
	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks()
	 * may lead to a call to btrfs_show_devname() which will try
	 * to hold device_list_mutex. And here this device
	 * is already out of the device list, so we don't have to hold
	 * the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	call_rcu(&tgtdev->rcu, free_device_rcu);
}

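/*
 * Open the block device at @device_path read-only just long enough to read
 * its superblock, then look up the in-memory device by the devid and uuid
 * found there.
 */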
static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
				     const char *device_path,
				     struct btrfs_device **device)
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct buffer_head *bh;

	*device = NULL;
	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				    fs_info->bdev_holder, 0, &bdev, &bh);
	if (ret)
		return ret;
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
	brelse(bh);
	if (!*device)
		ret = -ENOENT;
	blkdev_put(bdev, FMODE_READ);
	return ret;
}

int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
					 const char *device_path,
					 struct btrfs_device **device)
{
	*device = NULL;
	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		devices = &fs_info->fs_devices->devices;
		list_for_each_entry(tmp, devices, dev_list) {
			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&tmp->dev_state) && !tmp->bdev) {
				*device = tmp;
				break;
			}
		}

		if (!*device)
			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;

		return 0;
	} else {
		return btrfs_find_device_by_path(fs_info, device_path, device);
	}
}

/*
 * Lookup a device given by device id, or the path if the id is 0.
 */
int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
				 const char *devpath,
				 struct btrfs_device **device)
{
	int ret;

	if (devid) {
		ret = 0;
		*device = btrfs_find_device(fs_info, devid, NULL, NULL);
		if (!*device)
			ret = -ENOENT;
	} else {
		if (!devpath || !devpath[0])
			return -EINVAL;

		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
							   device);
	}
	return ret;
}

/*
 * Does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = alloc_fs_devices(NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	mutex_unlock(&fs_info->chunk_mutex);

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * Store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

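/*
 * Add the device at @device_path to a mounted filesystem. If the mounted
 * filesystem is a seed, sprout a new writable filesystem (with a fresh
 * fsid) on top of it first.
 */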
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 tmp;
	int seeding_dev = 0;
	int ret = 0;
	bool unlocked = false;

	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			mutex_unlock(
				&fs_devices->device_list_mutex);
			goto error;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_device;
	}

	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		sb->s_flags &= ~SB_RDONLY;
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	tmp = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(tmp + device->total_bytes, fs_info->sectorsize));

	tmp = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);

	/* add sysfs device entry */
	btrfs_sysfs_add_device_link(fs_devices, device);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans, fs_info);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, fs_info, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];

		ret = btrfs_finish_sprout(trans, fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/* Sprouting would change fsid of the mounted root,
		 * so rename the fsid on the sysfs
		 */
		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
						fs_info->fsid);
		if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
			btrfs_warn(fs_info,
				   "sysfs: failed to create fsid for sprout");
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		unlocked = true;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/* Update ctime/mtime for libblkid */
	update_dev_time(device_path);
	return ret;

error_sysfs:
	btrfs_sysfs_rm_device_link(fs_devices, device);
error_trans:
	if (seeding_dev)
		sb->s_flags |= SB_RDONLY;
	if (trans)
		btrfs_end_transaction(trans);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (seeding_dev && !unlocked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}

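/*
 * Write the in-memory size and geometry of @device back into its dev item
 * in the chunk tree.
 */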
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->fs_info->chunk_root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

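/*
 * Grow @device to @new_size (rounded down to the sectorsize), updating the
 * superblock total_bytes and queueing the device on the resized list before
 * persisting the change via btrfs_update_device().
 */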
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_fs_devices *fs_devices;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	fs_devices = fs_info->fs_devices;

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_devices->resized_devices);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}

static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
					u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* callers are responsible for dropping em's ref. */
	return em;
}

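/*
 * Delete a chunk whose data has already been relocated or is unused: free
 * the device extents backing each stripe, delete the chunk item (plus the
 * sys_chunk_array copy for SYSTEM chunks) and remove the block group.
 */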
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}

static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}

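/*
 * Relocate every SYSTEM chunk, retrying the whole walk once if some
 * relocations failed with ENOSPC.
 */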
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * return 1 : allocate a data chunk successfully,
 * return <0: errors during allocating a data chunk,
 * return 0 : no need to allocate a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group_cache *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
		spin_lock(&fs_info->data_sinfo->lock);
		bytes_used = fs_info->data_sinfo->bytes_used;
		spin_unlock(&fs_info->data_sinfo->lock);

		if (!bytes_used) {
			struct btrfs_trans_handle *trans;
			int ret;

			trans = btrfs_join_transaction(fs_info->tree_root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = btrfs_force_chunk_alloc(trans,
						      BTRFS_BLOCK_GROUP_DATA);
			btrfs_end_transaction(trans);
			if (ret < 0)
				return ret;

			btrfs_add_raid_kobjects(fs_info);

			return 1;
		}
	}
	return 0;
}

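/*
 * Persist the balance control structure as the BTRFS_BALANCE_OBJECTID item
 * in the tree root, which is what allows an interrupted balance to be
 * resumed after a remount.
 */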
static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if it is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}

static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->key.offset,
					bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->key.offset;
	else
		user_thresh_max = div_factor_fine(cache->key.offset,
					bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;
	else
		user_thresh = div_factor_fine(cache->key.offset,
					      bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
		factor = num_stripes / 2;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
		factor = num_stripes - 1;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
		factor = num_stripes - 2;
	} else {
		factor = num_stripes;
	}

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
			&& num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

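/*
 * Main balance loop: first make some room on every writable device, then
 * walk the chunk tree twice, once to count the chunks that match the
 * filters and once to actually relocate them, honoring pause and cancel
 * requests along the way.
 */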
3409
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3410
{
3411
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3412 3413 3414
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct list_head *devices;
3415 3416 3417
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
3418
	u64 chunk_type;
3419
	struct btrfs_chunk *chunk;
3420
	struct btrfs_path *path = NULL;
3421 3422
	struct btrfs_key key;
	struct btrfs_key found_key;
3423
	struct btrfs_trans_handle *trans;
3424 3425
	struct extent_buffer *leaf;
	int slot;
3426 3427
	int ret;
	int enospc_errors = 0;
3428
	bool counting = true;
3429
	/* The single value limit and min/max limits use the same bytes in the */
3430 3431 3432
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
3433 3434 3435
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
3436
	int chunk_reserved = 0;
3437 3438

	/* step one make some room on all the devices */
3439
	devices = &fs_info->fs_devices->devices;
Q
Qinghuang Feng 已提交
3440
	list_for_each_entry(device, devices, dev_list) {
3441
		old_size = btrfs_device_get_total_bytes(device);
3442
		size_to_free = div_factor(old_size, 1);
3443
		size_to_free = min_t(u64, size_to_free, SZ_1M);
3444
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3445 3446
		    btrfs_device_get_total_bytes(device) -
		    btrfs_device_get_bytes_used(device) > size_to_free ||
3447
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3448 3449 3450
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
3451 3452
		if (ret == -ENOSPC)
			break;
3453 3454 3455 3456 3457
		if (ret) {
			/* btrfs_shrink_device never returns ret > 0 */
			WARN_ON(ret > 0);
			goto error;
		}
3458

3459
		trans = btrfs_start_transaction(dev_root, 0);
3460 3461 3462 3463 3464 3465 3466 3467
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}
3468 3469

		ret = btrfs_grow_device(trans, device, old_size);
3470
		if (ret) {
3471
			btrfs_end_transaction(trans);
3472 3473 3474 3475 3476 3477 3478 3479
			/* btrfs_grow_device never returns ret > 0 */
			WARN_ON(ret > 0);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}
3480

3481
		btrfs_end_transaction(trans);
3482 3483 3484 3485
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
3486 3487 3488 3489
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
3490 3491 3492 3493 3494 3495

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
3496
	if (!counting) {
3497 3498 3499 3500
		/*
		 * The single value limit and min/max limits use the same bytes
		 * in the
		 */
3501 3502 3503 3504
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
3505 3506 3507 3508
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}

		/*
		 * This shouldn't happen; it means the last relocation
		 * failed.
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(fs_info, leaf, chunk,
					   found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (counting) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so let's allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto error;
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}

/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

	/* true if exactly one bit set */
	return (flags & (flags - 1)) == 0;
}
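
/*
 * Example: alloc_profile_is_valid(BTRFS_BLOCK_GROUP_RAID1, 0) is true as
 * exactly one profile bit is set, while RAID1|RAID10 is rejected as not
 * reduced.  (flags & (flags - 1)) clears the lowest set bit, so it is
 * zero iff at most one bit remains.
 */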

static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
}

/* Non-zero return value signifies invalidity */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
		u64 allowed)
{
	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		(!alloc_profile_is_valid(bctl_arg->target, 1) ||
		 (bctl_arg->target & ~allowed)));
}
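
/*
 * For example, converting data to raid10 on a two-device filesystem is
 * rejected here, because btrfs_balance() only adds BTRFS_BLOCK_GROUP_RAID10
 * to 'allowed' once there are at least four devices.
 */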

/*
 * Should be called with the balance mutex held.
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    atomic_read(&fs_info->balance_cancel_req)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
	  "balance: mixed groups data and metadata options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		BUG_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
	if (num_devices > 1)
		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
	if (num_devices > 2)
		allowed |= BTRFS_BLOCK_GROUP_RAID5;
	if (num_devices > 3)
		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
			    BTRFS_BLOCK_GROUP_RAID6);
	if (validate_convert_profile(&bctl->data, allowed)) {
		int index = btrfs_bg_flags_to_raid_index(bctl->data.target);

		btrfs_err(fs_info,
			  "balance: invalid convert data profile %s",
			  get_raid_name(index));
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->meta, allowed)) {
		int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);

		btrfs_err(fs_info,
			  "balance: invalid convert metadata profile %s",
			  get_raid_name(index));
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->sys, allowed)) {
		int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);

		btrfs_err(fs_info,
			  "balance: invalid convert system profile %s",
			  get_raid_name(index));
		ret = -EINVAL;
		goto out;
	}

	/* Allow reducing metadata or system integrity only if force is set */
	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
			BTRFS_BLOCK_GROUP_RAID10 |
			BTRFS_BLOCK_GROUP_RAID5 |
			BTRFS_BLOCK_GROUP_RAID6;
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed))) {
			if (bctl->flags & BTRFS_BALANCE_FORCE) {
				btrfs_info(fs_info,
				"balance: force reducing metadata integrity");
			} else {
				btrfs_err(fs_info,
	"balance: reduces metadata integrity, use --force if you want this");
				ret = -EINVAL;
				goto out;
			}
		}
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	/* if we're not converting, the target field is uninitialized */
	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->meta.target : fs_info->avail_metadata_alloc_bits;
	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->data.target : fs_info->avail_data_alloc_bits;
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
		int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
		int data_index = btrfs_bg_flags_to_raid_index(data_target);

		btrfs_warn(fs_info,
	"balance: metadata profile %s has lower redundancy than data profile %s",
			   get_raid_name(meta_index), get_raid_name(data_index));
	}

	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		BUG_ON(fs_info->balance_ctl);
		spin_lock(&fs_info->balance_lock);
		fs_info->balance_ctl = bctl;
		spin_unlock(&fs_info->balance_lock);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		btrfs_update_ioctl_balance_args(fs_info, bargs);
	}

	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		reset_balance_state(fs_info);
		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		reset_balance_state(fs_info);
	else
		kfree(bctl);
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);

	return ret;
}

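/*
 * Kthread entry point used to resume a previously interrupted balance in
 * the background (see btrfs_resume_balance_async() and
 * btrfs_recover_balance() below).
 */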
static int balance_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (fs_info->balance_ctl) {
		btrfs_info(fs_info, "balance: resuming");
		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
	}
	mutex_unlock(&fs_info->balance_mutex);

	return ret;
}

int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return 0;
	}
	mutex_unlock(&fs_info->balance_mutex);

	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "balance: resume skipped");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}

int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops to collide.
	 *
	 * This gives the exclusive op status to balance and keeps in paused
	 * state until user intervention (cancel or umount). If the ownership
	 * cannot be assigned, show a message but do not fail. The balance
	 * is in a paused state and must have fs_info::balance_ctl properly
	 * set up.
	 */
	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
		btrfs_warn(fs_info,
	"balance: cannot set exclusive op status, resume manually");

	mutex_lock(&fs_info->balance_mutex);
	BUG_ON(fs_info->balance_ctl);
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
	mutex_unlock(&fs_info->balance_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	/*
	 * A paused balance with the item stored on disk can be resumed at
	 * mount time if the mount is read-write. Otherwise it's still paused
	 * and we must not allow cancelling as it deletes the item.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_unlock(&fs_info->balance_mutex);
		return -EROFS;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
		mutex_lock(&fs_info->balance_mutex);
	} else {
		mutex_unlock(&fs_info->balance_mutex);
		/*
		 * Lock released to allow other waiters to continue, we'll
		 * reexamine the status again.
		 */
		mutex_lock(&fs_info->balance_mutex);

		if (fs_info->balance_ctl) {
			reset_balance_state(fs_info);
			clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
			btrfs_info(fs_info, "balance: canceled");
		}
	}

	BUG_ON(fs_info->balance_ctl ||
		test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

static int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
	struct btrfs_trans_handle *trans = NULL;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &key, path,
				BTRFS_OLDEST_GENERATION);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}

		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
			continue;
		} else {
			goto skip;
		}
update_tree:
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
			ret = btrfs_uuid_tree_add(trans,
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

skip:
		if (trans) {
			ret = btrfs_end_transaction(trans);
			trans = NULL;
			if (ret)
				break;
		}

		btrfs_release_path(path);
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (ret)
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
	else
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}

/*
 * Callback for btrfs_uuid_tree_iterate().
 * returns:
 * 0	check succeeded, the entry is not outdated.
 * < 0	if an error occurred.
 * > 0	if the check failed, which means the caller shall remove the entry.
 */
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				       u8 *uuid, u8 type, u64 subid)
{
	struct btrfs_key key;
	int ret = 0;
	struct btrfs_root *subvol_root;

	if (type != BTRFS_UUID_KEY_SUBVOL &&
	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
		goto out;

	key.objectid = subid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(subvol_root)) {
		ret = PTR_ERR(subvol_root);
		if (ret == -ENOENT)
			ret = 1;
		goto out;
	}

	switch (type) {
	case BTRFS_UUID_KEY_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
			ret = 1;
		break;
	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.received_uuid,
			   BTRFS_UUID_SIZE))
			ret = 1;
		break;
	}

out:
	return ret;
}

static int btrfs_uuid_rescan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
	int ret;

	/*
	 * 1st step is to iterate through the existing UUID tree and
	 * to delete all entries that contain outdated data.
	 * 2nd step is to add all missing entries to the UUID tree.
	 */
	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
	if (ret < 0) {
		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
		up(&fs_info->uuid_tree_rescan_sem);
		return ret;
	}
	return btrfs_uuid_scan_kthread(data);
}

int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *uuid_root;
	struct task_struct *task;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	uuid_root = btrfs_create_tree(trans, fs_info,
				      BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(uuid_root)) {
		ret = PTR_ERR(uuid_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = uuid_root;

	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_rescan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

/*
 * Shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extents.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	bool checked_pending_chunks = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;

	new_size = round_down(new_size, fs_info->sectorsize);
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_BACK;

	mutex_lock(&fs_info->chunk_mutex);

	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}
	mutex_unlock(&fs_info->chunk_mutex);

again:
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto done;
		if (ret) {
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so let's allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto done;
		if (ret == -ENOSPC)
			failed++;
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * We checked in the above loop all device extents that were already in
	 * the device tree. However before we have updated the device's
	 * total_bytes to the new size, we might have had chunk allocations that
	 * have not completed yet (new block groups attached to transaction
	 * handles), and therefore their device extents were not yet in the
	 * device tree and we missed them in the loop above. So if we have any
	 * pending chunk using a device extent that overlaps the device range
	 * that we can not use anymore, commit the current transaction and
	 * repeat the search on the device tree - this way we guarantee we will
	 * not have chunks using device extents that end beyond 'new_size'.
	 */
	if (!checked_pending_chunks) {
		u64 start = new_size;
		u64 len = old_size - new_size;

		if (contains_pending_extent(trans->transaction, device,
					    &start, len)) {
			mutex_unlock(&fs_info->chunk_mutex);
			checked_pending_chunks = true;
			failed = 0;
			retried = false;
			ret = btrfs_commit_transaction(trans);
			if (ret)
				goto done;
			goto again;
		}
	}

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_info->fs_devices->resized_devices);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_end_transaction(trans);
done:
	btrfs_free_path(path);
	if (ret) {
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size + sizeof(disk_key)
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EFBIG;
	}

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
	mutex_unlock(&fs_info->chunk_mutex);

	return 0;
}

/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;

	if (di_a->max_avail > di_b->max_avail)
		return -1;
	if (di_a->max_avail < di_b->max_avail)
		return 1;
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
}

static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
		return;

	btrfs_set_fs_incompat(info, RAID56);
}

#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)	\
			- sizeof(struct btrfs_chunk))		\
			/ sizeof(struct btrfs_stripe) + 1)

#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
				- 2 * sizeof(struct btrfs_disk_key)	\
				- 2 * sizeof(struct btrfs_chunk))	\
				/ sizeof(struct btrfs_stripe) + 1)
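
/*
 * Both limits compute how many more btrfs_stripe entries fit once the
 * fixed part of the chunk item is accounted for; the "+ 1" is the stripe
 * already embedded in struct btrfs_chunk.  The system chunk variant
 * additionally subtracts two disk keys and two chunk headers from the
 * fixed-size sys_chunk_array in the superblock.
 */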

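/*
 * Allocate a new chunk of the given @type at logical offset @start: gather
 * the free space available on each writable device, sort the devices by
 * that free space, size the stripes from the RAID attributes table, then
 * insert the extent map and create the block group.
 */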
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device *device;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_device_info *devices_info = NULL;
	u64 total_avail;
	int num_stripes;	/* total number of stripes to allocate */
	int data_stripes;	/* number of stripes that count for
				   block group size */
	int sub_stripes;	/* sub_stripes info for map */
	int dev_stripes;	/* stripes per dev */
	int devs_max;		/* max devs to use */
	int devs_min;		/* min devs needed */
	int devs_increment;	/* ndevs has to be a multiple of this */
	int ncopies;		/* how many copies of the data we have */
	int ret;
	u64 max_stripe_size;
	u64 max_chunk_size;
	u64 stripe_size;
	u64 num_bytes;
	int ndevs;
	int i;
	int j;
	int index;

	BUG_ON(!alloc_profile_is_valid(type, 0));

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return -ENOSPC;
	}

	index = btrfs_bg_flags_to_raid_index(type);

	sub_stripes = btrfs_raid_array[index].sub_stripes;
	dev_stripes = btrfs_raid_array[index].dev_stripes;
	devs_max = btrfs_raid_array[index].devs_max;
	devs_min = btrfs_raid_array[index].devs_min;
	devs_increment = btrfs_raid_array[index].devs_increment;
	ncopies = btrfs_raid_array[index].ncopies;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_stripe_size = SZ_1G;
		max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* for larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			max_stripe_size = SZ_1G;
		else
			max_stripe_size = SZ_256M;
		max_chunk_size = max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		max_stripe_size = SZ_32M;
		max_chunk_size = 2 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
	} else {
		btrfs_err(info, "invalid chunk type 0x%llx requested",
		       type);
		BUG_ON(1);
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return -ENOMEM;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	ndevs = 0;
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 max_avail;
		u64 dev_offset;

		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail == 0)
			continue;

		ret = find_free_dev_extent(trans, device,
					   max_stripe_size * dev_stripes,
					   &dev_offset, &max_avail);
		if (ret && ret != -ENOSPC)
			goto error;

		if (ret == 0)
			max_avail = max_stripe_size * dev_stripes;

		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%u",
					    __func__, device->devid, max_avail,
					    BTRFS_STRIPE_LEN * dev_stripes);
			continue;
		}

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	/* round down to number of usable stripes */
	ndevs = round_down(ndevs, devs_increment);

	if (ndevs < devs_min) {
		ret = -ENOSPC;
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ndevs, devs_min);
		}
		goto error;
	}

	ndevs = min(ndevs, devs_max);

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, the
	 * max_avail is the total size so we have to adjust.
	 */
	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
	num_stripes = ndevs * dev_stripes;

	/*
	 * this will have to be fixed for RAID1 and RAID10 over
	 * more drives
	 */
	data_stripes = num_stripes / ncopies;

	if (type & BTRFS_BLOCK_GROUP_RAID5)
		data_stripes = num_stripes - 1;

	if (type & BTRFS_BLOCK_GROUP_RAID6)
		data_stripes = num_stripes - 2;

	/*
	 * Use the number of data stripes to figure out how big this chunk
	 * is really going to be in terms of logical address space,
	 * and compare that answer with the max chunk size
	 */
	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = div_u64(max_chunk_size, data_stripes);

		/* bump the answer up to a 16MB boundary */
		stripe_size = round_up(stripe_size, SZ_16M);

		/*
		 * But don't go higher than the limits we found while searching
		 * for free extents
		 */
		stripe_size = min(devices_info[ndevs - 1].max_avail,
				  stripe_size);
	}

	/* align to BTRFS_STRIPE_LEN */
	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
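
	/*
	 * Illustrative example: six devices and RAID6 give num_stripes = 6
	 * and data_stripes = 4, so a 1G stripe_size would map 4G of logical
	 * address space; if that exceeded max_chunk_size, stripe_size would
	 * have been scaled back above before the alignment here.
	 */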

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		ret = -ENOMEM;
		goto error;
	}
	map->num_stripes = num_stripes;

	for (i = 0; i < ndevs; ++i) {
		for (j = 0; j < dev_stripes; ++j) {
			int s = i * dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * stripe_size;
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = sub_stripes;

	num_bytes = stripe_size * data_stripes;

	trace_btrfs_chunk_alloc(info, map, start, num_bytes);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		ret = -ENOMEM;
		goto error;
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = num_bytes;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = stripe_size;

	em_tree = &info->mapping_tree.map_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		free_extent_map(em);
		goto error;
	}

	list_add_tail(&em->list, &trans->transaction->pending_chunks);
	refcount_inc(&em->refs);
	write_unlock(&em_tree->lock);

	ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
	if (ret)
		goto error_del_extent;

	for (i = 0; i < map->num_stripes; i++) {
		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
	}

	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);

	free_extent_map(em);
	check_raid56_incompat_flag(info, type);

	kfree(devices_info);
	return 0;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);
	/* One for the pending_chunks list reference */
	free_extent_map(em);
error:
	kfree(devices_info);
	return ret;
}

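/*
 * Second phase of chunk allocation: write the chunk item into the chunk
 * tree, create the per-device extent items and, for system chunks, mirror
 * the item into the superblock's sys_chunk_array.
 */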
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				u64 chunk_offset, u64 chunk_size)
{
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_device *device;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	u64 dev_offset;
	u64 stripe_size;
	int i = 0;
	int ret = 0;

	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);
	stripe_size = em->orig_block_len;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with the map's stripes, because the device object's id can change
	 * at any time during that final phase of the device replace operation
	 * (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		ret = btrfs_update_device(trans, device);
		if (ret)
			break;
		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
					     dev_offset, stripe_size);
		if (ret)
			break;
	}
	if (ret) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		goto out;
	}

	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		/*
		 * TODO: Cleanup of inserted chunk root in case of
		 * failure.
		 */
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}

/*
 * Chunk allocation falls into two parts. The first part does the work
 * that makes the newly allocated chunk usable, but does not do any
 * operation that modifies the chunk tree. The second part does the work
 * that requires modifying the chunk tree. This division is important for
 * the bootstrap process of adding storage to a seed btrfs.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
{
	u64 chunk_offset;

	lockdep_assert_held(&trans->fs_info->chunk_mutex);
	chunk_offset = find_next_chunk(trans->fs_info);
	return __btrfs_alloc_chunk(trans, chunk_offset, type);
}

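/*
 * Create the initial metadata and system chunks on a newly added writable
 * device (typically when sprouting a read-write filesystem from a seed
 * device).
 */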
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 alloc_profile;
	int ret;

	chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
	if (ret)
		return ret;

	sys_chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_system_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
	return ret;
}

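/*
 * Maximum number of stripe failures a chunk of this profile can tolerate
 * before reads can no longer be satisfied.
 */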
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
	int max_errors;

	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
			 BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID5 |
			 BTRFS_BLOCK_GROUP_DUP)) {
		max_errors = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
		max_errors = 2;
	} else {
		max_errors = 0;
	}

	return max_errors;
}

int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	int readonly = 0;
	int miss_ndevs = 0;
	int i;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return 1;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; i++) {
		if (test_bit(BTRFS_DEV_STATE_MISSING,
					&map->stripes[i].dev->dev_state)) {
			miss_ndevs++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&map->stripes[i].dev->dev_state)) {
			readonly = 1;
			goto end;
		}
	}

	/*
	 * If the number of missing devices is larger than max errors,
	 * we can not write the data into that chunk successfully, so
	 * set it readonly.
	 */
	if (miss_ndevs > btrfs_chunk_max_errors(map))
		readonly = 1;
end:
	free_extent_map(em);
	return readonly;
}

void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
	extent_map_tree_init(&tree->map_tree);
}

void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
	struct extent_map *em;

	while (1) {
		write_lock(&tree->map_tree.lock);
		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(&tree->map_tree, em);
		write_unlock(&tree->map_tree.lock);
		if (!em)
			break;
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

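/*
 * Return how many distinct ways the block at @logical can be read, used
 * to bound mirror retries on the read path (for RAID5/6 this is a retry
 * count rather than a literal copy count).
 */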
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

	em = get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;

	map = em->map_lookup;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
		 *
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
	else
		ret = 1;
	free_extent_map(em);

	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
	    fs_info->dev_replace.tgtdev)
		ret++;
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	return ret;
}

unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
	unsigned long len = fs_info->sectorsize;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = map->stripe_len * nr_data_stripes(map);
		free_extent_map(em);
	}
	return len;
}

int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret = 0;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
	return ret;
}

static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	preferred_mirror = first + current->pid % num_stripes;

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}

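/*
 * In the RAID5/6 raid_map, parity stripes carry the sentinel values
 * RAID5_P_STRIPE and RAID6_Q_STRIPE, which are larger than any real
 * logical address, so sorting in ascending raid_map order pushes the
 * parity/syndrome stripes to the tail.
 */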
static inline int parity_smaller(u64 a, u64 b)
{
	return a > b;
}

/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
	struct btrfs_bio_stripe s;
	int i;
	u64 l;
	int again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < num_stripes - 1; i++) {
			if (parity_smaller(bbio->raid_map[i],
					   bbio->raid_map[i+1])) {
				s = bbio->stripes[i];
				l = bbio->raid_map[i];
				bbio->stripes[i] = bbio->stripes[i+1];
				bbio->raid_map[i] = bbio->raid_map[i+1];
				bbio->stripes[i+1] = s;
				bbio->raid_map[i+1] = l;

				again = 1;
			}
		}
	}
}

5222 5223 5224
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
	struct btrfs_bio *bbio = kzalloc(
5225
		 /* the size of the btrfs_bio */
5226
		sizeof(struct btrfs_bio) +
5227
		/* plus the variable array for the stripes */
5228
		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5229
		/* plus the variable array for the tgt dev */
5230
		sizeof(int) * (real_stripes) +
5231 5232 5233 5234 5235
		/*
		 * plus the raid_map, which includes both the tgt dev
		 * and the stripes
		 */
		sizeof(u64) * (total_stripes),
5236
		GFP_NOFS|__GFP_NOFAIL);
5237 5238

	atomic_set(&bbio->error, 0);
5239
	refcount_set(&bbio->refs, 1);
5240 5241 5242 5243 5244 5245

	return bbio;
}

void btrfs_get_bbio(struct btrfs_bio *bbio)
{
	WARN_ON(!refcount_read(&bbio->refs));
	refcount_inc(&bbio->refs);
}

void btrfs_put_bbio(struct btrfs_bio *bbio)
{
	if (!bbio)
		return;
	if (refcount_dec_and_test(&bbio->refs))
		kfree(bbio);
}

/* Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
 * Note that discard won't be sent to the target device of a device
 * replace.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 struct btrfs_bio **bbio_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_bio *bbio;
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u64 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret = 0;
	int i;

	/* discard always returns a bbio */
	ASSERT(bbio_ret);

	em = get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	offset = logical - em->start;
	length = min_t(u64, em->len - offset, length);

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;

	stripe_nr_end = round_up(offset + length, map->stripe_len);
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = map->num_stripes;
	} else {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	bbio = alloc_btrfs_bio(num_stripes, 0);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			bbio->stripes[i].length = stripes_per_dev *
				map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				bbio->stripes[i].length +=
					map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				bbio->stripes[i].length -=
					stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				bbio->stripes[i].length -=
					stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			bbio->stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}

/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For BTRFS_MAP_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_bio *bbio = NULL;
	int num_stripes;
	int index_srcdev = 0;
	int found = 0;
	u64 physical_of_found = 0;
	int i;
	int ret = 0;

	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bbio, 0, 0);
	if (ret) {
		ASSERT(bbio == NULL);
		return ret;
	}

	num_stripes = bbio->num_stripes;
	if (*mirror_num > num_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < num_stripes; i++) {
		if (bbio->stripes[i].dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (found &&
		    physical_of_found <= bbio->stripes[i].physical)
			continue;

		index_srcdev = i;
		found = 1;
		physical_of_found = bbio->stripes[i].physical;
	}

	btrfs_put_bbio(bbio);

	ASSERT(found);
	if (!found)
		return -EIO;

	*mirror_num = index_srcdev + 1;
	*physical = physical_of_found;
	return ret;
}

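/*
 * Adjust the stripe array of a mapped bio while a dev-replace is running:
 * writes are duplicated to the target device, and for
 * BTRFS_MAP_GET_READ_MIRRORS the target device is appended as an extra
 * read mirror of the source stripe with the lowest physical address.
 */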
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_bio **bbio_ret,
				      struct btrfs_dev_replace *dev_replace,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_bio *bbio = *bbio_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int tgtdev_indexes = 0;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int index_where_to_add;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_bio_stripe *new =
					bbio->stripes + index_where_to_add;
				struct btrfs_bio_stripe *old =
					bbio->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				bbio->tgtdev_map[i] = index_where_to_add;
				index_where_to_add++;
				max_errors++;
				tgtdev_indexes++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it simple,
				 * only add the mirror with the lowest physical
				 * address
				 */
				if (found &&
				    physical_of_found <=
				     bbio->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bbio->stripes[i].physical;
			}
		}
		if (found) {
			struct btrfs_bio_stripe *tgtdev_stripe =
				bbio->stripes + num_stripes;

			tgtdev_stripe->physical = physical_of_found;
			tgtdev_stripe->length =
				bbio->stripes[index_srcdev].length;
			tgtdev_stripe->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[index_srcdev] = num_stripes;

			tgtdev_indexes++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bbio->num_tgtdevs = tgtdev_indexes;
	*bbio_ret = bbio;
}

static bool need_full_stripe(enum btrfs_map_op op)
{
	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}

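/*
 * Map a logical byte range to the physical stripes that store it,
 * honouring the RAID profile of the containing chunk. On success, a
 * btrfs_bio describing the stripes is returned in @bbio_ret (unless the
 * caller only asked for the mapped length by passing a NULL @bbio_ret).
 */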
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u32 stripe_index;
	int i;
	int ret = 0;
	int num_stripes;
	int max_errors = 0;
	int tgtdev_indexes = 0;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	int num_alloc_stripes;
	int patch_the_first_stripe_for_dev_replace = 0;
	u64 physical_to_patch_in_first_stripe = 0;
	u64 raid56_full_stripe_start = (u64)-1;

	if (op == BTRFS_MAP_DISCARD)
		return __btrfs_map_block_for_discard(fs_info, logical,
						     *length, bbio_ret);

	em = get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	offset = logical - em->start;

	stripe_len = map->stripe_len;
	stripe_nr = offset;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(stripe_nr, stripe_len);

	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
			   "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
			   stripe_offset, offset, em->start, logical,
			   stripe_len);
		free_extent_map(em);
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;

	/* if we're here for raid56, we need to know the stripe aligned start */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
		raid56_full_stripe_start = offset;

		/* allow a write of a full stripe, but make sure we don't
		 * allow straddling of stripes
		 */
		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
				full_stripe_len);
		raid56_full_stripe_start *= full_stripe_len;
	}

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len;
		/*
		 * For writes to RAID[56], allow a full stripeset across
		 * all disks. For other RAID types and for RAID[56] reads,
		 * just allow a single stripe (on a single disk).
		 */
		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
		    (op == BTRFS_MAP_WRITE)) {
			max_len = stripe_len * nr_data_stripes(map) -
				(offset - raid56_full_stripe_start);
		} else {
			/* we limit the length of each bio to what fits in a stripe */
			max_len = stripe_len - stripe_offset;
		}
		*length = min_t(u64, em->len - offset, max_len);
	} else {
		*length = em->len - offset;
	}

	/*
	 * This is for when we're called from btrfs_merge_bio_hook() and all
	 * it cares about is the length.
	 */
	if (!bbio_ret)
		goto out;

	btrfs_dev_replace_read_lock(dev_replace);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (!dev_replace_is_ongoing)
		btrfs_dev_replace_read_unlock(dev_replace);
	else
		btrfs_dev_replace_set_lock_blocking(dev_replace);

	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
						    dev_replace->srcdev->devid,
						    &mirror_num,
					    &physical_to_patch_in_first_stripe);
		if (ret)
			goto out;
		else
			patch_the_first_stripe_for_dev_replace = 1;
	} else if (mirror_num > map->num_stripes) {
		mirror_num = 0;
	}

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		if (!need_full_stripe(op))
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		if (need_full_stripe(op))
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
		else {
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (need_full_stripe(op)) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= map->sub_stripes;

		if (need_full_stripe(op))
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
			/* push stripe_nr back to the start of the full stripe */
			stripe_nr = div64_u64(raid56_full_stripe_start,
					stripe_len * nr_data_stripes(map));

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = nr_parity_stripes(map);

			*length = map->stripe_len;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_nr = div_u64_rem(stripe_nr,
					nr_data_stripes(map), &stripe_index);
			if (mirror_num > 1)
				stripe_index = nr_data_stripes(map) +
						mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
					&stripe_index);
			if (!need_full_stripe(op) && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * after this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		mirror_num = stripe_index + 1;
	}
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
		if (op == BTRFS_MAP_WRITE)
			num_alloc_stripes <<= 1;
		if (op == BTRFS_MAP_GET_READ_MIRRORS)
			num_alloc_stripes++;
		tgtdev_indexes = num_stripes;
	}

	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);

	/* build raid_map */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
		u64 tmp;
		unsigned rot;

		bbio->raid_map = (u64 *)((void *)bbio->stripes +
				 sizeof(struct btrfs_bio_stripe) *
				 num_alloc_stripes +
				 sizeof(int) * tgtdev_indexes);

		/* Work out the disk rotation on this stripe-set */
		div_u64_rem(stripe_nr, num_stripes, &rot);

		/* Fill in the logical address of each stripe */
		tmp = stripe_nr * nr_data_stripes(map);
		for (i = 0; i < nr_data_stripes(map); i++)
			bbio->raid_map[(i+rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bbio->raid_map[(i+rot+1) % num_stripes] =
				RAID6_Q_STRIPE;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset +
			stripe_nr * map->stripe_len;
		bbio->stripes[i].dev =
			map->stripes[stripe_index].dev;
		stripe_index++;
	}

	if (need_full_stripe(op))
		max_errors = btrfs_chunk_max_errors(map);

	if (bbio->raid_map)
		sort_parity_stripes(bbio, num_stripes);

	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    need_full_stripe(op)) {
		handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
					  &max_errors);
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
out:
	if (dev_replace_is_ongoing) {
		btrfs_dev_replace_clear_lock_blocking(dev_replace);
		btrfs_dev_replace_read_unlock(dev_replace);
	}
	free_extent_map(em);
	return ret;
}

int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		      u64 logical, u64 *length,
		      struct btrfs_bio **bbio_ret, int mirror_num)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				 mirror_num, 0);
}

/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		     u64 logical, u64 *length,
		     struct btrfs_bio **bbio_ret)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}

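/*
 * Reverse mapping: given a physical offset on a device, collect the
 * logical addresses of the stripes that map to it. Used e.g. to find
 * the logical ranges covered by the superblock copies.
 */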
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	u64 rmap_len;
	int i, j, nr = 0;

	em = get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	length = em->len;
	rmap_len = map->stripe_len;

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		length = div_u64(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		length = div_u64(length, map->num_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		length = div_u64(length, nr_data_stripes(map));
		rmap_len = map->stripe_len * nr_data_stripes(map);
	}

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	BUG_ON(!buf); /* -ENOMEM */

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		} /* else if RAID[56], multiply by nr_data_stripes().
		   * Alternatively, just use rmap_len below instead of
		   * map->stripe_len */

		bytenr = chunk_start + stripe_nr * rmap_len;
		WARN_ON(nr >= map->num_stripes);
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = rmap_len;

	free_extent_map(em);
	return 0;
}

static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
	bio->bi_private = bbio->private;
	bio->bi_end_io = bbio->end_io;
	bio_endio(bio);

	btrfs_put_bbio(bbio);
}

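/*
 * Completion handler for the per-stripe bios: record device errors, and
 * once all stripes have completed, hand the original bio back to the
 * upper layers with a status that reflects whether the error count
 * stayed within the tolerance of the RAID profile.
 */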
static void btrfs_end_bio(struct bio *bio)
{
	struct btrfs_bio *bbio = bio->bi_private;
	int is_orig_bio = 0;

	if (bio->bi_status) {
		atomic_inc(&bbio->error);
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
			unsigned int stripe_index =
				btrfs_io_bio(bio)->stripe_index;
			struct btrfs_device *dev;

			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
			if (dev->bdev) {
				if (bio_op(bio) == REQ_OP_WRITE)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_WRITE_ERRS);
				else
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_READ_ERRS);
				if (bio->bi_opf & REQ_PREFLUSH)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_FLUSH_ERRS);
			}
		}
	}

	if (bio == bbio->orig_bio)
		is_orig_bio = 1;

	btrfs_bio_counter_dec(bbio->fs_info);

	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = bbio->orig_bio;
		}

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		/* only send an error to the higher layers if it is
		 * beyond the tolerance of the btrfs bio
		 */
		if (atomic_read(&bbio->error) > bbio->max_errors) {
			bio->bi_status = BLK_STS_IOERR;
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			bio->bi_status = BLK_STS_OK;
		}

		btrfs_end_bbio(bbio, bio);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}

/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
static noinline void btrfs_schedule_bio(struct btrfs_device *device,
					struct bio *bio)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	int should_queue = 1;
	struct btrfs_pending_bios *pending_bios;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
	    !device->bdev) {
		bio_io_error(bio);
		return;
	}

	/* don't bother with additional async steps for reads, right now */
	if (bio_op(bio) == REQ_OP_READ) {
		btrfsic_submit_bio(bio);
		return;
	}

	WARN_ON(bio->bi_next);
	bio->bi_next = NULL;

	spin_lock(&device->io_lock);
	if (op_is_sync(bio->bi_opf))
		pending_bios = &device->pending_sync_bios;
	else
		pending_bios = &device->pending_bios;

	if (pending_bios->tail)
		pending_bios->tail->bi_next = bio;

	pending_bios->tail = bio;
	if (!pending_bios->head)
		pending_bios->head = bio;
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
		btrfs_queue_work(fs_info->submit_workers, &device->work);
}

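/*
 * Point a (possibly cloned) bio at the right device and sector for one
 * stripe and submit it, either directly or via the async submit queue.
 */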
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
			      u64 physical, int dev_nr, int async)
{
	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	bio->bi_private = bbio;
	btrfs_io_bio(bio)->stripe_index = dev_nr;
	bio->bi_end_io = btrfs_end_bio;
	bio->bi_iter.bi_sector = physical >> 9;
#ifdef DEBUG
	{
		struct rcu_string *name;

		rcu_read_lock();
		name = rcu_dereference(dev->name);
		btrfs_debug(fs_info,
			"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
			bio_op(bio), bio->bi_opf,
			(u64)bio->bi_iter.bi_sector,
			(u_long)dev->bdev->bd_dev, name->str, dev->devid,
			bio->bi_iter.bi_size);
		rcu_read_unlock();
	}
#endif
	bio_set_dev(bio, dev->bdev);

	btrfs_bio_counter_inc_noblocked(fs_info);

	if (async)
		btrfs_schedule_bio(dev, bio);
	else
		btrfsic_submit_bio(bio);
}

static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		/* Should be the original bio. */
		WARN_ON(bio != bbio->orig_bio);

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		bio->bi_iter.bi_sector = logical >> 9;
		if (atomic_read(&bbio->error) > bbio->max_errors)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_status = BLK_STS_OK;
		btrfs_end_bbio(bbio, bio);
	}
}

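/*
 * Main submission entry point: map the logical range of the bio to its
 * stripes, hand RAID5/6 writes and recovery reads to the raid56 code,
 * and otherwise clone the bio once per stripe and submit the clones.
 */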
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
			   int mirror_num, int async_submit)
{
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	int dev_nr;
	int total_devs;
	struct btrfs_bio *bbio = NULL;

	length = bio->bi_iter.bi_size;
	map_length = length;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				&map_length, &bbio, mirror_num, 1);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
	bbio->fs_info = fs_info;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
	    ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
		if (bio_op(bio) == REQ_OP_WRITE) {
			ret = raid56_parity_write(fs_info, bio, bbio,
						  map_length);
		} else {
			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
		}

		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
		dev = bbio->stripes[dev_nr].dev;
		if (!dev || !dev->bdev ||
		    (bio_op(first_bio) == REQ_OP_WRITE &&
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
			bbio_error(bbio, first_bio, logical);
			continue;
		}

		if (dev_nr < total_devs - 1)
			bio = btrfs_bio_clone(first_bio);
		else
			bio = first_bio;

		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				  dev_nr, async_submit);
	}
	btrfs_bio_counter_dec(fs_info);
	return BLK_STS_OK;
}

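/*
 * Look up a device by devid, optionally matching the device uuid and the
 * filesystem fsid; seed device lists are searched as well.
 */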
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;

	cur_devices = fs_info->fs_devices;
	while (cur_devices) {
		if (!fsid ||
		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
			device = find_device(cur_devices, devid, uuid);
			if (device)
				return device;
		}
		cur_devices = cur_devices->seed;
	}
	return NULL;
}

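/*
 * Allocate a placeholder struct for a device that is referenced by the
 * metadata but currently not present, and account it as missing.
 */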
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;

	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}

/**
 * btrfs_alloc_device - allocate struct btrfs_device
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and must be
 * destroyed with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = __alloc_device();
	if (IS_ERR(dev))
		return dev;

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	btrfs_init_work(&dev->work, btrfs_submit_helper,
			pending_bios_fn, NULL, NULL);

	return dev;
}

/* Return -EIO if any error, otherwise return 0. */
static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf,
				   struct btrfs_chunk *chunk, u64 logical)
{
	u64 length;
	u64 stripe_len;
	u16 num_stripes;
	u16 sub_stripes;
	u64 type;

	length = btrfs_chunk_length(leaf, chunk);
	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);

	if (!num_stripes) {
		btrfs_err(fs_info, "invalid chunk num_stripes: %u",
			  num_stripes);
		return -EIO;
	}
	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk logical %llu", logical);
		return -EIO;
	}
	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
		btrfs_err(fs_info, "invalid chunk sectorsize %u",
			  btrfs_chunk_sector_size(leaf, chunk));
		return -EIO;
	}
	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk length %llu", length);
		return -EIO;
	}
	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
		btrfs_err(fs_info, "invalid chunk stripe length: %llu",
			  stripe_len);
		return -EIO;
	}
	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
	    type) {
		btrfs_err(fs_info, "unrecognized chunk type: %llu",
			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
			  btrfs_chunk_type(leaf, chunk));
		return -EIO;
	}
	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
	     num_stripes != 1)) {
		btrfs_err(fs_info,
			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
			num_stripes, sub_stripes,
			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
		return -EIO;
	}

	return 0;
}

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}

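/*
 * Read one chunk item from the chunk tree (or the sys_array copy of it),
 * validate it and insert the resulting mapping into the mapping tree.
 */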
static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
	if (ret)
		return ret;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
	read_unlock(&map_tree->map_tree.lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(fs_info, devid,
							uuid, NULL);
		if (!map->stripes[i].dev &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			free_extent_map(em);
			btrfs_report_missing_device(fs_info, devid, uuid, true);
			return -ENOENT;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
			}
			btrfs_report_missing_device(fs_info, devid, uuid, false);
		}
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));

	}

	write_lock(&map_tree->map_tree.lock);
	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
	write_unlock(&map_tree->map_tree.lock);
	BUG_ON(ret); /* Tree corruption */
	free_extent_map(em);

	return 0;
}

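/* Copy the on-disk dev item fields into the in-memory btrfs_device. */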
static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

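/*
 * Find or open the seed fs_devices matching @fsid and chain it onto the
 * sprout's seed list; in degraded mode an empty, not-yet-opened seed
 * list may be created for devices that cannot be found.
 */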
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	fs_devices = fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;

		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = 1;
		fs_devices->opened = 1;
		return fs_devices;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(ret);
		goto out;
	}

	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(-EINVAL);
		goto out;
	}

	fs_devices->seed = fs_info->fs_devices->seed;
	fs_info->fs_devices->seed = fs_devices;
out:
	return fs_devices;
}

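/*
 * Read one dev item from the chunk tree and hook it up to the in-memory
 * device list, handling seed devices and devices that are missing at
 * mount time.
 */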
static int read_one_dev(struct btrfs_fs_info *fs_info,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);

	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}

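/*
 * Read the system chunk array embedded in the superblock; it carries the
 * bootstrap chunk mappings needed before the chunk tree itself can be
 * read.
 */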
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	set_extent_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the
	 * system array. set_extent_buffer_uptodate() call does not properly
	 * mark all its pages up-to-date when the page is larger: extent does
	 * not cover the whole page and consequently check_page_uptodate does
	 * not find all the page's extents up-to-date (the hole beyond sb),
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through mark_extent_buffer_dirty/writeback
	 * cycle, but sb spans only this function. Add an explicit
	 * SetPageUptodate call to silence the warning eg. on PowerPC 64.
	 */
	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_array_offset;
			/*
			 * At least one btrfs_chunk with one stripe must be
			 * present, exact stripe count check comes afterwards
			 */
			len = btrfs_chunk_item_size(1);
			if (cur_offset + len > array_size)
				goto out_short_read;

			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			if (!num_stripes) {
				btrfs_err(fs_info,
					"invalid number of stripes %u in sys_array at offset %u",
					num_stripes, cur_offset);
				ret = -EIO;
				break;
			}

			type = btrfs_chunk_type(sb, chunk);
			if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
				btrfs_err(fs_info,
			    "invalid chunk type %llu in sys_array at offset %u",
					type, cur_offset);
				ret = -EIO;
				break;
			}

			len = btrfs_chunk_item_size(num_stripes);
			if (cur_offset + len > array_size)
				goto out_short_read;

			ret = read_one_chunk(fs_info, &key, sb, chunk);
			if (ret)
				break;
		} else {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
			len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}

/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
	read_unlock(&map_tree->map_tree.lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->map_tree.lock);
		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->map_tree.lock);
	}
out:
	return ret;
}

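/*
 * Read all device items and chunk items from the chunk tree and perform
 * final validation against the superblock totals.
 */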
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(fs_info, leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

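/* Set the fs_info pointer on every device, including seed devices. */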
6937 6938 6939 6940 6941
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->fs_info = fs_info;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}
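
/*
 * A hedged sketch of the structure walked above (assumed layout of
 * struct btrfs_fs_devices, which lives in volumes.h, not in this file):
 * a sprouted filesystem keeps its read-only seed devices on a singly
 * linked chain behind the main fs_devices,
 *
 *	fs_info->fs_devices --seed--> seed fs_devices --seed--> NULL
 *
 * so the "while (fs_devices) { ...; fs_devices = fs_devices->seed; }"
 * pattern here (and in btrfs_set_fs_info_ptr below) visits the sprout
 * first and then each seed generation in turn.
 */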

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = BTRFS_DEV_STATS_OBJECTID;
		key.type = BTRFS_PERSISTENT_ITEM_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
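
/*
 * A minimal sketch of the on-disk item parsed above (assumed shape of
 * struct btrfs_dev_stats_item from ctree.h, shown here only to explain
 * the item_size check): the item is effectively a flat array of
 * little-endian counters,
 *
 *	struct btrfs_dev_stats_item {
 *		__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
 *	} __attribute__ ((__packed__));
 *
 * and an item written by an older kernel may carry fewer values, so any
 * counter beyond item_size is reset instead of being read past the end
 * of the item.
 */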

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stat_read_and_reset.
		 */
		smp_rmb();
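		/*
		 * A sketch of the writer side this barrier pairs with
		 * (assumed shape of the inline helpers in ctree.h, shown
		 * here for illustration only): the stat value is bumped
		 * before the dirty counter, with a barrier in between, so
		 * a reader that saw stats_cnt != 0 above also sees the new
		 * stat values:
		 *
		 *	atomic_inc(dev->dev_stat_values + index);
		 *	smp_mb__before_atomic();
		 *	atomic_inc(&dev->dev_stats_ccnt);
		 */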

		ret = update_dev_stat_item(trans, fs_info, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
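
/*
 * A hedged userspace sketch of exercising the handler above through the
 * ioctl interface (BTRFS_IOC_GET_DEV_STATS and the fs_fd open on the
 * mount point are assumptions about the uapi, not defined in this file):
 *
 *	struct btrfs_ioctl_get_dev_stats stats = { 0 };
 *
 *	stats.devid = 1;                            // device to query
 *	stats.nr_items = BTRFS_DEV_STAT_VALUES_MAX; // clamped by the kernel
 *	// stats.flags = BTRFS_DEV_STATS_RESET;     // optionally zero after read
 *	if (ioctl(fs_fd, BTRFS_IOC_GET_DEV_STATS, &stats) == 0)
 *		printf("write errs: %llu\n",
 *		       (unsigned long long)stats.values[BTRFS_DEV_STAT_WRITE_ERRS]);
 */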

void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
		copy_num++) {

		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
		brelse(bh);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}
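
/*
 * A hedged note on the loop above: btrfs_read_dev_one_super() is assumed
 * to probe the standard superblock mirror offsets, roughly
 *
 *	copy 0:  64 KiB	(BTRFS_SUPER_INFO_OFFSET)
 *	copy 1:  64 MiB
 *	copy 2: 256 GiB
 *
 * skipping copies that do not fit on the device. Only the magic field is
 * zeroed, not the whole block, which is enough for blkid and the kernel
 * to stop recognizing the device as btrfs while leaving the rest of the
 * superblock intact for potential recovery.
 */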

/*
 * Update the commit size of all resized devices, which is the value used
 * when writing out the super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);
}

/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

	if (list_empty(&trans->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(em, &trans->pending_chunks, list) {
		map = em->map_lookup;

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
}

void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = fs_info;
		fs_devices = fs_devices->seed;
	}
}

void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = NULL;
		fs_devices = fs_devices->seed;
	}
}