// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * volume_mutex
 * ------------
 * coarse lock owned by a mounted filesystem; used to exclude some operations
 * that cannot run in parallel and affect the higher-level properties of the
 * filesystem like: device add/deleting/resize/replace, or balance
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

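/*
 * Free a btrfs_device: drop the rcu string name, put the preallocated
 * flush bio and free the structure itself.
 */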
static void free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

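/*
 * Free every device still on the fs_devices list, then the fs_devices
 * structure itself.  The caller must ensure it is no longer opened.
 */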
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		free_device(device);
	}
	kfree(fs_devices);
}

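/* Deliver a kobject uevent for the disk backing @bdev, warning on failure. */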
static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

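/* Module exit helper: release every fs_devices registered in fs_uuids. */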
void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
		u64 devid, const u8 *uuid)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

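/*
 * Open the block device at @device_path, optionally flush its page cache,
 * set the blocksize and read the btrfs super block into @bh.  On failure
 * both *bdev and *bh are reset to NULL.
 */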
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

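/* Put a chain of bios (head..tail) back at the front of a pending list. */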
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 *  Search for and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_dev)
{
	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
	struct btrfs_device *dev, *tmp_dev;

	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {

		if (fs_devs->opened)
			continue;

		list_for_each_entry_safe(dev, tmp_dev,
					 &fs_devs->devices, dev_list) {
			int not_found = 0;

			if (skip_dev && skip_dev == dev)
				continue;
			if (path && !dev->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(dev->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->fs_list);
				free_fs_devices(fs_devs);
				break;
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				free_device(dev);
			}
		}
	}
}

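/*
 * Open one member device: read and verify its super block against the
 * expected devid/uuid, then update the fs_devices counters and state bits
 * (open, rw, seeding, rotating) accordingly.
 */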
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return ERR_PTR(-EBUSY);

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			free_device(device);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		btrfs_free_stale_devices(path, device);

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			return ERR_PTR(-EEXIST);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return ERR_PTR(-ENOMEM);
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

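/* RCU callback: free the device once all RCU readers have finished. */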
static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	free_device(device);
}

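/* Sync and invalidate a writeable device's bdev before releasing it. */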
static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

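/*
 * Replace the device on the RCU-protected list with a minimal copy (devid,
 * uuid, name) and drop its contribution to the fs_devices counters.
 */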
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() uses the device_list_mutex, and sometimes a
	 * call to blkdev_put() leads to the VFS calling back into this
	 * function. So do the put outside of device_list_mutex, for now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

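/*
 * Open all members of fs_devices; individual failures are ignored and only
 * a completely failed open (no devices opened at all) returns an error.
 */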
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

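/*
 * Open the devices of an fs_devices, or just bump the open count when it is
 * already open.  Devices are sorted by devid before the first open.
 */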
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

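/* Undo the kmap and page reference taken by btrfs_read_disk_super(). */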
static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

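/*
 * Read the super block at @bytenr through the page cache and do basic
 * sanity checks (bytenr, magic).  Returns 0 with *page and *disk_super set
 * on success, 1 on any failure.
 */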
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct page *page;
	int ret = 0;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
		ret = -EINVAL;
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super);
	if (IS_ERR(device))
		ret = PTR_ERR(device);
	else
		*fs_devices_ret = device->fs_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

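/*
 * Check whether any chunk pending in @transaction, or pinned on the
 * filesystem, overlaps the device range [*start, *start + len).  If so,
 * bump *start past the conflicting stripe and return 1.
 */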
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}


/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than which we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}
1589 1590 1591 1592 1593 1594
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
1595
					  num_bytes, 0, start, len);
1596 1597
}

1598
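/*
 * Remove the dev extent item covering @start from the device tree and
 * return the extent's length in *dev_extent_len.
 */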
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1599
			  struct btrfs_device *device,
M
Miao Xie 已提交
1600
			  u64 start, u64 *dev_extent_len)
1601
{
1602 1603
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
1604 1605 1606
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
1607 1608 1609
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;
1610 1611 1612 1613 1614 1615 1616 1617

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
M
Miao Xie 已提交
1618
again:
1619
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1620 1621 1622
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
1623 1624
		if (ret)
			goto out;
1625 1626 1627 1628 1629 1630
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
M
Miao Xie 已提交
1631 1632 1633
		key = found_key;
		btrfs_release_path(path);
		goto again;
1634 1635 1636 1637
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
1638
	} else {
1639
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1640
		goto out;
1641
	}
1642

M
Miao Xie 已提交
1643 1644
	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

1645
	ret = btrfs_del_item(trans, root, path);
1646
	if (ret) {
1647 1648
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
Z
Zhao Lei 已提交
1649
	} else {
1650
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1651
	}
1652
out:
1653 1654 1655 1656
	btrfs_free_path(path);
	return ret;
}

1657 1658 1659
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
1660 1661 1662
{
	int ret;
	struct btrfs_path *path;
1663 1664
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
1665 1666 1667 1668
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

1669
	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1670
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1671 1672 1673 1674 1675
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
Y
Yan Zheng 已提交
1676
	key.offset = start;
1677 1678 1679
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
1680 1681
	if (ret)
		goto out;
1682 1683 1684 1685

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
1686 1687
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
1688 1689
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1690 1691
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

1692 1693
	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
1694
out:
1695 1696 1697 1698
	btrfs_free_path(path);
	return ret;
}

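/*
 * Return the logical address right after the last mapped chunk, i.e. the
 * start offset a newly allocated chunk can use.
 */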
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

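/*
 * Find the next available devid: one past the highest devid present in the
 * chunk tree's device items.
 */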
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * The device information is stored in the chunk root. The caller must have
 * fully filled in the btrfs_device struct.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info,
			    struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Update ctime/mtime for a given device path. This is mainly used by
 * ctime/mtime-based probes such as libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

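/*
 * Delete the DEV_ITEM for @device from the chunk tree in its own
 * transaction.
 */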
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the
 * whole filesystem. It's up to the caller to adjust that number for e.g. a
 * running device replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_group[i]))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_mindev_error[i];

			if (ret)
				return ret;
		}
	}

	return 0;
}

static struct btrfs_device *btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * (or this_dev).
 */
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
		struct btrfs_device *device, struct btrfs_device *this_dev)
{
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
								device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

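/*
 * Remove a device from a mounted filesystem: shrink it to zero, delete its
 * device item, unlink it from the in-memory device lists and wipe its
 * superblocks. The device may be specified by path or by devid.
 */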
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&uuid_mutex);

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		device->fs_devices->missing_devices--;

	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		device->fs_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		__btrfs_close_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	mutex_unlock(&fs_info->volume_mutex);
	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_info->fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);

	/*
	 * In case of an fs with no seed, srcdev->fs_devices will point to
	 * the fs_devices of fs_info. However, when the device being replaced
	 * is a seed device, it will point to the seed's local fs_devices.
	 * In short, srcdev will have its correct fs_devices in both cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device_rcu);

	/* if there are no more devices, delete the fs_devices */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
	mutex_lock(&uuid_mutex);
	WARN_ON(!tgtdev);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);

	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);

	if (tgtdev->bdev)
		fs_info->fs_devices->open_devices--;

	fs_info->fs_devices->num_devices--;

	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks()
	 * may lead to a call to btrfs_show_devname() which will try
	 * to hold device_list_mutex. And here this device
	 * is already out of the device list, so we don't have to hold
	 * the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	call_rcu(&tgtdev->rcu, free_device_rcu);
}

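/*
 * Read the superblock of the device at @device_path and look the device up
 * by the devid and uuid stored in it.
 */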
static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
				     const char *device_path,
				     struct btrfs_device **device)
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct buffer_head *bh;

	*device = NULL;
	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				    fs_info->bdev_holder, 0, &bdev, &bh);
	if (ret)
		return ret;
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
	brelse(bh);
	if (!*device)
		ret = -ENOENT;
	blkdev_put(bdev, FMODE_READ);
	return ret;
}

int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
					 const char *device_path,
					 struct btrfs_device **device)
{
	*device = NULL;
	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		devices = &fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held by the caller.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&tmp->dev_state) && !tmp->bdev) {
				*device = tmp;
				break;
			}
		}

		if (!*device)
			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;

		return 0;
	} else {
		return btrfs_find_device_by_path(fs_info, device_path, device);
	}
}

/*
 * Lookup a device given by device id, or the path if the id is 0.
 */
int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
				 const char *devpath,
				 struct btrfs_device **device)
{
	int ret;

	if (devid) {
		ret = 0;
		*device = btrfs_find_device(fs_info, devid, NULL, NULL);
		if (!*device)
			ret = -ENOENT;
	} else {
		if (!devpath || !devpath[0])
			return -EINVAL;

		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
							   device);
	}
	return ret;
}

/*
 * does all the dirty work required for changing file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = alloc_fs_devices(NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	mutex_unlock(&fs_info->chunk_mutex);

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * Store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

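/*
 * Add a new device to a mounted filesystem: open it exclusively, set up the
 * in-memory btrfs_device, insert its device item and grow the superblock
 * totals. Adding to a seed filesystem additionally sprouts a new writable
 * filesystem on top of it.
 */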
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	u64 tmp;
	int seeding_dev = 0;
	int ret = 0;
	bool unlocked = false;

	if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			goto error;
		}
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_device;
	}

	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		sb->s_flags &= ~SB_RDONLY;
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_info->fs_devices;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	fs_info->fs_devices->rw_devices++;
	fs_info->fs_devices->total_devices++;
	fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!blk_queue_nonrot(q))
		fs_info->fs_devices->rotating = 1;

	tmp = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(tmp + device->total_bytes, fs_info->sectorsize));

	tmp = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);

	/* add sysfs device entry */
	btrfs_sysfs_add_device_link(fs_info->fs_devices, device);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans, fs_info);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, fs_info, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];

		ret = btrfs_finish_sprout(trans, fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * Sprouting would change the fsid of the mounted root,
		 * so rename the fsid on sysfs.
		 */
		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
						fs_info->fsid);
		if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
			btrfs_warn(fs_info,
				   "sysfs: failed to create fsid for sprout");
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		unlocked = true;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/* Update ctime/mtime for libblkid */
	update_dev_time(device_path);
	return ret;

error_sysfs:
	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
error_trans:
	if (seeding_dev)
		sb->s_flags |= SB_RDONLY;
	if (trans)
		btrfs_end_transaction(trans);
error_free_device:
	free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (seeding_dev && !unlocked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}

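/*
 * Open and initialize the target device of a device replace operation,
 * mirroring the source device's sizes and marking it as a replace target.
 */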
int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				  const char *device_path,
				  struct btrfs_device *srcdev,
				  struct btrfs_device **device_out)
{
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct rcu_string *name;
	u64 devid = BTRFS_DEV_REPLACE_DEVID;
	int ret = 0;

	*device_out = NULL;
	if (fs_info->fs_devices->seeding) {
		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
		return -EINVAL;
	}

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev)) {
		btrfs_err(fs_info, "target device %s is invalid!", device_path);
		return PTR_ERR(bdev);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			btrfs_err(fs_info,
				  "target device is in the filesystem!");
			ret = -EEXIST;
			goto error;
		}
	}

	if (i_size_read(bdev->bd_inode) <
	    btrfs_device_get_total_bytes(srcdev)) {
		btrfs_err(fs_info,
			  "target device is smaller than source device!");
		ret = -EINVAL;
		goto error;
	}

	device = btrfs_alloc_device(NULL, &devid, NULL);
	if (IS_ERR(device)) {
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		free_device(device);
		ret = -ENOMEM;
		goto error;
	}
	rcu_assign_pointer(device->name, name);

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = 0;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
	device->commit_total_bytes = srcdev->commit_total_bytes;
	device->commit_bytes_used = device->bytes_used;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
	device->fs_devices = fs_info->fs_devices;
	list_add(&device->dev_list, &fs_info->fs_devices->devices);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	*device_out = device;
	return ret;

error:
	blkdev_put(bdev, FMODE_EXCL);
	return ret;
}

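/* Write the in-memory device sizes back into the device item on disk. */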
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->fs_info->chunk_root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

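/*
 * Grow @device to @new_size (rounded down to a multiple of the sectorsize)
 * and bump the superblock total; shrinking or growing a replace target is
 * rejected with -EINVAL.
 */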
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_fs_devices *fs_devices;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	fs_devices = fs_info->fs_devices;

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_devices->resized_devices);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}

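/*
 * Remove the chunk at @chunk_offset from the sys_chunk_array embedded in
 * the superblock copy, shifting the remaining entries down.
 */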
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}

static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
					u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* callers are responsible for dropping em's ref. */
	return em;
}

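/*
 * Remove a chunk on disk: free the device extents of every stripe, delete
 * the chunk item (plus the sys_chunk_array entry for system chunks) and
 * finally the block group itself.
 */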
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, fs_info, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}

static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}

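/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk, retrying
 * the failed ones once if any relocation ran out of space.
 */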
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * Return 1 if a data chunk was allocated successfully, <0 if an error
 * occurred while allocating one, or 0 if there was no need to allocate
 * a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group_cache *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
		spin_lock(&fs_info->data_sinfo->lock);
		bytes_used = fs_info->data_sinfo->bytes_used;
		spin_unlock(&fs_info->data_sinfo->lock);

		if (!bytes_used) {
			struct btrfs_trans_handle *trans;
			int ret;

			trans =	btrfs_join_transaction(fs_info->tree_root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = btrfs_force_chunk_alloc(trans, fs_info,
						      BTRFS_BLOCK_GROUP_DATA);
			btrfs_end_transaction(trans);
			if (ret < 0)
				return ret;

			btrfs_add_raid_kobjects(fs_info);

			return 1;
		}
	}
	return 0;
}

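/*
 * Persist the current balance control as a balance item in the tree root,
 * so that an interrupted balance can be resumed.
 */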
static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if it is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Should be called with both balance and volume mutexes held to
 * serialize other volume operations (add_dev/rm_dev/resize) with
 * restriper.  Same goes for unset_balance_control.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;

	BUG_ON(fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
}

static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}

static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->key.offset,
					bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->key.offset;
	else
		user_thresh_max = div_factor_fine(cache->key.offset,
					bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;
	else
		user_thresh = div_factor_fine(cache->key.offset,
					      bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
		factor = num_stripes / 2;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
		factor = num_stripes - 1;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
		factor = num_stripes - 2;
	} else {
		factor = num_stripes;
	}

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
			&& num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

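/*
 * The main balance loop: first make some room on all devices, then walk the
 * chunk tree backwards in two passes - a counting pass that only collects
 * stats, and a relocation pass that moves every chunk accepted by
 * should_balance_chunk().
 */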
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct list_head *devices;
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and the min/max limits use the same bytes in
	 * the balance args, so save the single values here and restore them
	 * before the relocation pass.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	/* step one, make some room on all the devices */
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		old_size = btrfs_device_get_total_bytes(device);
		size_to_free = div_factor(old_size, 1);
		size_to_free = min_t(u64, size_to_free, SZ_1M);
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
		    btrfs_device_get_total_bytes(device) -
		    btrfs_device_get_bytes_used(device) > size_to_free ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
		if (ret == -ENOSPC)
			break;
		if (ret) {
			/* btrfs_shrink_device never returns ret > 0 */
			WARN_ON(ret > 0);
			goto error;
		}

		trans = btrfs_start_transaction(dev_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		ret = btrfs_grow_device(trans, device, old_size);
		if (ret) {
			btrfs_end_transaction(trans);
			/* btrfs_grow_device never returns ret > 0 */
			WARN_ON(ret > 0);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		btrfs_end_transaction(trans);
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and the min/max limits use the same
		 * bytes in the balance args; restore the single values for
		 * the relocation pass.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(fs_info, leaf, chunk,
					   found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (counting) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up losing the data
			 * raid profile, so let's allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto error;
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}

/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

	/* true if exactly one bit set */
	return (flags & (flags - 1)) == 0;
}

static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
}

static void __cancel_balance(struct btrfs_fs_info *fs_info)
{
	int ret;

	unset_balance_control(fs_info);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);

	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}

/* Non-zero return value signifies invalidity */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
		u64 allowed)
{
	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		(!alloc_profile_is_valid(bctl_arg->target, 1) ||
		 (bctl_arg->target & ~allowed)));
}

/*
 * Should be called with both balance and volume mutexes held
 */
int btrfs_balance(struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    atomic_read(&fs_info->balance_cancel_req)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
				  "with mixed groups data and metadata balance options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		BUG_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
	if (num_devices > 1)
		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
	if (num_devices > 2)
		allowed |= BTRFS_BLOCK_GROUP_RAID5;
	if (num_devices > 3)
		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
			    BTRFS_BLOCK_GROUP_RAID6);
	if (validate_convert_profile(&bctl->data, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target data profile %llu",
			  bctl->data.target);
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->meta, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target metadata profile %llu",
			  bctl->meta.target);
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->sys, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target system profile %llu",
			  bctl->sys.target);
		ret = -EINVAL;
		goto out;
	}

	/* allow reducing meta or sys integrity only if force is set */
	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
			BTRFS_BLOCK_GROUP_RAID10 |
			BTRFS_BLOCK_GROUP_RAID5 |
			BTRFS_BLOCK_GROUP_RAID6;
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed))) {
			if (bctl->flags & BTRFS_BALANCE_FORCE) {
				btrfs_info(fs_info,
					   "force reducing metadata integrity");
			} else {
				btrfs_err(fs_info,
					  "balance will reduce metadata integrity, use force if you want this");
				ret = -EINVAL;
				goto out;
			}
		}
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	/* if we're not converting, the target field is uninitialized */
	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->meta.target : fs_info->avail_metadata_alloc_bits;
	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->data.target : fs_info->avail_data_alloc_bits;
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
		btrfs_warn(fs_info,
			   "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
			   meta_target, data_target);
	}

	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		set_balance_control(bctl);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	atomic_inc(&fs_info->balance_running);
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	atomic_dec(&fs_info->balance_running);

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		update_ioctl_balance_args(fs_info, 0, bargs);
	}

	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		__cancel_balance(fs_info);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		__cancel_balance(fs_info);
	else {
		kfree(bctl);
		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	}
	return ret;
}

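/* Kthread entry point used to resume a previously interrupted balance. */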
static int balance_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	if (fs_info->balance_ctl) {
		btrfs_info(fs_info, "continuing balance");
		ret = btrfs_balance(fs_info->balance_ctl, NULL);
	}

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);

	return ret;
}

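/*
 * Spawn a kthread that resumes a paused balance, unless the skip_balance
 * mount option was given.
 */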
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	spin_lock(&fs_info->balance_lock);
	if (!fs_info->balance_ctl) {
		spin_unlock(&fs_info->balance_lock);
		return 0;
	}
	spin_unlock(&fs_info->balance_lock);

	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "force skipping balance");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who paused it, currently the system or the user, so
	 * set the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}

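/*
 * Rebuild the in-memory balance control from the balance item stored in the
 * tree root, if one exists, and mark it for resume; the balance itself is
 * not restarted here.
 */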
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->fs_info = fs_info;
	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	set_balance_control(bctl);

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

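/* Pause a running balance and wait for the relocation loop to stop. */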
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (atomic_read(&fs_info->balance_running)) {
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(atomic_read(&fs_info->balance_running));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	if (sb_rdonly(fs_info->sb))
		return -EROFS;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (atomic_read(&fs_info->balance_running)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);
		mutex_lock(&fs_info->balance_mutex);
	} else {
		/* __cancel_balance needs volume_mutex */
		mutex_unlock(&fs_info->balance_mutex);
		mutex_lock(&fs_info->volume_mutex);
		mutex_lock(&fs_info->balance_mutex);

		if (fs_info->balance_ctl)
			__cancel_balance(fs_info);

		mutex_unlock(&fs_info->volume_mutex);
	}

	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

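/*
 * Scan all root items and add any missing subvolume UUID and received UUID
 * entries to the UUID tree.
 */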
static int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
	struct btrfs_trans_handle *trans = NULL;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &key, path,
				BTRFS_OLDEST_GENERATION);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}

		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
			continue;
		} else {
			goto skip;
		}
update_tree:
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
			ret = btrfs_uuid_tree_add(trans, fs_info,
						  root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
			ret = btrfs_uuid_tree_add(trans, fs_info,
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

skip:
		if (trans) {
			ret = btrfs_end_transaction(trans);
			trans = NULL;
			if (ret)
				break;
		}

		btrfs_release_path(path);
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (ret)
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
	else
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}

/*
 * Callback for btrfs_uuid_tree_iterate().
 * returns:
 * 0	check succeeded, the entry is not outdated.
 * < 0	if an error occurred.
 * > 0	if the check failed, which means the caller shall remove the entry.
 */
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				       u8 *uuid, u8 type, u64 subid)
{
	struct btrfs_key key;
	int ret = 0;
	struct btrfs_root *subvol_root;

	if (type != BTRFS_UUID_KEY_SUBVOL &&
	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
		goto out;

	key.objectid = subid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(subvol_root)) {
		ret = PTR_ERR(subvol_root);
		if (ret == -ENOENT)
			ret = 1;
		goto out;
	}

	switch (type) {
	case BTRFS_UUID_KEY_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
			ret = 1;
		break;
	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.received_uuid,
			   BTRFS_UUID_SIZE))
			ret = 1;
		break;
	}

out:
	return ret;
}

static int btrfs_uuid_rescan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
	int ret;

	/*
	 * 1st step is to iterate through the existing UUID tree and
	 * to delete all entries that contain outdated data.
	 * 2nd step is to add all missing entries to the UUID tree.
	 */
	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
	if (ret < 0) {
		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
		up(&fs_info->uuid_tree_rescan_sem);
		return ret;
	}
	return btrfs_uuid_scan_kthread(data);
}

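/*
 * Create the UUID tree and start a kthread that populates it by scanning
 * all existing subvolumes.
 */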
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *uuid_root;
	struct task_struct *task;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	uuid_root = btrfs_create_tree(trans, fs_info,
				      BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(uuid_root)) {
		ret = PTR_ERR(uuid_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = uuid_root;

	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_rescan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	bool checked_pending_chunks = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;

	new_size = round_down(new_size, fs_info->sectorsize);
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

4516 4517 4518 4519
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;

	mutex_lock(&fs_info->chunk_mutex);

	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}
	mutex_unlock(&fs_info->chunk_mutex);

again:
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto done;
		if (ret) {
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up losing the data
		 * raid profile, so let's allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto done;
		if (ret == -ENOSPC)
			failed++;
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * In the loop above we checked all device extents that were already
	 * in the device tree. However, before we updated the device's
	 * total_bytes to the new size, chunk allocations might have been
	 * started that have not completed yet (new block groups attached to
	 * transaction handles), so their device extents were not yet in the
	 * device tree and we missed them in the loop above. If any pending
	 * chunk uses a device extent that overlaps the device range we can
	 * no longer use, commit the current transaction and repeat the
	 * search on the device tree - this way we guarantee we will not
	 * have chunks using device extents that end beyond 'new_size'.
	 */
	if (!checked_pending_chunks) {
		u64 start = new_size;
		u64 len = old_size - new_size;

		if (contains_pending_extent(trans->transaction, device,
					    &start, len)) {
			mutex_unlock(&fs_info->chunk_mutex);
			checked_pending_chunks = true;
			failed = 0;
			retried = false;
			ret = btrfs_commit_transaction(trans);
			if (ret)
				goto done;
			goto again;
		}
	}

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_info->fs_devices->resized_devices);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_end_transaction(trans);
done:
	btrfs_free_path(path);
	if (ret) {
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

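/*
 * Append a copy of a chunk item to the superblock's sys_chunk_array so that
 * system chunks can be found at mount time before the chunk tree is
 * readable. Returns -EFBIG if the array is full.
 */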
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size + sizeof(disk_key)
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EFBIG;
	}

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
	mutex_unlock(&fs_info->chunk_mutex);

	return 0;
}

/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;

	if (di_a->max_avail > di_b->max_avail)
		return -1;
	if (di_a->max_avail < di_b->max_avail)
		return 1;
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
}

static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
		return;

	btrfs_set_fs_incompat(info, RAID56);
}

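/*
 * Upper bounds on the number of stripes in one chunk item: limited by the
 * largest item that fits in a tree leaf and, for system chunks, by the
 * fixed size of the superblock's sys_chunk_array.
 */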
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)	\
			- sizeof(struct btrfs_chunk))		\
			/ sizeof(struct btrfs_stripe) + 1)

#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
				- 2 * sizeof(struct btrfs_disk_key)	\
				- 2 * sizeof(struct btrfs_chunk))	\
				/ sizeof(struct btrfs_stripe) + 1)

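/*
 * Create a new chunk of the given type at logical address 'start': gather
 * the free space available on each writable device, sort the devices,
 * choose the stripe count and size within the profile's constraints, then
 * set up the mapping and the block group. The chunk item itself is inserted
 * later by btrfs_finish_chunk_alloc().
 */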
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device *device;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_device_info *devices_info = NULL;
	u64 total_avail;
	int num_stripes;	/* total number of stripes to allocate */
	int data_stripes;	/* number of stripes that count for
				   block group size */
	int sub_stripes;	/* sub_stripes info for map */
	int dev_stripes;	/* stripes per dev */
	int devs_max;		/* max devs to use */
	int devs_min;		/* min devs needed */
	int devs_increment;	/* ndevs has to be a multiple of this */
	int ncopies;		/* how many copies to data has */
	int ret;
	u64 max_stripe_size;
	u64 max_chunk_size;
	u64 stripe_size;
	u64 num_bytes;
	int ndevs;
	int i;
	int j;
	int index;

	BUG_ON(!alloc_profile_is_valid(type, 0));

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return -ENOSPC;
	}

	index = btrfs_bg_flags_to_raid_index(type);

	sub_stripes = btrfs_raid_array[index].sub_stripes;
	dev_stripes = btrfs_raid_array[index].dev_stripes;
	devs_max = btrfs_raid_array[index].devs_max;
	devs_min = btrfs_raid_array[index].devs_min;
	devs_increment = btrfs_raid_array[index].devs_increment;
	ncopies = btrfs_raid_array[index].ncopies;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_stripe_size = SZ_1G;
		max_chunk_size = 10 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* for larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			max_stripe_size = SZ_1G;
		else
			max_stripe_size = SZ_256M;
		max_chunk_size = max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		max_stripe_size = SZ_32M;
		max_chunk_size = 2 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
	} else {
		btrfs_err(info, "invalid chunk type 0x%llx requested",
		       type);
		BUG_ON(1);
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return -ENOMEM;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	ndevs = 0;
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 max_avail;
		u64 dev_offset;

		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail == 0)
			continue;

		ret = find_free_dev_extent(trans, device,
					   max_stripe_size * dev_stripes,
					   &dev_offset, &max_avail);
		if (ret && ret != -ENOSPC)
			goto error;

		if (ret == 0)
			max_avail = max_stripe_size * dev_stripes;

		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%u",
					    __func__, device->devid, max_avail,
					    BTRFS_STRIPE_LEN * dev_stripes);
			continue;
		}

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	/* round down to number of usable stripes */
	ndevs = round_down(ndevs, devs_increment);

	if (ndevs < devs_min) {
		ret = -ENOSPC;
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ndevs, devs_min);
		}
		goto error;
	}

	ndevs = min(ndevs, devs_max);

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, the
	 * max_avail is the total size so we have to adjust.
	 */
	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
	num_stripes = ndevs * dev_stripes;

	/*
	 * this will have to be fixed for RAID1 and RAID10 over
	 * more drives
	 */
	data_stripes = num_stripes / ncopies;

	if (type & BTRFS_BLOCK_GROUP_RAID5)
		data_stripes = num_stripes - 1;

	if (type & BTRFS_BLOCK_GROUP_RAID6)
		data_stripes = num_stripes - 2;

	/*
	 * Use the number of data stripes to figure out how big this chunk
	 * is really going to be in terms of logical address space,
	 * and compare that answer with the max chunk size
	 */
	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = div_u64(max_chunk_size, data_stripes);

		/* bump the answer up to a 16MB boundary */
		stripe_size = round_up(stripe_size, SZ_16M);

		/*
		 * But don't go higher than the limits we found while searching
		 * for free extents
		 */
		stripe_size = min(devices_info[ndevs - 1].max_avail,
				  stripe_size);
	}

	/* align to BTRFS_STRIPE_LEN */
	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		ret = -ENOMEM;
		goto error;
	}
	map->num_stripes = num_stripes;

	for (i = 0; i < ndevs; ++i) {
		for (j = 0; j < dev_stripes; ++j) {
			int s = i * dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * stripe_size;
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = sub_stripes;

	num_bytes = stripe_size * data_stripes;

	trace_btrfs_chunk_alloc(info, map, start, num_bytes);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		ret = -ENOMEM;
		goto error;
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = num_bytes;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = stripe_size;

	em_tree = &info->mapping_tree.map_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		free_extent_map(em);
		goto error;
	}

	list_add_tail(&em->list, &trans->transaction->pending_chunks);
	refcount_inc(&em->refs);
	write_unlock(&em_tree->lock);

	ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
	if (ret)
		goto error_del_extent;

	for (i = 0; i < map->num_stripes; i++) {
		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
	}

	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);

	free_extent_map(em);
	check_raid56_incompat_flag(info, type);

	kfree(devices_info);
	return 0;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);
	/* One for the pending_chunks list reference */
	free_extent_map(em);
error:
	kfree(devices_info);
	return ret;
}

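/*
 * Second phase of chunk allocation: allocate the device extents, update the
 * device items, insert the chunk item into the chunk tree and, for system
 * chunks, mirror it into the superblock's sys_chunk_array.
 */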
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				u64 chunk_offset, u64 chunk_size)
{
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_device *device;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	u64 dev_offset;
	u64 stripe_size;
	int i = 0;
	int ret = 0;

	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);
	stripe_size = em->orig_block_len;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with the map's stripes, because the device object's id can change
	 * at any time during that final phase of the device replace operation
	 * (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		ret = btrfs_update_device(trans, device);
		if (ret)
			break;
		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
					     dev_offset, stripe_size);
		if (ret)
			break;
	}
	if (ret) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		goto out;
	}

	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		/*
		 * TODO: Cleanup of inserted chunk root in case of
		 * failure.
		 */
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}

/*
 * Chunk allocation falls into two parts. The first part does the work
 * that makes the newly allocated chunk usable, but does not do any
 * operation that modifies the chunk tree. The second part does the work
 * that requires modifying the chunk tree. This division is important for
 * the bootstrap process of adding storage to a seed btrfs.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info, u64 type)
{
	u64 chunk_offset;

	lockdep_assert_held(&fs_info->chunk_mutex);
	chunk_offset = find_next_chunk(fs_info);
	return __btrfs_alloc_chunk(trans, chunk_offset, type);
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 alloc_profile;
	int ret;

	chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
	if (ret)
		return ret;

	sys_chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_system_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
	return ret;
}

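/* Return how many missing or failed devices the given mapping can tolerate. */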
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
	int max_errors;

	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
			 BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID5 |
			 BTRFS_BLOCK_GROUP_DUP)) {
		max_errors = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
		max_errors = 2;
	} else {
		max_errors = 0;
	}

	return max_errors;
}

int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	int readonly = 0;
	int miss_ndevs = 0;
	int i;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return 1;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; i++) {
		if (test_bit(BTRFS_DEV_STATE_MISSING,
					&map->stripes[i].dev->dev_state)) {
			miss_ndevs++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&map->stripes[i].dev->dev_state)) {
			readonly = 1;
			goto end;
		}
	}

	/*
	 * If the number of missing devices is larger than max errors,
	 * we cannot write the data into that chunk successfully, so
	 * set it readonly.
	 */
	if (miss_ndevs > btrfs_chunk_max_errors(map))
		readonly = 1;
end:
	free_extent_map(em);
	return readonly;
}

void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
	extent_map_tree_init(&tree->map_tree);
}

void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
	struct extent_map *em;

	while (1) {
		write_lock(&tree->map_tree.lock);
		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(&tree->map_tree, em);
		write_unlock(&tree->map_tree.lock);
		if (!em)
			break;
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

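/*
 * Return how many copies of the data in the given logical range exist,
 * counting the target of an ongoing device replace as an extra copy.
 */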
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

	em = get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;

	map = em->map_lookup;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
L
Liu Bo 已提交
5264 5265 5266 5267 5268 5269 5270 5271
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
		 * 
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
5272 5273 5274
	else
		ret = 1;
	free_extent_map(em);
5275

5276
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
5277 5278
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
	    fs_info->dev_replace.tgtdev)
5279
		ret++;
5280
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
5281

5282 5283 5284
	return ret;
}
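
/*
 * Worked example (illustrative numbers): a RAID10 chunk reports
 * map->sub_stripes copies, typically 2, while a RAID6 chunk reports
 * map->num_stripes so the read-retry loop can try every stripe.  While
 * a device replace is running with a target device attached, the count
 * above is bumped by one for the extra copy on the target.
 */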

unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
	unsigned long len = fs_info->sectorsize;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = map->stripe_len * nr_data_stripes(map);
		free_extent_map(em);
	}
	return len;
}
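
/*
 * Worked example (illustrative numbers): a RAID5 chunk on four devices
 * has nr_data_stripes() == 3, so with the default 64KiB stripe_len the
 * function above returns a full stripe length of 3 * 64KiB = 192KiB.
 */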

int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret = 0;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
	return ret;
}

static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	preferred_mirror = first + current->pid % num_stripes;

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * Try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available.
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/*
	 * We couldn't find one that doesn't fail.  Just return something
	 * and the I/O error handling code will clean up eventually.
	 */
	return preferred_mirror;
}
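
/*
 * Worked example (illustrative numbers) for the preferred-mirror pick
 * above: on a two-way RAID1 chunk, first == 0 and num_stripes == 2, so
 * a task with an even PID prefers stripe 0 and one with an odd PID
 * prefers stripe 1, spreading reads across the mirrors per task.
 */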

static inline int parity_smaller(u64 a, u64 b)
{
	return a > b;
}

/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
	struct btrfs_bio_stripe s;
	int i;
	u64 l;
	int again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < num_stripes - 1; i++) {
			if (parity_smaller(bbio->raid_map[i],
					   bbio->raid_map[i+1])) {
				s = bbio->stripes[i];
				l = bbio->raid_map[i];
				bbio->stripes[i] = bbio->stripes[i+1];
				bbio->raid_map[i] = bbio->raid_map[i+1];
				bbio->stripes[i+1] = s;
				bbio->raid_map[i+1] = l;
				again = 1;
			}
		}
	}
}
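
/*
 * The sort above works because the raid_map entries for the P and Q
 * stripes hold the sentinels RAID5_P_STRIPE and RAID6_Q_STRIPE, which
 * compare greater than any real logical address, so sorting ascending
 * pushes the parity/syndrome stripes behind the data stripes.
 */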

static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
	struct btrfs_bio *bbio = kzalloc(
		 /* the size of the btrfs_bio */
		sizeof(struct btrfs_bio) +
		/* plus the variable array for the stripes */
		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
		/* plus the variable array for the tgt dev */
		sizeof(int) * (real_stripes) +
		/*
		 * plus the raid_map, which includes both the tgt dev
		 * and the stripes
		 */
		sizeof(u64) * (total_stripes),
		GFP_NOFS|__GFP_NOFAIL);

	atomic_set(&bbio->error, 0);
	refcount_set(&bbio->refs, 1);

	return bbio;
}

void btrfs_get_bbio(struct btrfs_bio *bbio)
{
	WARN_ON(!refcount_read(&bbio->refs));
	refcount_inc(&bbio->refs);
}

void btrfs_put_bbio(struct btrfs_bio *bbio)
{
	if (!bbio)
		return;
	if (refcount_dec_and_test(&bbio->refs))
		kfree(bbio);
}

/*
 * Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE?
 *
 * Please note that discard won't be sent to the target device of a
 * device replace.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 struct btrfs_bio **bbio_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_bio *bbio;
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u64 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret = 0;
	int i;

	/* discard always returns a bbio */
	ASSERT(bbio_ret);

	em = get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	offset = logical - em->start;
	length = min_t(u64, em->len - offset, length);

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;

	stripe_nr_end = round_up(offset + length, map->stripe_len);
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
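
	/*
	 * Worked example (illustrative numbers): with stripe_len = 64K,
	 * offset = 96K and length = 160K, stripe_nr = 1 and stripe_offset
	 * = 32K; the range ends at 256K, so stripe_nr_end = 4, stripe_cnt
	 * = 3 and stripe_end_offset = 0.
	 */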
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = map->num_stripes;
	} else {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	bbio = alloc_btrfs_bio(num_stripes, 0);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			bbio->stripes[i].length = stripes_per_dev *
				map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				bbio->stripes[i].length +=
					map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				bbio->stripes[i].length -=
					stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				bbio->stripes[i].length -=
					stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			bbio->stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}

/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_bio *bbio = NULL;
	int num_stripes;
	int index_srcdev = 0;
	int found = 0;
	u64 physical_of_found = 0;
	int i;
	int ret = 0;

	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bbio, 0, 0);
	if (ret) {
		ASSERT(bbio == NULL);
		return ret;
	}

	num_stripes = bbio->num_stripes;
	if (*mirror_num > num_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < num_stripes; i++) {
		if (bbio->stripes[i].dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (found &&
		    physical_of_found <= bbio->stripes[i].physical)
			continue;

		index_srcdev = i;
		found = 1;
		physical_of_found = bbio->stripes[i].physical;
	}

	btrfs_put_bbio(bbio);

	ASSERT(found);
	if (!found)
		return -EIO;

	*mirror_num = index_srcdev + 1;
	*physical = physical_of_found;
	return ret;
}

static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_bio **bbio_ret,
				      struct btrfs_dev_replace *dev_replace,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_bio *bbio = *bbio_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int tgtdev_indexes = 0;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int index_where_to_add;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_bio_stripe *new =
					bbio->stripes + index_where_to_add;
				struct btrfs_bio_stripe *old =
					bbio->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				bbio->tgtdev_map[i] = index_where_to_add;
				index_where_to_add++;
				max_errors++;
				tgtdev_indexes++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it simple,
				 * only add the mirror with the lowest physical
				 * address
				 */
				if (found &&
				    physical_of_found <=
				     bbio->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bbio->stripes[i].physical;
			}
		}
		if (found) {
			struct btrfs_bio_stripe *tgtdev_stripe =
				bbio->stripes + num_stripes;

			tgtdev_stripe->physical = physical_of_found;
			tgtdev_stripe->length =
				bbio->stripes[index_srcdev].length;
			tgtdev_stripe->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[index_srcdev] = num_stripes;

			tgtdev_indexes++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bbio->num_tgtdevs = tgtdev_indexes;
	*bbio_ret = bbio;
}
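
/*
 * Worked example (illustrative devids): a RAID1 write maps to stripes on
 * devid 1 and devid 2.  If devid 1 is the replace source, the WRITE
 * branch above appends a third stripe that mirrors stripe 0 onto the
 * target device, records tgtdev_map[0] = 2, and bumps max_errors since
 * the extra copy is allowed to fail.
 */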

static bool need_full_stripe(enum btrfs_map_op op)
{
	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}
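
/*
 * Writes and BTRFS_MAP_GET_READ_MIRRORS must see every stripe of the
 * chunk; plain reads only need a single copy.
 */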

static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u32 stripe_index;
	int i;
	int ret = 0;
	int num_stripes;
	int max_errors = 0;
	int tgtdev_indexes = 0;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	int num_alloc_stripes;
	int patch_the_first_stripe_for_dev_replace = 0;
	u64 physical_to_patch_in_first_stripe = 0;
	u64 raid56_full_stripe_start = (u64)-1;

	if (op == BTRFS_MAP_DISCARD)
		return __btrfs_map_block_for_discard(fs_info, logical,
						     *length, bbio_ret);

	em = get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	offset = logical - em->start;

	stripe_len = map->stripe_len;
	stripe_nr = offset;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(stripe_nr, stripe_len);

	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
			   "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
			   stripe_offset, offset, em->start, logical,
			   stripe_len);
		free_extent_map(em);
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;

	/* if we're here for raid56, we need to know the stripe aligned start */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
		raid56_full_stripe_start = offset;

		/*
		 * Allow a write of a full stripe, but make sure we don't
		 * allow straddling of stripes.
		 */
		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
				full_stripe_len);
		raid56_full_stripe_start *= full_stripe_len;
	}

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len;
		/*
		 * For writes to RAID[56], allow a full stripeset across
		 * all disks.  For other RAID types and for RAID[56]
		 * reads, just allow a single stripe (on a single disk).
		 */
		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
		    (op == BTRFS_MAP_WRITE)) {
			max_len = stripe_len * nr_data_stripes(map) -
				(offset - raid56_full_stripe_start);
		} else {
			/* we limit the length of each bio to what fits in a stripe */
			max_len = stripe_len - stripe_offset;
		}
		*length = min_t(u64, em->len - offset, max_len);
	} else {
		*length = em->len - offset;
	}

	/*
	 * This is for when we're called from btrfs_merge_bio_hook() and all
	 * it cares about is the length.
	 */
	if (!bbio_ret)
		goto out;

	btrfs_dev_replace_read_lock(dev_replace);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (!dev_replace_is_ongoing)
		btrfs_dev_replace_read_unlock(dev_replace);
	else
		btrfs_dev_replace_set_lock_blocking(dev_replace);

	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
						    dev_replace->srcdev->devid,
						    &mirror_num,
					    &physical_to_patch_in_first_stripe);
		if (ret)
			goto out;
		else
			patch_the_first_stripe_for_dev_replace = 1;
	} else if (mirror_num > map->num_stripes) {
		mirror_num = 0;
	}

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		if (!need_full_stripe(op))
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		if (need_full_stripe(op))
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
		else {
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (need_full_stripe(op)) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= map->sub_stripes;

		if (need_full_stripe(op))
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
			/* push stripe_nr back to the start of the full stripe */
			stripe_nr = div64_u64(raid56_full_stripe_start,
					stripe_len * nr_data_stripes(map));

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = nr_parity_stripes(map);

			*length = map->stripe_len;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_nr = div_u64_rem(stripe_nr,
					nr_data_stripes(map), &stripe_index);
			if (mirror_num > 1)
				stripe_index = nr_data_stripes(map) +
						mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
					&stripe_index);
			if (!need_full_stripe(op) && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * after this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		mirror_num = stripe_index + 1;
	}
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
		if (op == BTRFS_MAP_WRITE)
			num_alloc_stripes <<= 1;
		if (op == BTRFS_MAP_GET_READ_MIRRORS)
			num_alloc_stripes++;
		tgtdev_indexes = num_stripes;
	}

	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);

	/* build raid_map */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
		u64 tmp;
		unsigned rot;

		bbio->raid_map = (u64 *)((void *)bbio->stripes +
				 sizeof(struct btrfs_bio_stripe) *
				 num_alloc_stripes +
				 sizeof(int) * tgtdev_indexes);

		/* Work out the disk rotation on this stripe-set */
		div_u64_rem(stripe_nr, num_stripes, &rot);

		/* Fill in the logical address of each stripe */
		tmp = stripe_nr * nr_data_stripes(map);
		for (i = 0; i < nr_data_stripes(map); i++)
			bbio->raid_map[(i+rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bbio->raid_map[(i+rot+1) % num_stripes] =
				RAID6_Q_STRIPE;
	}
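
	/*
	 * Worked example (illustrative numbers): a RAID5 chunk on three
	 * devices has nr_data_stripes() == 2.  For the second full stripe
	 * (stripe_nr == 1), rot == 1, so data stripe 0 lands in slot 1,
	 * data stripe 1 in slot 2 and RAID5_P_STRIPE in slot 0; the
	 * parity rotates by one device per full stripe.
	 */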

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset +
			stripe_nr * map->stripe_len;
		bbio->stripes[i].dev =
			map->stripes[stripe_index].dev;
		stripe_index++;
	}

	if (need_full_stripe(op))
		max_errors = btrfs_chunk_max_errors(map);

	if (bbio->raid_map)
		sort_parity_stripes(bbio, num_stripes);

	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    need_full_stripe(op)) {
		handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
					  &max_errors);
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
out:
	if (dev_replace_is_ongoing) {
		btrfs_dev_replace_clear_lock_blocking(dev_replace);
		btrfs_dev_replace_read_unlock(dev_replace);
	}
	free_extent_map(em);
	return ret;
}

int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		      u64 logical, u64 *length,
		      struct btrfs_bio **bbio_ret, int mirror_num)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				 mirror_num, 0);
}

/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		     u64 logical, u64 *length,
		     struct btrfs_bio **bbio_ret)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}

int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	u64 rmap_len;
	int i, j, nr = 0;

	em = get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	length = em->len;
	rmap_len = map->stripe_len;

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		length = div_u64(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		length = div_u64(length, map->num_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		length = div_u64(length, nr_data_stripes(map));
		rmap_len = map->stripe_len * nr_data_stripes(map);
	}

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	BUG_ON(!buf); /* -ENOMEM */

	for (i = 0; i < map->num_stripes; i++) {
		if (devid && map->stripes[i].dev->devid != devid)
			continue;
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		} /* else if RAID[56], multiply by nr_data_stripes().
		   * Alternatively, just use rmap_len below instead of
		   * map->stripe_len */

		bytenr = chunk_start + stripe_nr * rmap_len;
		WARN_ON(nr >= map->num_stripes);
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = rmap_len;

	free_extent_map(em);
	return 0;
}

static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
	bio->bi_private = bbio->private;
	bio->bi_end_io = bbio->end_io;
	bio_endio(bio);

	btrfs_put_bbio(bbio);
}

static void btrfs_end_bio(struct bio *bio)
{
	struct btrfs_bio *bbio = bio->bi_private;
	int is_orig_bio = 0;

	if (bio->bi_status) {
		atomic_inc(&bbio->error);
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
			unsigned int stripe_index =
				btrfs_io_bio(bio)->stripe_index;
			struct btrfs_device *dev;

			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
			if (dev->bdev) {
				if (bio_op(bio) == REQ_OP_WRITE)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_WRITE_ERRS);
				else
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_READ_ERRS);
				if (bio->bi_opf & REQ_PREFLUSH)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_FLUSH_ERRS);
			}
		}
	}

	if (bio == bbio->orig_bio)
		is_orig_bio = 1;

	btrfs_bio_counter_dec(bbio->fs_info);

	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = bbio->orig_bio;
		}

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		/*
		 * Only send an error to the higher layers if it is
		 * beyond the tolerance of the btrfs bio.
		 */
		if (atomic_read(&bbio->error) > bbio->max_errors) {
			bio->bi_status = BLK_STS_IOERR;
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			bio->bi_status = BLK_STS_OK;
		}

		btrfs_end_bbio(bbio, bio);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}

/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
static noinline void btrfs_schedule_bio(struct btrfs_device *device,
					struct bio *bio)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	int should_queue = 1;
	struct btrfs_pending_bios *pending_bios;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
	    !device->bdev) {
		bio_io_error(bio);
		return;
	}

	/* don't bother with additional async steps for reads, right now */
	if (bio_op(bio) == REQ_OP_READ) {
		btrfsic_submit_bio(bio);
		return;
	}

	WARN_ON(bio->bi_next);
	bio->bi_next = NULL;

	spin_lock(&device->io_lock);
	if (op_is_sync(bio->bi_opf))
		pending_bios = &device->pending_sync_bios;
	else
		pending_bios = &device->pending_bios;

	if (pending_bios->tail)
		pending_bios->tail->bi_next = bio;

	pending_bios->tail = bio;
	if (!pending_bios->head)
		pending_bios->head = bio;
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
		btrfs_queue_work(fs_info->submit_workers, &device->work);
}

static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
			      u64 physical, int dev_nr, int async)
{
	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	bio->bi_private = bbio;
	btrfs_io_bio(bio)->stripe_index = dev_nr;
	bio->bi_end_io = btrfs_end_bio;
	bio->bi_iter.bi_sector = physical >> 9;
#ifdef DEBUG
	{
		struct rcu_string *name;

		rcu_read_lock();
		name = rcu_dereference(dev->name);
		btrfs_debug(fs_info,
			"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
			bio_op(bio), bio->bi_opf,
			(u64)bio->bi_iter.bi_sector,
			(u_long)dev->bdev->bd_dev, name->str, dev->devid,
			bio->bi_iter.bi_size);
		rcu_read_unlock();
	}
#endif
	bio_set_dev(bio, dev->bdev);

	btrfs_bio_counter_inc_noblocked(fs_info);

	if (async)
		btrfs_schedule_bio(dev, bio);
	else
		btrfsic_submit_bio(bio);
}

static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		/* Should be the original bio. */
		WARN_ON(bio != bbio->orig_bio);

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		bio->bi_iter.bi_sector = logical >> 9;
		if (atomic_read(&bbio->error) > bbio->max_errors)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_status = BLK_STS_OK;
		btrfs_end_bbio(bbio, bio);
	}
}

blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
			   int mirror_num, int async_submit)
{
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	int dev_nr;
	int total_devs;
	struct btrfs_bio *bbio = NULL;

	length = bio->bi_iter.bi_size;
	map_length = length;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				&map_length, &bbio, mirror_num, 1);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
	bbio->fs_info = fs_info;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
	    ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
		/*
		 * In this case, map_length has been set to the length of
		 * a single stripe; not the whole write.
		 */
		if (bio_op(bio) == REQ_OP_WRITE) {
			ret = raid56_parity_write(fs_info, bio, bbio,
						  map_length);
		} else {
			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
		}

		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
		dev = bbio->stripes[dev_nr].dev;
		if (!dev || !dev->bdev ||
		    (bio_op(first_bio) == REQ_OP_WRITE &&
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
			bbio_error(bbio, first_bio, logical);
			continue;
		}

		if (dev_nr < total_devs - 1)
			bio = btrfs_bio_clone(first_bio);
		else
			bio = first_bio;

		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				  dev_nr, async_submit);
	}
	btrfs_bio_counter_dec(fs_info);
	return BLK_STS_OK;
}
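
/*
 * Note on the submission above: the original bio is used for the last
 * stripe and clones are made for the others; completions are gathered
 * in btrfs_end_bio() via bbio->stripes_pending, so the caller's
 * bi_end_io runs once after all stripes finish.
 */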

struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;

	cur_devices = fs_info->fs_devices;
	while (cur_devices) {
		if (!fsid ||
		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
			device = find_device(cur_devices, devid, uuid);
			if (device)
				return device;
		}
		cur_devices = cur_devices->seed;
	}
	return NULL;
}

static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;

	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}

/**
 * btrfs_alloc_device - allocate struct btrfs_device
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and must be
 * destroyed with free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = __alloc_device();
	if (IS_ERR(dev))
		return dev;

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	btrfs_init_work(&dev->work, btrfs_submit_helper,
			pending_bios_fn, NULL, NULL);

	return dev;
}

/* Return -EIO if any error, otherwise return 0. */
static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf,
				   struct btrfs_chunk *chunk, u64 logical)
{
	u64 length;
	u64 stripe_len;
	u16 num_stripes;
	u16 sub_stripes;
	u64 type;

	length = btrfs_chunk_length(leaf, chunk);
	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);

	if (!num_stripes) {
		btrfs_err(fs_info, "invalid chunk num_stripes: %u",
			  num_stripes);
		return -EIO;
	}
	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk logical %llu", logical);
		return -EIO;
	}
	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
		btrfs_err(fs_info, "invalid chunk sectorsize %u",
			  btrfs_chunk_sector_size(leaf, chunk));
		return -EIO;
	}
	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk length %llu", length);
		return -EIO;
	}
	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
		btrfs_err(fs_info, "invalid chunk stripe length: %llu",
			  stripe_len);
		return -EIO;
	}
	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
	    type) {
		btrfs_err(fs_info, "unrecognized chunk type: %llu",
			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
			  btrfs_chunk_type(leaf, chunk));
		return -EIO;
	}
	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
	     num_stripes != 1)) {
		btrfs_err(fs_info,
			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
			num_stripes, sub_stripes,
			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
		return -EIO;
	}

	return 0;
}

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}

static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
	if (ret)
		return ret;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
	read_unlock(&map_tree->map_tree.lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(fs_info, devid,
							uuid, NULL);
		if (!map->stripes[i].dev &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			free_extent_map(em);
			btrfs_report_missing_device(fs_info, devid, uuid, true);
			return -ENOENT;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
			}
			btrfs_report_missing_device(fs_info, devid, uuid, false);
		}
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));

	}

	write_lock(&map_tree->map_tree.lock);
	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
	write_unlock(&map_tree->map_tree.lock);
	BUG_ON(ret); /* Tree corruption */
	free_extent_map(em);

	return 0;
}

static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	fs_devices = fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;

		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = 1;
		fs_devices->opened = 1;
		return fs_devices;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				   fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(ret);
		goto out;
	}

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(-EINVAL);
		goto out;
	}

	fs_devices->seed = fs_info->fs_devices->seed;
	fs_info->fs_devices->seed = fs_devices;
out:
	return fs_devices;
}

static int read_one_dev(struct btrfs_fs_info *fs_info,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);

	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * the BTRFS_DEV_STATE_MISSING bit here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}

int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	set_extent_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the
	 * system array.  set_extent_buffer_uptodate() does not properly
	 * mark all its pages up-to-date when the page is larger: the
	 * extent does not cover the whole page and consequently
	 * check_page_uptodate does not find all the page's extents
	 * up-to-date (the hole beyond sb), write_extent_buffer then
	 * triggers a WARN_ON.
	 *
	 * Regular short extents go through the mark_extent_buffer_dirty/
	 * writeback cycle, but sb spans only this function.  Add an
	 * explicit SetPageUptodate call to silence the warning eg. on
	 * PowerPC 64.
	 */
	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_array_offset;
			/*
			 * At least one btrfs_chunk with one stripe must be
			 * present, exact stripe count check comes afterwards
			 */
			len = btrfs_chunk_item_size(1);
			if (cur_offset + len > array_size)
				goto out_short_read;

			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			if (!num_stripes) {
				btrfs_err(fs_info,
					"invalid number of stripes %u in sys_array at offset %u",
					num_stripes, cur_offset);
				ret = -EIO;
				break;
			}

			type = btrfs_chunk_type(sb, chunk);
			if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
				btrfs_err(fs_info,
			    "invalid chunk type %llu in sys_array at offset %u",
					type, cur_offset);
				ret = -EIO;
				break;
			}

			len = btrfs_chunk_item_size(num_stripes);
			if (cur_offset + len > array_size)
				goto out_short_read;

			ret = read_one_chunk(fs_info, &key, sb, chunk);
			if (ret)
				break;
		} else {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
			len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}

/*
 * Check if all chunks in the fs are OK for a read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
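/*
 * Illustrative example: a RAID1 chunk tolerates one missing device. If one
 * of its two stripes already sits on a missing device and @failing_dev holds
 * the other, missing (2) exceeds max_tolerated (1) and the check fails.
 */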
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
	read_unlock(&map_tree->map_tree.lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->map_tree.lock);
		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->map_tree.lock);
	}
out:
	return ret;
}

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
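	/*
	 * A sketch of the key space being scanned:
	 *
	 *   (BTRFS_DEV_ITEMS_OBJECTID = 1, BTRFS_DEV_ITEM_KEY, devid)
	 *   (BTRFS_FIRST_CHUNK_TREE_OBJECTID = 256, BTRFS_CHUNK_ITEM_KEY,
	 *    chunk logical start)
	 *
	 * so one forward scan from key (1, 0, 0) visits every device item
	 * before the first chunk item.
	 */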
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(fs_info, leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading the chunk tree, we've got all device information;
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

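	/* Walk this filesystem and any seed filesystems chained via ->seed */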
	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->fs_info = fs_info;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = BTRFS_DEV_STATS_OBJECTID;
		key.type = BTRFS_PERSISTENT_ITEM_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);
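		/*
		 * The on-disk item is just an array of __le64 counters
		 * (struct btrfs_dev_stats_item). An item written by an older
		 * kernel may be shorter, so take each value only if the item
		 * actually covers it and reset the rest.
		 */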

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}
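	/*
	 * An existing on-disk item smaller than the current struct
	 * btrfs_dev_stats_item (e.g. written before more counters were
	 * added) cannot be updated in place: delete it and insert a fresh,
	 * full-sized item below.
	 */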

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;


		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values, which
		 * requires reading the in-memory counters. Such control
		 * dependencies require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();
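		/*
		 * For reference, a simplified sketch of the writer side this
		 * pairs with (btrfs_dev_stat_inc() in volumes.h):
		 *
		 *	atomic_inc(dev->dev_stat_values + index);
		 *	smp_mb__before_atomic();
		 *	atomic_inc(&dev->dev_stats_ccnt);
		 */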

		ret = update_dev_stat_item(trans, fs_info, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

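/*
 * Summary of the semantics below: with BTRFS_DEV_STATS_RESET set the
 * counters are returned and atomically zeroed (read-and-reset), otherwise
 * they are only read; stats->nr_items is clamped to
 * BTRFS_DEV_STAT_VALUES_MAX on return.
 */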
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

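	/*
	 * Wipe the magic from every superblock copy. Assuming the usual
	 * mirror layout of btrfs_sb_offset() (primary at 64KiB, mirrors at
	 * 64MiB and 256GiB), btrfs_read_dev_one_super() simply fails for
	 * copies that lie beyond the end of this device and they are
	 * skipped.
	 */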
	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
		copy_num++) {

		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
		brelse(bh);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

/*
 * Update the size of all devices, which is used for writing out the
 * super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);
}

/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

	if (list_empty(&trans->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(em, &trans->pending_chunks, list) {
		map = em->map_lookup;

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
}

void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = fs_info;
		fs_devices = fs_devices->seed;
	}
}

void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = NULL;
		fs_devices = fs_devices->seed;
	}
}