// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 * condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * volume_mutex
 * ------------
 * coarse lock owned by a mounted filesystem; used to exclude some operations
 * that cannot run in parallel and affect the higher-level properties of the
 * filesystem like: device add/deleting/resize/replace, or balance
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * While an operation is in the Paused state, BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled
 * or completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

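/*
 * Free a btrfs_device: releases the RCU-protected name string and the
 * preallocated flush bio before freeing the structure itself.
 */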
void btrfs_free_device(struct btrfs_device *device)
{
	rcu_string_free(device->name);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

	return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
		u64 devid, const u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

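/*
 * Look up the fs_devices entry on the global fs_uuids list whose fsid
 * matches @fsid, or return NULL if none is registered.
 */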
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

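/*
 * Open the block device at @device_path, optionally flush its dirty pages,
 * set the btrfs block size and read the primary super block into @bh.
 * On failure both @bdev and @bh are cleared.
 */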
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

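/* Put a chain of bios (@head to @tail) back at the front of the pending list. */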
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = device->bdev->bd_bdi;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur);
		num_run++;
		batch_run++;

		cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

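/* Work-queue callback: submit the bios collected for this device. */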
static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 */
static void btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_dev)
{
	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
	struct btrfs_device *dev, *tmp_dev;

	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) {

		if (fs_devs->opened)
			continue;

		list_for_each_entry_safe(dev, tmp_dev,
					 &fs_devs->devices, dev_list) {
			int not_found = 0;

			if (skip_dev && skip_dev == dev)
				continue;
			if (path && !dev->name)
				continue;

			rcu_read_lock();
			if (path)
				not_found = strcmp(rcu_str_deref(dev->name),
						   path);
			rcu_read_unlock();
			if (not_found)
				continue;

			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->fs_list);
				free_fs_devices(fs_devs);
				break;
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				btrfs_free_device(dev);
			}
		}
	}
}

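/*
 * Open one device for a mount: read its super block, verify the devid and
 * uuid still match what we expect, then record the opened bdev and update
 * the fs_devices state (seeding, rotating, open/rw device counts).
 */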
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &bh);
	if (ret)
		return ret;

	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_brelse;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_brelse;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = 1;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = 1;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	brelse(bh);

	return 0;

error_brelse:
	brelse(bh);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return ERR_PTR(-EBUSY);

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		btrfs_free_stale_devices(path, device);

		if (disk_super->label[0])
			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				disk_super->label, devid, found_transid, path);
		else
			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				disk_super->fsid, devid, found_transid, path);

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			return ERR_PTR(-EEXIST);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return ERR_PTR(-ENOMEM);
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	return device;
}

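/*
 * Deep-copy an fs_devices structure: allocate a new btrfs_device for every
 * device on @orig, duplicating its devid, uuid and name.
 */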
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
							&device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
			     &device->dev_state) &&
			     (!latest_dev ||
			      device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void free_device_rcu(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);
	btrfs_free_device(device);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

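/*
 * Replace @device in the RCU-visible device list with a freshly allocated
 * placeholder carrying the same devid/uuid/name, and drop the counters the
 * old device contributed to fs_devices.
 */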
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * btrfs_show_devname() uses the device_list_mutex, and a call to
	 * blkdev_put() can sometimes lead the VFS back into this function.
	 * So, for now, do the put outside of device_list_mutex.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
		call_rcu(&device->rcu, free_device_rcu);
	}

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

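/* list_sort() comparator: order devices by ascending devid. */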
static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

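/*
 * Read a super block candidate at @bytenr through the block device's page
 * cache and sanity-check its bytenr and magic. Returns 0 with *page mapped
 * on success, nonzero otherwise.
 */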
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct page *page;
	int ret = 0;
	u64 bytenr;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
		ret = -EINVAL;
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super);
	if (IS_ERR(device))
		ret = PTR_ERR(device);
	else
		*fs_devices_ret = device->fs_devices;

	btrfs_release_disk_super(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->fs_info->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

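/*
 * Returns 1 if a pending or pinned chunk overlaps the device hole beginning
 * at *start, advancing *start past the conflicting stripe so the caller can
 * retry its search; returns 0 otherwise.
 */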
static int contains_pending_extent(struct btrfs_transaction *transaction,
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct extent_map *em;
	struct list_head *search_list = &fs_info->pinned_chunks;
	int ret = 0;
	u64 physical_start = *start;

	if (transaction)
		search_list = &transaction->pending_chunks;
again:
	list_for_each_entry(em, search_list, list) {
		struct map_lookup *map;
		int i;

		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++) {
			u64 end;

			if (map->stripes[i].dev != device)
				continue;
			if (map->stripes[i].physical >= physical_start + len ||
			    map->stripes[i].physical + em->orig_block_len <=
			    physical_start)
				continue;
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
		}
	}
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
		goto again;
	}

	return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
	search_start = max_t(u64, search_start, SZ_1M);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
			if (contains_pending_extent(transaction, device,
						    &search_start,
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;

		if (contains_pending_extent(transaction, device, &search_start,
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
					  num_bytes, 0, start, len);
}

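/*
 * Find the dev extent item covering @start on @device, delete it from the
 * device tree and return the extent's length in @dev_extent_len.
 */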
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

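/*
 * Return the logical offset just past the end of the last mapped chunk,
 * i.e. the first address where a new chunk may start.
 */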
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

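/*
 * Compute the next available devid by finding the last dev item in the
 * chunk tree and adding one to its offset; an empty tree yields devid 1.
 */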
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
1758
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1759
			    struct btrfs_fs_info *fs_info,
1760
			    struct btrfs_device *device)
1761
{
1762
	struct btrfs_root *root = fs_info->chunk_root;
1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
Y
Yan Zheng 已提交
1776
	key.offset = device->devid;
1777 1778

	ret = btrfs_insert_empty_item(trans, root, path, &key,
1779
				      sizeof(*dev_item));
1780 1781 1782 1783 1784 1785 1786
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
Y
Yan Zheng 已提交
1787
	btrfs_set_device_generation(leaf, dev_item, 0);
1788 1789 1790 1791
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1792 1793 1794 1795
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Update the ctime/mtime of a given device path. This is mainly used by
 * tools that probe based on ctime/mtime, such as libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

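/*
 * Delete the dev item for @device from the chunk tree, in its own
 * transaction that is committed before returning.
 */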
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number to account for
 * e.g. an ongoing device replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_group[i]))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_mindev_error[i];

			if (ret)
				return ret;
		}
	}

	return 0;
}

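/* Pick any other device that is online and not missing. */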
static struct btrfs_device *btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
	struct btrfs_device *next_device;

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * (or this_dev) available.
 */
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
		struct btrfs_device *device, struct btrfs_device *this_dev)
{
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
								device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

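/*
 * Remove a device from the filesystem: shrink it to zero, delete its dev
 * item, unlink it from the in-memory device lists and zero out its
 * superblock copies.
 */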
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	num_devices = fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
	if (ret)
		goto out;

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(fs_info, device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(fs_info, device);

	/*
	 * The device list mutex makes sure that we don't change the device
	 * list while someone else is writing out all the device supers.
	 * Whoever is writing all supers should lock the device list mutex
	 * before getting the number of devices in the super block
	 * (super_copy). Conversely, whoever updates the number of devices
	 * in the super block (super_copy) should hold the device list mutex.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		device->fs_devices->missing_devices--;

	btrfs_assign_next_active_device(fs_info, device, NULL);

	if (device->bdev) {
		device->fs_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_rm_device_link(fs_devices, device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
	call_rcu(&device->rcu, free_device_rcu);

	if (cur_devices->open_devices == 0) {
		while (fs_devices) {
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
				break;
			}
			fs_devices = fs_devices->seed;
		}
		cur_devices->seed = NULL;
		close_fs_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}

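/*
 * Unlink the source device of a device replace from the in-memory device
 * lists and adjust the counters; device_list_mutex must be held.
 */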
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);

	/*
	 * In case of an fs with no seed, srcdev->fs_devices will point to
	 * the fs_devices of fs_info. However, when the device being replaced
	 * is a seed device, it will point to the seed's local fs_devices.
	 * In short, srcdev will have its correct fs_devices in both cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}

	btrfs_close_bdev(srcdev);
	call_rcu(&srcdev->rcu, free_device_rcu);

	/* If there are no devices left we'd rather delete the fs_devices */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target has been added to the sprout FS, so there will be no
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
}

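/*
 * Unlink the device replace target device from the device lists, scratch
 * its superblocks and release it.
 */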
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
	mutex_lock(&uuid_mutex);
	WARN_ON(!tgtdev);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);

	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);

	if (tgtdev->bdev)
		fs_info->fs_devices->open_devices--;

	fs_info->fs_devices->num_devices--;

	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);

	/*
	 * The update_dev_time() within btrfs_scratch_superblocks() may lead
	 * to a call to btrfs_show_devname() which will try to hold
	 * device_list_mutex. Here this device is already out of the device
	 * list, so we don't have to hold the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	call_rcu(&tgtdev->rcu, free_device_rcu);
}

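/*
 * Read the superblock from @device_path and look up the matching
 * btrfs_device by devid and uuid.
 */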
static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
				     const char *device_path,
				     struct btrfs_device **device)
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct buffer_head *bh;

	*device = NULL;
	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				    fs_info->bdev_holder, 0, &bdev, &bh);
	if (ret)
		return ret;
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
	brelse(bh);
	if (!*device)
		ret = -ENOENT;
	blkdev_put(bdev, FMODE_READ);
	return ret;
}

int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
					 const char *device_path,
					 struct btrfs_device **device)
{
	*device = NULL;
	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		devices = &fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held by the caller.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&tmp->dev_state) && !tmp->bdev) {
				*device = tmp;
				break;
			}
		}

		if (!*device)
			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;

		return 0;
	} else {
		return btrfs_find_device_by_path(fs_info, device_path, device);
	}
}

/*
 * Lookup a device given by device id, or the path if the id is 0.
 */
int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
				 const char *devpath,
				 struct btrfs_device **device)
{
	int ret;

	if (devid) {
		ret = 0;
		*device = btrfs_find_device(fs_info, devid, NULL, NULL);
		if (!*device)
			ret = -ENOENT;
	} else {
		if (!devpath || !devpath[0])
			return -EINVAL;

		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
							   device);
	}
	return ret;
}

/*
 * Does all the dirty work required for changing the filesystem's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = alloc_fs_devices(NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	mutex_unlock(&fs_info->chunk_mutex);

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * Store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

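/*
 * Add a new device to a mounted filesystem. If the mounted fs is a seed
 * fs, this also sprouts a new writable fs on top of it.
 */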
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	u64 tmp;
	int seeding_dev = 0;
	int ret = 0;
	bool unlocked = false;

	if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			goto error;
		}
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_device;
	}

	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		sb->s_flags &= ~SB_RDONLY;
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_info->fs_devices;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	fs_info->fs_devices->rw_devices++;
	fs_info->fs_devices->total_devices++;
	fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	if (!blk_queue_nonrot(q))
		fs_info->fs_devices->rotating = 1;

	tmp = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(tmp + device->total_bytes, fs_info->sectorsize));

	tmp = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);

	/* add sysfs device entry */
	btrfs_sysfs_add_device_link(fs_info->fs_devices, device);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans, fs_info);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, fs_info, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];

		ret = btrfs_finish_sprout(trans, fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * Sprouting would change the fsid of the mounted root,
		 * so rename the fsid on sysfs.
		 */
		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
						fs_info->fsid);
		if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
			btrfs_warn(fs_info,
				   "sysfs: failed to create fsid for sprout");
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		unlocked = true;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/* Update ctime/mtime for libblkid */
	update_dev_time(device_path);
	return ret;

error_sysfs:
	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
error_trans:
	if (seeding_dev)
		sb->s_flags |= SB_RDONLY;
	if (trans)
		btrfs_end_transaction(trans);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (seeding_dev && !unlocked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}

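/*
 * Write the current in-memory state of @device back to its dev item in
 * the chunk tree.
 */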
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->fs_info->chunk_root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

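/*
 * Grow @device to @new_size, updating the superblock totals and queueing
 * the device on the resized list so the new size gets written out at
 * commit time.
 */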
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_fs_devices *fs_devices;
	u64 old_total;
	u64 diff;

	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
	old_total = btrfs_super_total_bytes(super_copy);
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);

	if (new_size <= device->total_bytes ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	fs_devices = fs_info->fs_devices;

	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += diff;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(device->fs_info);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_devices->resized_devices);
	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}

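/*
 * Remove the entry for the chunk at @chunk_offset from the sys_chunk_array
 * in the in-memory copy of the superblock.
 */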
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}

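/*
 * Look up the extent map covering the chunk at [logical, logical + length)
 * and sanity check it against the requested range.
 */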
static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
					u64 logical, u64 length)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;

	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (em->start > logical || em->start + em->len < logical) {
		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
			   logical, length, em->start, em->start + em->len);
		free_extent_map(em);
		return ERR_PTR(-EINVAL);
	}

	/* callers are responsible for dropping em's ref. */
	return em;
}

int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, fs_info, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}

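/*
 * Relocate all extents in the block group at @chunk_offset and then remove
 * the now empty chunk.
 */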
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	ret = btrfs_can_relocate(fs_info, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/*
	 * We add the kobjects here (and after forcing data chunk creation)
	 * since relocation is the only place we'll create chunks of a new
	 * type at runtime.  The only place where we'll remove the last
	 * chunk of a type is the call immediately below this one.  Even
	 * so, we're protected against races with the cleaner thread since
	 * we're covered by the delete_unused_bgs_mutex.
	 */
	btrfs_add_raid_kobjects(fs_info);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}

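/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk, retrying
 * once for chunks that failed with ENOSPC on the first pass.
 */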
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * Return 1 : a data chunk was allocated successfully,
 * return <0: an error occurred while allocating a data chunk,
 * return 0 : no data chunk needed to be allocated.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
	struct btrfs_block_group_cache *cache;
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
		spin_lock(&fs_info->data_sinfo->lock);
		bytes_used = fs_info->data_sinfo->bytes_used;
		spin_unlock(&fs_info->data_sinfo->lock);

		if (!bytes_used) {
			struct btrfs_trans_handle *trans;
			int ret;

			trans = btrfs_join_transaction(fs_info->tree_root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = btrfs_force_chunk_alloc(trans, fs_info,
						      BTRFS_BLOCK_GROUP_DATA);
			btrfs_end_transaction(trans);
			if (ret < 0)
				return ret;

			btrfs_add_raid_kobjects(fs_info);

			return 1;
		}
	}
	return 0;
}

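/*
 * Persist the balance args as a balance item in the tree root, so that an
 * interrupted balance can be resumed after a remount.
 */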
static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

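/* Delete the balance item from the tree root once balance finishes or is
 * canceled. */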
static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
3149
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Should be called with both balance and volume mutexes held to
 * serialize other volume operations (add_dev/rm_dev/resize) with
 * restriper.  Same goes for unset_balance_control.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;

	BUG_ON(fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
}

static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}

static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->key.offset,
					bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->key.offset;
	else
		user_thresh_max = div_factor_fine(cache->key.offset,
					bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;
	else
		user_thresh = div_factor_fine(cache->key.offset,
					      bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
		factor = num_stripes / 2;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
		factor = num_stripes - 1;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
		factor = num_stripes - 2;
	} else {
		factor = num_stripes;
	}
I
	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		stripe_length = div_u64(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
			&& num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

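/*
 * Run a chunk through all configured balance filters; returns 1 if the
 * chunk should be relocated, 0 if it is filtered out.
 */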
static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

3477 3478 3479 3480 3481 3482
	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

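/*
 * The main balance loop: make some room on each device by briefly shrinking
 * and re-growing it, then walk the chunk tree backwards and relocate every
 * chunk that passes the configured filters. The first pass only counts the
 * matching chunks for the stat counters.
 */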
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct list_head *devices;
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
3525
	bool counting = true;
3526
	/* The single value limit and min/max limits use the same bytes in the */
3527 3528 3529
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
3530 3531 3532
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
3533
	int chunk_reserved = 0;
3534 3535

	/* step one, make some room on all the devices */
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		old_size = btrfs_device_get_total_bytes(device);
		size_to_free = div_factor(old_size, 1);
		size_to_free = min_t(u64, size_to_free, SZ_1M);
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
		    btrfs_device_get_total_bytes(device) -
		    btrfs_device_get_bytes_used(device) > size_to_free ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
		if (ret == -ENOSPC)
			break;
		if (ret) {
			/* btrfs_shrink_device never returns ret > 0 */
			WARN_ON(ret > 0);
			goto error;
		}

		trans = btrfs_start_transaction(dev_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		ret = btrfs_grow_device(trans, device, old_size);
		if (ret) {
			btrfs_end_transaction(trans);
			/* btrfs_grow_device never returns ret > 0 */
			WARN_ON(ret > 0);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}

		btrfs_end_transaction(trans);
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and the min/max limits use the same
		 * bytes in struct btrfs_balance_args, so restore the limits
		 * saved before the counting pass.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			break;
		}

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(fs_info, leaf, chunk,
					   found_key.offset);

		btrfs_release_path(path);
		if (!ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (counting) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto loop;
		}

		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so let's allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
			if (ret < 0) {
				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				goto error;
			} else if (ret == 1) {
				chunk_reserved = 1;
			}
		}

		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto error;
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		btrfs_info(fs_info, "%d enospc errors during balance",
			   enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}

/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

	/* true if exactly one bit set */
	return (flags & (flags - 1)) == 0;
}

static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
}

static void __cancel_balance(struct btrfs_fs_info *fs_info)
{
	int ret;

	unset_balance_control(fs_info);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/* Non-zero return value signifies invalidity */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
		u64 allowed)
{
	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		(!alloc_profile_is_valid(bctl_arg->target, 1) ||
		 (bctl_arg->target & ~allowed)));
}

/*
 * Should be called with both balance and volume mutexes held
 */
int btrfs_balance(struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    atomic_read(&fs_info->balance_cancel_req)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
				  "with mixed groups data and metadata balance options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

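	/*
	 * Compute the profiles allowed for the number of usable devices (one
	 * less while a device replace is running) and validate the requested
	 * conversion targets against them.
	 */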
	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		BUG_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
	if (num_devices > 1)
		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
	if (num_devices > 2)
		allowed |= BTRFS_BLOCK_GROUP_RAID5;
	if (num_devices > 3)
		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
			    BTRFS_BLOCK_GROUP_RAID6);
	if (validate_convert_profile(&bctl->data, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target data profile %llu",
			  bctl->data.target);
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->meta, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target metadata profile %llu",
			  bctl->meta.target);
		ret = -EINVAL;
		goto out;
	}
	if (validate_convert_profile(&bctl->sys, allowed)) {
		btrfs_err(fs_info,
			  "unable to start balance with target system profile %llu",
			  bctl->sys.target);
		ret = -EINVAL;
		goto out;
	}

	/* allow to reduce meta or sys integrity only if force set */
	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
			BTRFS_BLOCK_GROUP_RAID10 |
			BTRFS_BLOCK_GROUP_RAID5 |
			BTRFS_BLOCK_GROUP_RAID6;
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed))) {
			if (bctl->flags & BTRFS_BALANCE_FORCE) {
				btrfs_info(fs_info,
					   "force reducing metadata integrity");
			} else {
				btrfs_err(fs_info,
					  "balance will reduce metadata integrity, use force if you want this");
				ret = -EINVAL;
				goto out;
			}
		}
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	/* if we're not converting, the target field is uninitialized */
	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->meta.target : fs_info->avail_metadata_alloc_bits;
	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->data.target : fs_info->avail_data_alloc_bits;
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
		btrfs_warn(fs_info,
			   "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
			   meta_target, data_target);
	}

	ret = insert_balance_item(fs_info, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		set_balance_control(bctl);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	atomic_inc(&fs_info->balance_running);
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	atomic_dec(&fs_info->balance_running);

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		update_ioctl_balance_args(fs_info, 0, bargs);
	}

	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		__cancel_balance(fs_info);
		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		__cancel_balance(fs_info);
	else
		kfree(bctl);
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);

	return ret;
}

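/*
 * Kthread entry point used to continue a previously interrupted balance in
 * the background; started from btrfs_resume_balance_async().
 */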
static int balance_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	if (fs_info->balance_ctl) {
		btrfs_info(fs_info, "continuing balance");
		ret = btrfs_balance(fs_info->balance_ctl, NULL);
	}

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);

	return ret;
}

int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	spin_lock(&fs_info->balance_lock);
	if (!fs_info->balance_ctl) {
		spin_unlock(&fs_info->balance_lock);
		return 0;
	}
	spin_unlock(&fs_info->balance_lock);

	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
		btrfs_info(fs_info, "force skipping balance");
		return 0;
	}

	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	return PTR_ERR_OR_ZERO(tsk);
}

int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->fs_info = fs_info;
	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops colliding.
	 *
	 * This gives the exclusive op status to balance and keeps it in the
	 * paused state until user intervention (cancel or umount). If the
	 * ownership cannot be assigned, show a message but do not fail. The
	 * balance is in a paused state and must have fs_info::balance_ctl
	 * properly set up.
	 */
	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
		btrfs_warn(fs_info,
	"cannot set exclusive op status to balance, resume manually");

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	set_balance_control(bctl);

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (atomic_read(&fs_info->balance_running)) {
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(atomic_read(&fs_info->balance_running));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	if (sb_rdonly(fs_info->sb))
		return -EROFS;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (atomic_read(&fs_info->balance_running)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);
		mutex_lock(&fs_info->balance_mutex);
	} else {
		/* __cancel_balance needs volume_mutex */
		mutex_unlock(&fs_info->balance_mutex);
		mutex_lock(&fs_info->volume_mutex);
		mutex_lock(&fs_info->balance_mutex);

		if (fs_info->balance_ctl) {
			__cancel_balance(fs_info);
			clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
		}

		mutex_unlock(&fs_info->volume_mutex);
	}

	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

static int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
	struct btrfs_trans_handle *trans = NULL;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &key, path,
				BTRFS_OLDEST_GENERATION);
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}

		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
			continue;
		} else {
			goto skip;
		}
update_tree:
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
			ret = btrfs_uuid_tree_add(trans, fs_info,
						  root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
			ret = btrfs_uuid_tree_add(trans, fs_info,
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
					ret);
				break;
			}
		}

skip:
		if (trans) {
			ret = btrfs_end_transaction(trans);
			trans = NULL;
			if (ret)
				break;
		}

		btrfs_release_path(path);
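		/* advance the search key: offset first, then type, then objectid */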
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (ret)
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
	else
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}

/*
 * Callback for btrfs_uuid_tree_iterate().
 * returns:
 * 0	check succeeded, the entry is not outdated.
 * < 0	if an error occurred.
 * > 0	if the check failed, which means the caller shall remove the entry.
 */
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				       u8 *uuid, u8 type, u64 subid)
{
	struct btrfs_key key;
	int ret = 0;
	struct btrfs_root *subvol_root;

	if (type != BTRFS_UUID_KEY_SUBVOL &&
	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
		goto out;

	key.objectid = subid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(subvol_root)) {
		ret = PTR_ERR(subvol_root);
		if (ret == -ENOENT)
			ret = 1;
		goto out;
	}

	switch (type) {
	case BTRFS_UUID_KEY_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
			ret = 1;
		break;
	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
		if (memcmp(uuid, subvol_root->root_item.received_uuid,
			   BTRFS_UUID_SIZE))
			ret = 1;
		break;
	}

out:
	return ret;
}

static int btrfs_uuid_rescan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
	int ret;

	/*
	 * 1st step is to iterate through the existing UUID tree and
	 * to delete all entries that contain outdated data.
	 * 2nd step is to add all missing entries to the UUID tree.
	 */
	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
	if (ret < 0) {
		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
		up(&fs_info->uuid_tree_rescan_sem);
		return ret;
	}
	return btrfs_uuid_scan_kthread(data);
}

int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *uuid_root;
	struct task_struct *task;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	uuid_root = btrfs_create_tree(trans, fs_info,
				      BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(uuid_root)) {
		ret = PTR_ERR(uuid_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = uuid_root;

	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_scan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
		btrfs_warn(fs_info, "failed to start uuid_rescan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	bool checked_pending_chunks = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;

	new_size = round_down(new_size, fs_info->sectorsize);
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;

	mutex_lock(&fs_info->chunk_mutex);

	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}
	mutex_unlock(&fs_info->chunk_mutex);

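	/*
	 * Walk the device extents from the end of the device downwards and
	 * relocate every chunk lying past the new size; extents that fail
	 * with ENOSPC are retried in one more full pass.
	 */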
again:
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto done;
		if (ret) {
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so let's allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto done;
		if (ret == -ENOSPC)
			failed++;
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * We checked in the above loop all device extents that were already in
	 * the device tree. However before we have updated the device's
	 * total_bytes to the new size, we might have had chunk allocations that
	 * have not completed yet (new block groups attached to transaction
	 * handles), and therefore their device extents were not yet in the
	 * device tree and we missed them in the loop above. So if we have any
	 * pending chunk using a device extent that overlaps the device range
	 * that we can not use anymore, commit the current transaction and
	 * repeat the search on the device tree - this way we guarantee we will
	 * not have chunks using device extents that end beyond 'new_size'.
	 */
	if (!checked_pending_chunks) {
		u64 start = new_size;
		u64 len = old_size - new_size;

		if (contains_pending_extent(trans->transaction, device,
					    &start, len)) {
			mutex_unlock(&fs_info->chunk_mutex);
			checked_pending_chunks = true;
			failed = 0;
			retried = false;
			ret = btrfs_commit_transaction(trans);
			if (ret)
				goto done;
			goto again;
		}
	}

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_info->fs_devices->resized_devices);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_end_transaction(trans);
done:
	btrfs_free_path(path);
	if (ret) {
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
			device->fs_devices->total_rw_bytes += diff;
		atomic64_add(diff, &fs_info->free_chunk_space);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

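/*
 * Append the given key and chunk item to the in-memory copy of the
 * superblock's sys_chunk_array, growing its recorded size accordingly.
 */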
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size + sizeof(disk_key)
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EFBIG;
	}

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
	mutex_unlock(&fs_info->chunk_mutex);

	return 0;
}

/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;

	if (di_a->max_avail > di_b->max_avail)
		return -1;
	if (di_a->max_avail < di_b->max_avail)
		return 1;
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
}

static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
		return;

	btrfs_set_fs_incompat(info, RAID56);
}

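/*
 * Upper bounds on the number of stripes in one chunk: limited by how many
 * btrfs_stripe entries fit in a chunk item in a tree leaf and, for SYSTEM
 * chunks, by the size of the superblock's system chunk array.
 */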
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)	\
			- sizeof(struct btrfs_chunk))		\
			/ sizeof(struct btrfs_stripe) + 1)

#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
				- 2 * sizeof(struct btrfs_disk_key)	\
				- 2 * sizeof(struct btrfs_chunk))	\
				/ sizeof(struct btrfs_stripe) + 1)

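/*
 * Core chunk allocator: pick the stripe geometry for @type from
 * btrfs_raid_array, gather the devices with the largest free extents, and
 * create the chunk mapping, the block group and the pending extent map.
 */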
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device *device;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_device_info *devices_info = NULL;
	u64 total_avail;
	int num_stripes;	/* total number of stripes to allocate */
	int data_stripes;	/* number of stripes that count for
				   block group size */
	int sub_stripes;	/* sub_stripes info for map */
	int dev_stripes;	/* stripes per dev */
	int devs_max;		/* max devs to use */
	int devs_min;		/* min devs needed */
	int devs_increment;	/* ndevs has to be a multiple of this */
	int ncopies;		/* how many copies of the data */
	int ret;
	u64 max_stripe_size;
	u64 max_chunk_size;
	u64 stripe_size;
	u64 num_bytes;
	int ndevs;
	int i;
	int j;
	int index;

	BUG_ON(!alloc_profile_is_valid(type, 0));

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return -ENOSPC;
	}

	index = btrfs_bg_flags_to_raid_index(type);

	sub_stripes = btrfs_raid_array[index].sub_stripes;
	dev_stripes = btrfs_raid_array[index].dev_stripes;
	devs_max = btrfs_raid_array[index].devs_max;
	devs_min = btrfs_raid_array[index].devs_min;
	devs_increment = btrfs_raid_array[index].devs_increment;
	ncopies = btrfs_raid_array[index].ncopies;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_stripe_size = SZ_1G;
		max_chunk_size = 10 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* for larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			max_stripe_size = SZ_1G;
		else
			max_stripe_size = SZ_256M;
		max_chunk_size = max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info);
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		max_stripe_size = SZ_32M;
		max_chunk_size = 2 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
	} else {
		btrfs_err(info, "invalid chunk type 0x%llx requested",
		       type);
		BUG_ON(1);
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return -ENOMEM;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	ndevs = 0;
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 max_avail;
		u64 dev_offset;

		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail == 0)
			continue;

		ret = find_free_dev_extent(trans, device,
					   max_stripe_size * dev_stripes,
					   &dev_offset, &max_avail);
		if (ret && ret != -ENOSPC)
			goto error;

		if (ret == 0)
			max_avail = max_stripe_size * dev_stripes;

		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%u",
					    __func__, device->devid, max_avail,
					    BTRFS_STRIPE_LEN * dev_stripes);
			continue;
		}

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	/* round down to number of usable stripes */
	ndevs = round_down(ndevs, devs_increment);

	if (ndevs < devs_min) {
		ret = -ENOSPC;
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ndevs, devs_min);
		}
		goto error;
	}

	ndevs = min(ndevs, devs_max);

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, the
	 * max_avail is the total size so we have to adjust.
	 */
	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
	num_stripes = ndevs * dev_stripes;

	/*
	 * this will have to be fixed for RAID1 and RAID10 over
	 * more drives
	 */
	data_stripes = num_stripes / ncopies;

	if (type & BTRFS_BLOCK_GROUP_RAID5)
		data_stripes = num_stripes - 1;

	if (type & BTRFS_BLOCK_GROUP_RAID6)
		data_stripes = num_stripes - 2;

	/*
	 * Use the number of data stripes to figure out how big this chunk
	 * is really going to be in terms of logical address space,
	 * and compare that answer with the max chunk size
	 */
	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = div_u64(max_chunk_size, data_stripes);

		/* bump the answer up to a 16MB boundary */
		stripe_size = round_up(stripe_size, SZ_16M);

		/*
		 * But don't go higher than the limits we found while searching
		 * for free extents
		 */
		stripe_size = min(devices_info[ndevs - 1].max_avail,
				  stripe_size);
	}

	/* align to BTRFS_STRIPE_LEN */
	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		ret = -ENOMEM;
		goto error;
	}
	map->num_stripes = num_stripes;

	for (i = 0; i < ndevs; ++i) {
		for (j = 0; j < dev_stripes; ++j) {
			int s = i * dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * stripe_size;
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = sub_stripes;

	num_bytes = stripe_size * data_stripes;

	trace_btrfs_chunk_alloc(info, map, start, num_bytes);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		ret = -ENOMEM;
		goto error;
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = num_bytes;
	em->block_start = 0;
	em->block_len = em->len;
	em->orig_block_len = stripe_size;

	em_tree = &info->mapping_tree.map_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		free_extent_map(em);
		goto error;
	}

	list_add_tail(&em->list, &trans->transaction->pending_chunks);
	refcount_inc(&em->refs);
	write_unlock(&em_tree->lock);

	ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
	if (ret)
		goto error_del_extent;

	for (i = 0; i < map->num_stripes; i++) {
		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
	}

	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);

	free_extent_map(em);
	check_raid56_incompat_flag(info, type);

	kfree(devices_info);
	return 0;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);
	/* One for the pending_chunks list reference */
	free_extent_map(em);
error:
	kfree(devices_info);
	return ret;
}

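/*
 * Second phase of chunk allocation: write out the dev extents and the chunk
 * item for a previously created chunk, and for SYSTEM chunks also append the
 * item to the superblock's sys_chunk_array.
 */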
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				u64 chunk_offset, u64 chunk_size)
{
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_device *device;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	u64 dev_offset;
	u64 stripe_size;
	int i = 0;
	int ret = 0;

	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);
	stripe_size = em->orig_block_len;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with the map's stripes, because the device object's id can change
	 * at any time during that final phase of the device replace operation
	 * (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		ret = btrfs_update_device(trans, device);
		if (ret)
			break;
		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
					     dev_offset, stripe_size);
		if (ret)
			break;
	}
	if (ret) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		goto out;
	}

	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		/*
		 * TODO: Cleanup of inserted chunk root in case of
		 * failure.
		 */
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}

/*
 * Chunk allocation falls into two parts. The first part does the work
 * that makes the newly allocated chunk usable, but does not do any
 * operation that modifies the chunk tree. The second part does the work
 * that requires modifying the chunk tree. This division is important for
 * the bootstrap process of adding storage to a seed btrfs.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info, u64 type)
{
	u64 chunk_offset;

	lockdep_assert_held(&fs_info->chunk_mutex);
	chunk_offset = find_next_chunk(fs_info);
	return __btrfs_alloc_chunk(trans, chunk_offset, type);
}

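/*
 * Create the initial metadata and system chunks on a fresh set of writable
 * devices, the bootstrap step described above for seeding.
 */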
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 alloc_profile;
	int ret;

	chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
	if (ret)
		return ret;

	sys_chunk_offset = find_next_chunk(fs_info);
	alloc_profile = btrfs_system_alloc_profile(fs_info);
	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
	return ret;
}

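/*
 * Number of whole device failures that a chunk with the given profile can
 * tolerate before data is lost.
 */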
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
	int max_errors;

	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
			 BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID5 |
			 BTRFS_BLOCK_GROUP_DUP)) {
		max_errors = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
		max_errors = 2;
	} else {
		max_errors = 0;
	}

	return max_errors;
}

int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	int readonly = 0;
	int miss_ndevs = 0;
	int i;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return 1;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; i++) {
		if (test_bit(BTRFS_DEV_STATE_MISSING,
					&map->stripes[i].dev->dev_state)) {
			miss_ndevs++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&map->stripes[i].dev->dev_state)) {
			readonly = 1;
			goto end;
		}
	}

	/*
	 * If the number of missing devices is larger than max errors,
	 * we can not write the data into that chunk successfully, so
	 * set it readonly.
	 */
	if (miss_ndevs > btrfs_chunk_max_errors(map))
		readonly = 1;
end:
	free_extent_map(em);
	return readonly;
}

void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
	extent_map_tree_init(&tree->map_tree);
}

void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
	struct extent_map *em;

	while (1) {
		write_lock(&tree->map_tree.lock);
		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(&tree->map_tree, em);
		write_unlock(&tree->map_tree.lock);
		if (!em)
			break;
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret;

	em = get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing, which is just not
		 * doing anything else and exiting, so return 1 so the callers
		 * don't try to use other copies.
		 */
		return 1;

	map = em->map_lookup;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		/*
		 * There could be two corrupted data stripes, we need
		 * to loop retry in order to rebuild the correct data.
		 *
		 * Fail a stripe at a time on every retry except the
		 * stripe under reconstruction.
		 */
		ret = map->num_stripes;
	else
		ret = 1;
	free_extent_map(em);

	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
	    fs_info->dev_replace.tgtdev)
		ret++;
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	return ret;
}

unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
	unsigned long len = fs_info->sectorsize;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = map->stripe_len * nr_data_stripes(map);
		free_extent_map(em);
	}
	return len;
}

int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int ret = 0;

	em = get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		free_extent_map(em);
	}
	return ret;
}

static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first,
			    int dev_replace_is_ongoing)
{
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	preferred_mirror = first + current->pid % num_stripes;

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}

static inline int parity_smaller(u64 a, u64 b)
{
	return a > b;
}

/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
	struct btrfs_bio_stripe s;
	int i;
	u64 l;
	int again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < num_stripes - 1; i++) {
			if (parity_smaller(bbio->raid_map[i],
					   bbio->raid_map[i+1])) {
				s = bbio->stripes[i];
				l = bbio->raid_map[i];
				bbio->stripes[i] = bbio->stripes[i+1];
				bbio->raid_map[i] = bbio->raid_map[i+1];
				bbio->stripes[i+1] = s;
				bbio->raid_map[i+1] = l;
				again = 1;
			}
		}
	}
}

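/*
 * A btrfs_bio is allocated as a single block of memory: the struct itself,
 * then the stripe array, then the tgtdev_map indexes, then the raid_map.
 * __btrfs_map_block() later points bbio->tgtdev_map and bbio->raid_map
 * into this same allocation.
 */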
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
	struct btrfs_bio *bbio = kzalloc(
		 /* the size of the btrfs_bio */
		sizeof(struct btrfs_bio) +
		/* plus the variable array for the stripes */
		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
		/* plus the variable array for the tgt dev */
		sizeof(int) * (real_stripes) +
		/*
		 * plus the raid_map, which includes both the tgt dev
		 * and the stripes
		 */
		sizeof(u64) * (total_stripes),
		GFP_NOFS|__GFP_NOFAIL);

	atomic_set(&bbio->error, 0);
	refcount_set(&bbio->refs, 1);

	return bbio;
}

void btrfs_get_bbio(struct btrfs_bio *bbio)
{
	WARN_ON(!refcount_read(&bbio->refs));
	refcount_inc(&bbio->refs);
}

void btrfs_put_bbio(struct btrfs_bio *bbio)
{
	if (!bbio)
		return;
	if (refcount_dec_and_test(&bbio->refs))
		kfree(bbio);
}

/* Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
 * Note that a discard is never sent to the target device of a running
 * device replace.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 struct btrfs_bio **bbio_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_bio *bbio;
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u64 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret = 0;
	int i;

	/* discard always returns a bbio */
	ASSERT(bbio_ret);

	em = get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	offset = logical - em->start;
	length = min_t(u64, em->len - offset, length);

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;

	stripe_nr_end = round_up(offset + length, map->stripe_len);
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = map->num_stripes;
	} else {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	bbio = alloc_btrfs_bio(num_stripes, 0);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + stripe_nr * map->stripe_len;
		bbio->stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			bbio->stripes[i].length = stripes_per_dev *
				map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				bbio->stripes[i].length +=
					map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				bbio->stripes[i].length -=
					stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				bbio->stripes[i].length -=
					stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			bbio->stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}

/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_bio *bbio = NULL;
	int num_stripes;
	int index_srcdev = 0;
	int found = 0;
	u64 physical_of_found = 0;
	int i;
	int ret = 0;

	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bbio, 0, 0);
	if (ret) {
		ASSERT(bbio == NULL);
		return ret;
	}

	num_stripes = bbio->num_stripes;
	if (*mirror_num > num_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < num_stripes; i++) {
		if (bbio->stripes[i].dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (found &&
		    physical_of_found <= bbio->stripes[i].physical)
			continue;

		index_srcdev = i;
		found = 1;
		physical_of_found = bbio->stripes[i].physical;
	}

	btrfs_put_bbio(bbio);

	ASSERT(found);
	if (!found)
		return -EIO;

	*mirror_num = index_srcdev + 1;
	*physical = physical_of_found;
	return ret;
}

static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_bio **bbio_ret,
				      struct btrfs_dev_replace *dev_replace,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_bio *bbio = *bbio_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int tgtdev_indexes = 0;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int index_where_to_add;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_bio_stripe *new =
					bbio->stripes + index_where_to_add;
				struct btrfs_bio_stripe *old =
					bbio->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				bbio->tgtdev_map[i] = index_where_to_add;
				index_where_to_add++;
				max_errors++;
				tgtdev_indexes++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it simple,
				 * only add the mirror with the lowest physical
				 * address
				 */
				if (found &&
				    physical_of_found <=
				     bbio->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bbio->stripes[i].physical;
			}
		}
		if (found) {
			struct btrfs_bio_stripe *tgtdev_stripe =
				bbio->stripes + num_stripes;

			tgtdev_stripe->physical = physical_of_found;
			tgtdev_stripe->length =
				bbio->stripes[index_srcdev].length;
			tgtdev_stripe->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[index_srcdev] = num_stripes;

			tgtdev_indexes++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bbio->num_tgtdevs = tgtdev_indexes;
	*bbio_ret = bbio;
}

static bool need_full_stripe(enum btrfs_map_op op)
{
	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}

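/*
 * Map a logical range to the physical stripes that hold it.
 *
 * On success *bbio_ret describes one stripe per device that must be read
 * or written, and *length is trimmed so the range never straddles a
 * stripe boundary.  For example, on RAID0 with 64K stripes, a read at
 * offset 100K into the chunk maps to stripe_nr 1, stripe_offset 36K,
 * and *length is capped at 28K.  For RAID5/6 a raid_map holding the
 * logical address of each stripe is built when @need_raid_map is set,
 * so the caller can locate the parity stripes.
 */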
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u32 stripe_index;
	int i;
	int ret = 0;
	int num_stripes;
	int max_errors = 0;
	int tgtdev_indexes = 0;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	int num_alloc_stripes;
	int patch_the_first_stripe_for_dev_replace = 0;
	u64 physical_to_patch_in_first_stripe = 0;
	u64 raid56_full_stripe_start = (u64)-1;

	if (op == BTRFS_MAP_DISCARD)
		return __btrfs_map_block_for_discard(fs_info, logical,
						     *length, bbio_ret);

	em = get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	offset = logical - em->start;

	stripe_len = map->stripe_len;
	stripe_nr = offset;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(stripe_nr, stripe_len);

	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
			   "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
			   stripe_offset, offset, em->start, logical,
			   stripe_len);
		free_extent_map(em);
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;

	/* if we're here for raid56, we need to know the stripe aligned start */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
		raid56_full_stripe_start = offset;

		/* allow a write of a full stripe, but make sure we don't
		 * allow straddling of stripes
		 */
		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
				full_stripe_len);
		raid56_full_stripe_start *= full_stripe_len;
	}

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len;
		/*
		 * For writes to RAID[56], allow a full stripeset across all
		 * disks.  For other RAID types and for RAID[56] reads, just
		 * allow a single stripe (on a single disk).
		 */
		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
		    (op == BTRFS_MAP_WRITE)) {
			max_len = stripe_len * nr_data_stripes(map) -
				(offset - raid56_full_stripe_start);
		} else {
			/* we limit the length of each bio to what fits in a stripe */
			max_len = stripe_len - stripe_offset;
		}
		*length = min_t(u64, em->len - offset, max_len);
	} else {
		*length = em->len - offset;
	}

	/*
	 * This is for when we're called from btrfs_merge_bio_hook() and all
	 * it cares about is the length.
	 */
	if (!bbio_ret)
		goto out;

	btrfs_dev_replace_read_lock(dev_replace);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (!dev_replace_is_ongoing)
		btrfs_dev_replace_read_unlock(dev_replace);
	else
		btrfs_dev_replace_set_lock_blocking(dev_replace);

	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
						    dev_replace->srcdev->devid,
						    &mirror_num,
					    &physical_to_patch_in_first_stripe);
		if (ret)
			goto out;
		else
			patch_the_first_stripe_for_dev_replace = 1;
	} else if (mirror_num > map->num_stripes) {
		mirror_num = 0;
	}

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		if (!need_full_stripe(op))
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		if (need_full_stripe(op))
			num_stripes = map->num_stripes;
		else if (mirror_num)
			stripe_index = mirror_num - 1;
		else {
			stripe_index = find_live_mirror(fs_info, map, 0,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (need_full_stripe(op)) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= map->sub_stripes;

		if (need_full_stripe(op))
			num_stripes = map->sub_stripes;
		else if (mirror_num)
			stripe_index += mirror_num - 1;
		else {
			int old_stripe_index = stripe_index;
			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      dev_replace_is_ongoing);
			mirror_num = stripe_index - old_stripe_index + 1;
		}

	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
			/* push stripe_nr back to the start of the full stripe */
			stripe_nr = div64_u64(raid56_full_stripe_start,
					stripe_len * nr_data_stripes(map));

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = nr_parity_stripes(map);

			*length = map->stripe_len;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_nr = div_u64_rem(stripe_nr,
					nr_data_stripes(map), &stripe_index);
			if (mirror_num > 1)
				stripe_index = nr_data_stripes(map) +
						mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
					&stripe_index);
			if (!need_full_stripe(op) && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * after this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		mirror_num = stripe_index + 1;
	}
	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
		if (op == BTRFS_MAP_WRITE)
			num_alloc_stripes <<= 1;
		if (op == BTRFS_MAP_GET_READ_MIRRORS)
			num_alloc_stripes++;
		tgtdev_indexes = num_stripes;
	}

	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);

	/* build raid_map */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
		u64 tmp;
		unsigned rot;

		bbio->raid_map = (u64 *)((void *)bbio->stripes +
				 sizeof(struct btrfs_bio_stripe) *
				 num_alloc_stripes +
				 sizeof(int) * tgtdev_indexes);

		/* Work out the disk rotation on this stripe-set */
		div_u64_rem(stripe_nr, num_stripes, &rot);

		/* Fill in the logical address of each stripe */
		tmp = stripe_nr * nr_data_stripes(map);
		for (i = 0; i < nr_data_stripes(map); i++)
			bbio->raid_map[(i+rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bbio->raid_map[(i+rot+1) % num_stripes] =
				RAID6_Q_STRIPE;
	}

	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset +
			stripe_nr * map->stripe_len;
		bbio->stripes[i].dev =
			map->stripes[stripe_index].dev;
		stripe_index++;
	}

	if (need_full_stripe(op))
		max_errors = btrfs_chunk_max_errors(map);

	if (bbio->raid_map)
		sort_parity_stripes(bbio, num_stripes);

	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    need_full_stripe(op)) {
		handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
					  &max_errors);
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
out:
	if (dev_replace_is_ongoing) {
		btrfs_dev_replace_clear_lock_blocking(dev_replace);
		btrfs_dev_replace_read_unlock(dev_replace);
	}
	free_extent_map(em);
	return ret;
}

int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		      u64 logical, u64 *length,
		      struct btrfs_bio **bbio_ret, int mirror_num)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				 mirror_num, 0);
}

/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		     u64 logical, u64 *length,
		     struct btrfs_bio **bbio_ret)
{
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}

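/*
 * Reverse mapping: for a @physical offset on device @devid inside the
 * chunk at @chunk_start, collect every logical address that maps to it.
 * The caller takes ownership of the returned *logical array.
 */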
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	u64 rmap_len;
	int i, j, nr = 0;

	em = get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	length = em->len;
	rmap_len = map->stripe_len;

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		length = div_u64(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		length = div_u64(length, map->num_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		length = div_u64(length, nr_data_stripes(map));
		rmap_len = map->stripe_len * nr_data_stripes(map);
	}

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	BUG_ON(!buf); /* -ENOMEM */

	for (i = 0; i < map->num_stripes; i++) {
		if (devid && map->stripes[i].dev->devid != devid)
			continue;
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		} /* else if RAID[56], multiply by nr_data_stripes().
		   * Alternatively, just use rmap_len below instead of
		   * map->stripe_len */

		bytenr = chunk_start + stripe_nr * rmap_len;
		WARN_ON(nr >= map->num_stripes);
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = rmap_len;

	free_extent_map(em);
	return 0;
}

static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
	bio->bi_private = bbio->private;
	bio->bi_end_io = bbio->end_io;
	bio_endio(bio);

	btrfs_put_bbio(bbio);
}

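/*
 * Completion callback for every cloned stripe bio.  Per-device error
 * statistics are updated here; once the last in-flight stripe finishes,
 * the original bio is ended with an error only if more stripes failed
 * than bbio->max_errors allows.
 */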
static void btrfs_end_bio(struct bio *bio)
{
	struct btrfs_bio *bbio = bio->bi_private;
	int is_orig_bio = 0;

	if (bio->bi_status) {
		atomic_inc(&bbio->error);
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
			unsigned int stripe_index =
				btrfs_io_bio(bio)->stripe_index;
			struct btrfs_device *dev;

			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
			if (dev->bdev) {
				if (bio_op(bio) == REQ_OP_WRITE)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_WRITE_ERRS);
				else
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_READ_ERRS);
				if (bio->bi_opf & REQ_PREFLUSH)
					btrfs_dev_stat_inc_and_print(dev,
						BTRFS_DEV_STAT_FLUSH_ERRS);
			}
		}
	}

	if (bio == bbio->orig_bio)
		is_orig_bio = 1;

	btrfs_bio_counter_dec(bbio->fs_info);

	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = bbio->orig_bio;
		}

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		/* only send an error to the higher layers if it is
		 * beyond the tolerance of the btrfs bio
		 */
		if (atomic_read(&bbio->error) > bbio->max_errors) {
			bio->bi_status = BLK_STS_IOERR;
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			bio->bi_status = BLK_STS_OK;
		}

		btrfs_end_bbio(bbio, bio);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}

/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
static noinline void btrfs_schedule_bio(struct btrfs_device *device,
					struct bio *bio)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	int should_queue = 1;
	struct btrfs_pending_bios *pending_bios;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
	    !device->bdev) {
		bio_io_error(bio);
		return;
	}

	/* don't bother with additional async steps for reads, right now */
	if (bio_op(bio) == REQ_OP_READ) {
		btrfsic_submit_bio(bio);
		return;
	}

	WARN_ON(bio->bi_next);
	bio->bi_next = NULL;

	spin_lock(&device->io_lock);
	if (op_is_sync(bio->bi_opf))
		pending_bios = &device->pending_sync_bios;
	else
		pending_bios = &device->pending_bios;

	if (pending_bios->tail)
		pending_bios->tail->bi_next = bio;

	pending_bios->tail = bio;
	if (!pending_bios->head)
		pending_bios->head = bio;
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
		btrfs_queue_work(fs_info->submit_workers, &device->work);
}

static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
			      u64 physical, int dev_nr, int async)
{
	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	bio->bi_private = bbio;
	btrfs_io_bio(bio)->stripe_index = dev_nr;
	bio->bi_end_io = btrfs_end_bio;
	bio->bi_iter.bi_sector = physical >> 9;
#ifdef DEBUG
	{
		struct rcu_string *name;

		rcu_read_lock();
		name = rcu_dereference(dev->name);
		btrfs_debug(fs_info,
			"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
			bio_op(bio), bio->bi_opf,
			(u64)bio->bi_iter.bi_sector,
			(u_long)dev->bdev->bd_dev, name->str, dev->devid,
			bio->bi_iter.bi_size);
		rcu_read_unlock();
	}
#endif
	bio_set_dev(bio, dev->bdev);

	btrfs_bio_counter_inc_noblocked(fs_info);

	if (async)
		btrfs_schedule_bio(dev, bio);
	else
		btrfsic_submit_bio(bio);
}

static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		/* Should be the original bio. */
		WARN_ON(bio != bbio->orig_bio);

		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
		bio->bi_iter.bi_sector = logical >> 9;
		if (atomic_read(&bbio->error) > bbio->max_errors)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_status = BLK_STS_OK;
		btrfs_end_bbio(bbio, bio);
	}
}

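/*
 * Entry point of the volume layer: map the bio's logical range and submit
 * one (cloned) bio per stripe.  RAID5/6 full-stripe writes and repair
 * reads are handed off to the raid56 code instead of being submitted
 * directly.
 */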
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
			   int mirror_num, int async_submit)
{
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	int dev_nr;
	int total_devs;
	struct btrfs_bio *bbio = NULL;

	length = bio->bi_iter.bi_size;
	map_length = length;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				&map_length, &bbio, mirror_num, 1);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
	bbio->fs_info = fs_info;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
	    ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
		/*
		 * In this case, map_length has been set to the length of
		 * a single stripe; not the whole write.
		 */
		if (bio_op(bio) == REQ_OP_WRITE) {
			ret = raid56_parity_write(fs_info, bio, bbio,
						  map_length);
		} else {
			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
		}

		btrfs_bio_counter_dec(fs_info);
		return errno_to_blk_status(ret);
	}

	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
		dev = bbio->stripes[dev_nr].dev;
		if (!dev || !dev->bdev ||
		    (bio_op(first_bio) == REQ_OP_WRITE &&
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
			bbio_error(bbio, first_bio, logical);
			continue;
		}

		if (dev_nr < total_devs - 1)
			bio = btrfs_bio_clone(first_bio);
		else
			bio = first_bio;

		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				  dev_nr, async_submit);
	}
	btrfs_bio_counter_dec(fs_info);
	return BLK_STS_OK;
}

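/*
 * Look up a device by devid, and optionally by uuid and fsid, searching
 * the mounted filesystem's devices and all of its seed filesystems.
 */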
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;

	cur_devices = fs_info->fs_devices;
	while (cur_devices) {
		if (!fsid ||
		    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
			device = find_device(cur_devices, devid, uuid);
			if (device)
				return device;
		}
		cur_devices = cur_devices->seed;
	}
	return NULL;
}

static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;

	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}

/**
 * btrfs_alloc_device - allocate struct btrfs_device
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and must be
 * destroyed with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = __alloc_device();
	if (IS_ERR(dev))
		return dev;

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	btrfs_init_work(&dev->work, btrfs_submit_helper,
			pending_bios_fn, NULL, NULL);

	return dev;
}

/* Return -EIO if any error, otherwise return 0. */
static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf,
				   struct btrfs_chunk *chunk, u64 logical)
{
	u64 length;
	u64 stripe_len;
	u16 num_stripes;
	u16 sub_stripes;
	u64 type;

	length = btrfs_chunk_length(leaf, chunk);
	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);

	if (!num_stripes) {
		btrfs_err(fs_info, "invalid chunk num_stripes: %u",
			  num_stripes);
		return -EIO;
	}
	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk logical %llu", logical);
		return -EIO;
	}
	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
		btrfs_err(fs_info, "invalid chunk sectorsize %u",
			  btrfs_chunk_sector_size(leaf, chunk));
		return -EIO;
	}
	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk length %llu", length);
		return -EIO;
	}
	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
		btrfs_err(fs_info, "invalid chunk stripe length: %llu",
			  stripe_len);
		return -EIO;
	}
	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
	    type) {
		btrfs_err(fs_info, "unrecognized chunk type: %llu",
			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
			  btrfs_chunk_type(leaf, chunk));
		return -EIO;
	}
	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
	     num_stripes != 1)) {
		btrfs_err(fs_info,
			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
			num_stripes, sub_stripes,
			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
		return -EIO;
	}

	return 0;
}

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}

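/*
 * Turn an on-disk chunk item into an in-memory extent_map/map_lookup pair
 * and insert it into the mapping tree.  Stripes on devices that cannot be
 * found fail the mount unless it is degraded, in which case a "missing"
 * device stub is created.
 */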
static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
	if (ret)
		return ret;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
	read_unlock(&map_tree->map_tree.lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(fs_info, devid,
							uuid, NULL);
		if (!map->stripes[i].dev &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			free_extent_map(em);
			btrfs_report_missing_device(fs_info, devid, uuid, true);
			return -ENOENT;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
			if (IS_ERR(map->stripes[i].dev)) {
				free_extent_map(em);
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
			}
			btrfs_report_missing_device(fs_info, devid, uuid, false);
		}
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));
	}

	write_lock(&map_tree->map_tree.lock);
	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
	write_unlock(&map_tree->map_tree.lock);
	BUG_ON(ret); /* Tree corruption */
	free_extent_map(em);

	return 0;
}

static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

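/*
 * Find the fs_devices of the seed filesystem @fsid and chain it onto the
 * seed list of the mounted filesystem.  On a degraded mount a seed fs
 * that cannot be found is represented by an empty, unopened fs_devices.
 */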
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	lockdep_assert_held(&uuid_mutex);
	ASSERT(fsid);

	fs_devices = fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
			return fs_devices;

		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = 1;
		fs_devices->opened = 1;
		return fs_devices;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(ret);
		goto out;
	}

	if (!fs_devices->seeding) {
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(-EINVAL);
		goto out;
	}

	fs_devices->seed = fs_info->fs_devices->seed;
	fs_info->fs_devices->seed = fs_devices;
out:
	return fs_devices;
}

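/*
 * Process one dev item from the chunk tree: open the owning seed
 * filesystem if necessary, find or stub out the matching device, and
 * copy the on-disk fields into the in-memory btrfs_device.
 */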
static int read_one_dev(struct btrfs_fs_info *fs_info,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_FSID_SIZE);

	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
		fs_devices = open_seed_devices(fs_info, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
			return -ENOENT;
		}

		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
	} else {
		if (!device->bdev) {
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
				return -ENOENT;
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
		}

		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			device->fs_devices->missing_devices++;
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != fs_info->fs_devices) {
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
	}
	ret = 0;
	return ret;
}

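/*
 * Parse the sys_chunk_array embedded in the superblock, so that the
 * SYSTEM chunks are mapped before the chunk tree itself is read.
 */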
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	set_extent_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the system array.
	 * set_extent_buffer_uptodate() call does not properly mark all its
	 * pages up-to-date when the page is larger: extent does not cover the
	 * whole page and consequently check_page_uptodate does not find all
	 * the page's extents up-to-date (the hole beyond sb),
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
	 * but sb spans only this function. Add an explicit SetPageUptodate call
	 * to silence the warning eg. on PowerPC 64.
	 */
	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;

	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		btrfs_disk_key_to_cpu(&key, disk_key);

		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_array_offset;
			/*
			 * At least one btrfs_chunk with one stripe must be
			 * present, exact stripe count check comes afterwards
			 */
			len = btrfs_chunk_item_size(1);
			if (cur_offset + len > array_size)
				goto out_short_read;

			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			if (!num_stripes) {
				btrfs_err(fs_info,
					"invalid number of stripes %u in sys_array at offset %u",
					num_stripes, cur_offset);
				ret = -EIO;
				break;
			}

			type = btrfs_chunk_type(sb, chunk);
			if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
				btrfs_err(fs_info,
			    "invalid chunk type %llu in sys_array at offset %u",
					type, cur_offset);
				ret = -EIO;
				break;
			}

			len = btrfs_chunk_item_size(num_stripes);
			if (cur_offset + len > array_size)
				goto out_short_read;

			ret = read_one_chunk(fs_info, &key, sb, chunk);
			if (ret)
				break;
		} else {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
	}
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
			len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}

/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start = 0;
	bool ret = true;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
	read_unlock(&map_tree->map_tree.lock);
	/* No chunk at all? Return false anyway */
	if (!em) {
		ret = false;
		goto out;
	}
	while (em) {
		struct map_lookup *map;
		int missing = 0;
		int max_tolerated;
		int i;

		map = em->map_lookup;
		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
			    dev->last_flush_error)
				missing++;
			else if (failing_dev && failing_dev == dev)
				missing++;
		}
		if (missing > max_tolerated) {
			if (!failing_dev)
				btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			ret = false;
			goto out;
		}
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->map_tree.lock);
		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->map_tree.lock);
	}
out:
	return ret;
}

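/*
 * Read all device items and chunk items from the chunk tree, then
 * cross-check the device count and total size against the superblock.
 */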
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(fs_info, leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

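/*
 * Point every known device, including those on the seed device chain, at
 * the freshly created fs_info. Called late in the mount sequence, once
 * fs_info exists.
 */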
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->fs_info = fs_info;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}

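/*
 * Load the persistent error counters of every device from the device tree.
 * A device without a stats item simply starts with zeroed counters.
 */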
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = BTRFS_DEV_STATS_OBJECTID;
		key.type = BTRFS_PERSISTENT_ITEM_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
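		/*
		 * No stats item found for this device (or the search failed):
		 * start from all-zero counters but still mark the stats valid
		 * so that new errors get accounted.
		 */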
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

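	/*
	 * An existing item smaller than sizeof(*ptr) was presumably written
	 * by an older kernel that tracked fewer counters; delete it and
	 * insert a full-sized replacement below.
	 */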
	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction(). Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc()/btrfs_dev_stat_set() and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stat_read_and_reset().
		 */
		smp_rmb();

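		/*
		 * Only the stats_cnt increments sampled above are subtracted
		 * after a successful update; anything that races in later
		 * stays in dev_stats_ccnt for the next commit.
		 */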
		ret = update_dev_stat_item(trans, fs_info, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

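/*
 * Bump one error counter on @dev and, when the stats are valid, print the
 * rate-limited per-device error summary.
 */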
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

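/*
 * Back end of the BTRFS_IOC_GET_DEV_STATS ioctl: copy the error counters
 * of the device named by @stats->devid into @stats, atomically reading and
 * resetting the live counters when BTRFS_DEV_STATS_RESET is set in
 * @stats->flags.
 */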
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
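
/*
 * Zero the btrfs magic in every superblock mirror on @bdev so the device
 * is no longer recognized as a btrfs member, then notify udev and update
 * the device node times so libblkid rescans it.
 */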
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
		copy_num++) {

		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
		brelse(bh);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

/*
 * Update the commit size (commit_total_bytes) of all resized devices, which
 * is the device size written out in the super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_devices->device_list_mutex);
}

/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

	if (list_empty(&trans->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(em, &trans->pending_chunks, list) {
		map = em->map_lookup;

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
}

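/*
 * Attach or detach the fs_info back-pointer for the whole fs_devices
 * chain, including any seed device lists, so device-level code can find
 * the owning filesystem.
 */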
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = fs_info;
		fs_devices = fs_devices->seed;
	}
}

void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	while (fs_devices) {
		fs_devices->fs_info = NULL;
		fs_devices = fs_devices->seed;
	}
}