volumes.c 188.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"
44

Z
Zhao Lei 已提交
45 46 47 48 49 50
/*
 * Per-profile attribute table, indexed by BTRFS_RAID_*.  Each entry records
 * the stripe layout, the device-count limits, how many device failures the
 * profile tolerates and how many copies it keeps.
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};

111
const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
Z
Zhao Lei 已提交
112 113 114 115 116 117 118 119 120
	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
	[BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
	[BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
/*
 * Error code to report, per BTRFS_RAID_* profile, when the minimum number
 * of devices for that profile is not available.  A zero entry means the
 * profile has no dedicated BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
	/* profiles that have a dedicated "min devices not met" error */
	[BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	[BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	[BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,

	/* profiles without one */
	[BTRFS_RAID_SINGLE] = 0,
	[BTRFS_RAID_DUP]    = 0,
	[BTRFS_RAID_RAID0]  = 0,
};

Y
Yan Zheng 已提交
136
static int init_first_rw_device(struct btrfs_trans_handle *trans,
137
				struct btrfs_fs_info *fs_info);
138
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
139
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
140
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
141
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
142 143 144 145 146
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);
Y
Yan Zheng 已提交
147

148
/* Taken around open/close of fs_devices and around trimming of extra devices. */
DEFINE_MUTEX(uuid_mutex);
/* Every known struct btrfs_fs_devices, linked through btrfs_fs_devices::list. */
static LIST_HEAD(fs_uuids);
150 151 152 153
/* Hand out the address of the file-local fs_uuids list head. */
struct list_head *btrfs_get_fs_uuids(void)
{
	struct list_head *registered = &fs_uuids;

	return registered;
}
154

D
David Sterba 已提交
155 156 157 158 159 160 161 162 163
/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
164 165 166
{
	struct btrfs_fs_devices *fs_devs;

167
	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
168 169 170 171 172 173
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
174
	INIT_LIST_HEAD(&fs_devs->resized_devices);
175 176 177 178 179 180 181 182
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

Y
Yan Zheng 已提交
183 184 185 186 187 188 189 190
/*
 * Free @fs_devices together with every btrfs_device still linked on its
 * devices list.  Warns if the structure is still marked as opened.
 *
 * For each device the name string, the preallocated flush bio and the
 * device structure itself are released.
 */
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		bio_put(device->flush_bio);
		kfree(device);
	}
	kfree(fs_devices);
}

198 199 200 201 202 203 204
/*
 * Send @action as a uevent on the kobject of the disk backing @bdev.
 * A failure is only logged; nothing is returned to the caller.
 */
static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	struct kobject *kobj = &disk_to_dev(bdev->bd_disk)->kobj;
	int ret;

	ret = kobject_uevent(kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(kobj),
			kobj);
}

211
void btrfs_cleanup_fs_uuids(void)
212 213 214
{
	struct btrfs_fs_devices *fs_devices;

Y
Yan Zheng 已提交
215 216 217 218
	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
Y
Yan Zheng 已提交
219
		free_fs_devices(fs_devices);
220 221 222
	}
}

223 224 225 226
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

227
	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
228 229 230
	if (!dev)
		return ERR_PTR(-ENOMEM);

231 232 233 234 235 236 237 238 239 240
	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

241 242
	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
243
	INIT_LIST_HEAD(&dev->resized_list);
244 245 246 247 248

	spin_lock_init(&dev->io_lock);

	spin_lock_init(&dev->reada_lock);
	atomic_set(&dev->reada_in_flight, 0);
249
	atomic_set(&dev->dev_stats_ccnt, 0);
250
	btrfs_device_data_ordered_init(dev);
251
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
252
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
253 254 255 256

	return dev;
}

257 258 259 260 261 262 263 264 265
/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
		u64 devid, const u8 *uuid)
{
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

279
/*
 * Look up the btrfs_fs_devices on the fs_uuids list whose fsid equals
 * @fsid (BTRFS_FSID_SIZE bytes).  Return it, or NULL if none matches.
 */
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
/*
 * Open the block device at @device_path and read its btrfs super block.
 *
 * @device_path: path handed to blkdev_get_by_path()
 * @flags:       open mode, also used when the device is put on failure
 * @holder:      holder cookie handed to blkdev_get_by_path()
 * @flush:       if non-zero, write back and wait on the device's page cache
 * @bdev:        out: the opened block device
 * @bh:          out: buffer head returned by btrfs_read_dev_super()
 *
 * Return 0 on success.  On failure the device is released again (if it was
 * opened), *@bdev and *@bh are set to NULL and the error code is returned.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*bh)) {
		ret = PTR_ERR(*bh);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}

327 328 329 330 331 332 333 334 335 336 337 338 339 340
/*
 * Put the chain @head..@tail back at the front of @pending_bios.
 *
 * If the list already has a tail, the previous head is chained after @tail;
 * otherwise @tail becomes the list's tail.
 */
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{
	struct bio *prev_head = pending_bios->head;

	pending_bios->head = head;

	if (!pending_bios->tail) {
		pending_bios->tail = tail;
		return;
	}

	tail->bi_next = prev_head;
}

341 342 343 344 345 346 347 348 349 350 351
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the schedulers ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
352
static noinline void run_scheduled_bios(struct btrfs_device *device)
353
{
354
	struct btrfs_fs_info *fs_info = device->fs_info;
355 356
	struct bio *pending;
	struct backing_dev_info *bdi;
357
	struct btrfs_pending_bios *pending_bios;
358 359 360
	struct bio *tail;
	struct bio *cur;
	int again = 0;
361
	unsigned long num_run;
362
	unsigned long batch_run = 0;
363
	unsigned long last_waited = 0;
364
	int force_reg = 0;
M
Miao Xie 已提交
365
	int sync_pending = 0;
366 367 368 369 370 371 372 373 374
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);
375

376
	bdi = device->bdev->bd_bdi;
377

378 379 380
loop:
	spin_lock(&device->io_lock);

381
loop_lock:
382
	num_run = 0;
383

384 385 386 387 388
	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
389
	if (!force_reg && device->pending_sync_bios.head) {
390
		pending_bios = &device->pending_sync_bios;
391 392
		force_reg = 1;
	} else {
393
		pending_bios = &device->pending_bios;
394 395
		force_reg = 0;
	}
396 397 398

	pending = pending_bios->head;
	tail = pending_bios->tail;
399 400 401 402 403 404 405 406 407 408
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
409 410
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
411 412
		again = 0;
		device->running_pending = 0;
413 414 415
	} else {
		again = 1;
		device->running_pending = 1;
416
	}
417 418 419 420

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

421 422
	spin_unlock(&device->io_lock);

C
Chris Mason 已提交
423
	while (pending) {
424 425

		rmb();
426 427 428 429 430 431 432 433
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
434 435 436 437 438
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

439 440 441
		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
442

443
		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
444

445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

461
		btrfsic_submit_bio(cur);
462 463
		num_run++;
		batch_run++;
464 465

		cond_resched();
466 467 468 469 470 471

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
C
Chris Mason 已提交
472
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
473
		    fs_info->fs_devices->open_devices > 1) {
474
			struct io_context *ioc;
475

476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
498
				cond_resched();
499 500
				continue;
			}
501
			spin_lock(&device->io_lock);
502
			requeue_list(pending_bios, pending, tail);
503
			device->running_pending = 1;
504 505

			spin_unlock(&device->io_lock);
506 507
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
508 509 510
			goto done;
		}
	}
511

512 513 514 515 516 517 518 519 520
	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

521
done:
522
	blk_finish_plug(&plug);
523 524
}

525
static void pending_bios_fn(struct btrfs_work *work)
526 527 528 529 530 531 532
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

A
Anand Jain 已提交
533

534
static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
A
Anand Jain 已提交
535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
{
	struct btrfs_fs_devices *fs_devs;
	struct btrfs_device *dev;

	if (!cur_dev->name)
		return;

	list_for_each_entry(fs_devs, &fs_uuids, list) {
		int del = 1;

		if (fs_devs->opened)
			continue;
		if (fs_devs->seeding)
			continue;

		list_for_each_entry(dev, &fs_devs->devices, dev_list) {

			if (dev == cur_dev)
				continue;
			if (!dev->name)
				continue;

			/*
			 * Todo: This won't be enough. What if the same device
			 * comes back (with new uuid and) with its mapper path?
			 * But for now, this does help as mostly an admin will
			 * either use mapper or non mapper path throughout.
			 */
			rcu_read_lock();
			del = strcmp(rcu_str_deref(dev->name),
						rcu_str_deref(cur_dev->name));
			rcu_read_unlock();
			if (!del)
				break;
		}

		if (!del) {
			/* delete the stale device */
			if (fs_devs->num_devices == 1) {
				btrfs_sysfs_remove_fsid(fs_devs);
				list_del(&fs_devs->list);
				free_fs_devices(fs_devs);
			} else {
				fs_devs->num_devices--;
				list_del(&dev->dev_list);
				rcu_string_free(dev->name);
581
				bio_put(dev->flush_bio);
A
Anand Jain 已提交
582 583 584 585 586 587 588
				kfree(dev);
			}
			break;
		}
	}
}

589 590 591 592 593 594 595 596
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
597
static noinline int device_list_add(const char *path,
598 599 600 601 602
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
603
	struct rcu_string *name;
604
	int ret = 0;
605 606 607 608
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
609 610 611 612
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);

613
		list_add(&fs_devices->list, &fs_uuids);
614

615 616
		device = NULL;
	} else {
617 618
		device = find_device(fs_devices, devid,
				disk_super->dev_item.uuid);
619
	}
620

621
	if (!device) {
Y
Yan Zheng 已提交
622 623 624
		if (fs_devices->opened)
			return -EBUSY;

625 626 627
		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
628
			/* we can safely leave the fs_devices entry around */
629
			return PTR_ERR(device);
630
		}
631 632 633

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
634
			bio_put(device->flush_bio);
635 636 637
			kfree(device);
			return -ENOMEM;
		}
638
		rcu_assign_pointer(device->name, name);
639

640
		mutex_lock(&fs_devices->device_list_mutex);
641
		list_add_rcu(&device->dev_list, &fs_devices->devices);
642
		fs_devices->num_devices++;
643 644
		mutex_unlock(&fs_devices->device_list_mutex);

645
		ret = 1;
Y
Yan Zheng 已提交
646
		device->fs_devices = fs_devices;
647
	} else if (!device->name || strcmp(device->name->str, path)) {
648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be a spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
669 670 671 672
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
673
		 */
674
		if (!fs_devices->opened && found_transid < device->generation) {
675 676 677 678 679 680 681
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid.We keep the one
			 * with larger generation number or the last-in if
			 * generation are equal.
			 */
682
			return -EEXIST;
683
		}
684

685
		name = rcu_string_strdup(path, GFP_NOFS);
686 687
		if (!name)
			return -ENOMEM;
688 689
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
690 691 692 693
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
694 695
	}

696 697 698 699 700 701 702 703 704
	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

A
Anand Jain 已提交
705 706 707 708
	/*
	 * if there is new btrfs on an already registered device,
	 * then remove the stale device entry.
	 */
709 710
	if (ret > 0)
		btrfs_free_stale_device(device);
A
Anand Jain 已提交
711

712
	*fs_devices_ret = fs_devices;
713 714

	return ret;
715 716
}

Y
Yan Zheng 已提交
717 718 719 720 721 722
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

723 724 725
	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;
Y
Yan Zheng 已提交
726

727
	mutex_lock(&orig->device_list_mutex);
J
Josef Bacik 已提交
728
	fs_devices->total_devices = orig->total_devices;
Y
Yan Zheng 已提交
729

730
	/* We have held the volume lock, it is safe to get the devices. */
Y
Yan Zheng 已提交
731
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
732 733
		struct rcu_string *name;

734 735 736
		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
Y
Yan Zheng 已提交
737 738
			goto error;

739 740 741 742
		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
743
		if (orig_dev->name) {
744 745
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
746
			if (!name) {
747
				bio_put(device->flush_bio);
748 749 750 751
				kfree(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
J
Julia Lawall 已提交
752
		}
Y
Yan Zheng 已提交
753 754 755 756 757

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
758
	mutex_unlock(&orig->device_list_mutex);
Y
Yan Zheng 已提交
759 760
	return fs_devices;
error:
761
	mutex_unlock(&orig->device_list_mutex);
Y
Yan Zheng 已提交
762 763 764 765
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

766
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
767
{
Q
Qinghuang Feng 已提交
768
	struct btrfs_device *device, *next;
769
	struct btrfs_device *latest_dev = NULL;
770

771 772
	mutex_lock(&uuid_mutex);
again:
773
	/* This is the initialized path, it is safe to release the devices. */
Q
Qinghuang Feng 已提交
774
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
775
		if (device->in_fs_metadata) {
776
			if (!device->is_tgtdev_for_dev_replace &&
777 778 779
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
780
			}
Y
Yan Zheng 已提交
781
			continue;
782
		}
Y
Yan Zheng 已提交
783

784 785 786 787 788 789 790 791 792 793 794 795 796 797 798
		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || device->is_tgtdev_for_dev_replace) {
				continue;
			}
		}
Y
Yan Zheng 已提交
799
		if (device->bdev) {
800
			blkdev_put(device->bdev, device->mode);
Y
Yan Zheng 已提交
801 802 803 804 805 806
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
807 808
			if (!device->is_tgtdev_for_dev_replace)
				fs_devices->rw_devices--;
Y
Yan Zheng 已提交
809
		}
Y
Yan Zheng 已提交
810 811
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
812
		rcu_string_free(device->name);
813
		bio_put(device->flush_bio);
Y
Yan Zheng 已提交
814
		kfree(device);
815
	}
Y
Yan Zheng 已提交
816 817 818 819 820 821

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

822
	fs_devices->latest_bdev = latest_dev->bdev;
823

824 825
	mutex_unlock(&uuid_mutex);
}
826

827
static void free_device_rcu(struct rcu_head *head)
828 829 830
{
	struct btrfs_device *device;

L
Liu Bo 已提交
831
	device = container_of(head, struct btrfs_device, rcu);
832
	rcu_string_free(device->name);
833
	bio_put(device->flush_bio);
834 835 836
	kfree(device);
}

837 838 839 840 841 842 843 844 845 846 847
/*
 * Drop the block device held by @device, if any.  A writeable device is
 * synced and has its cached pages invalidated before the put.
 */
static void btrfs_close_bdev(struct btrfs_device *device)
{
	struct block_device *bdev = device->bdev;

	if (!bdev)
		return;

	if (device->writeable) {
		sync_blockdev(bdev);
		invalidate_bdev(bdev);
	}

	blkdev_put(bdev, device->mode);
}

848
static void btrfs_prepare_close_one_device(struct btrfs_device *device)
849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
	struct btrfs_device *new_device;
	struct rcu_string *name;

	if (device->bdev)
		fs_devices->open_devices--;

	if (device->writeable &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->missing)
		fs_devices->missing_devices--;

	new_device = btrfs_alloc_device(NULL, &device->devid,
					device->uuid);
	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

	/* Safe because we are under uuid_mutex */
	if (device->name) {
		name = rcu_string_strdup(device->name->str, GFP_NOFS);
		BUG_ON(!name); /* -ENOMEM */
		rcu_assign_pointer(new_device->name, name);
	}

	list_replace_rcu(&device->dev_list, &new_device->dev_list);
	new_device->fs_devices = device->fs_devices;
}

Y
Yan Zheng 已提交
881
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
882
{
883
	struct btrfs_device *device, *tmp;
884 885 886
	struct list_head pending_put;

	INIT_LIST_HEAD(&pending_put);
Y
Yan Zheng 已提交
887

Y
Yan Zheng 已提交
888 889
	if (--fs_devices->opened > 0)
		return 0;
890

891
	mutex_lock(&fs_devices->device_list_mutex);
892
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
893 894
		btrfs_prepare_close_one_device(device);
		list_add(&device->dev_list, &pending_put);
895
	}
896 897
	mutex_unlock(&fs_devices->device_list_mutex);

898 899 900 901 902 903 904 905 906 907 908
	/*
	 * btrfs_show_devname() is using the device_list_mutex,
	 * sometimes call to blkdev_put() leads vfs calling
	 * into this func. So do put outside of device_list_mutex,
	 * as of now.
	 */
	while (!list_empty(&pending_put)) {
		device = list_first_entry(&pending_put,
				struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_close_bdev(device);
909
		call_rcu(&device->rcu, free_device_rcu);
910 911
	}

Y
Yan Zheng 已提交
912 913
	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
Y
Yan Zheng 已提交
914 915 916
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

917 918 919
	return 0;
}

Y
Yan Zheng 已提交
920 921
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
Y
Yan Zheng 已提交
922
	struct btrfs_fs_devices *seed_devices = NULL;
Y
Yan Zheng 已提交
923 924 925 926
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
Y
Yan Zheng 已提交
927 928 929 930
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
Y
Yan Zheng 已提交
931
	mutex_unlock(&uuid_mutex);
Y
Yan Zheng 已提交
932 933 934 935 936 937 938

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
Y
Yan Zheng 已提交
939 940 941
	return ret;
}

Y
Yan Zheng 已提交
942 943
/*
 * Open every named, not-yet-opened device of @fs_devices exclusively and
 * validate its super block against the registered devid and uuid.
 *
 * @flags:  open mode; FMODE_EXCL is always added
 * @holder: holder cookie passed through to the block layer
 *
 * Devices that fail to open or do not match are skipped.  For each device
 * that matches, generation, writeable, can_discard, bdev and mode are
 * recorded and the open/rw counters of @fs_devices are updated.
 *
 * Return 0 if at least one device is open (fs_devices is then marked
 * opened, with seeding set only if every opened device carries the
 * seeding flag), or -EINVAL if none is.
 */
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		/* Just open everything we can; ignore failures here */
		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
					    &bdev, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q))
			device->can_discard = 1;
		if (!blk_queue_nonrot(q))
			fs_devices->rotating = 1;

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		fs_devices->open_devices++;
		if (device->writeable &&
		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		/* super block did not match this device: undo the open */
		brelse(bh);
		blkdev_put(bdev, flags);
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1028
		       fmode_t flags, void *holder)
Y
Yan Zheng 已提交
1029 1030 1031 1032 1033
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
Y
Yan Zheng 已提交
1034 1035
		fs_devices->opened++;
		ret = 0;
Y
Yan Zheng 已提交
1036
	} else {
1037
		ret = __btrfs_open_devices(fs_devices, flags, holder);
Y
Yan Zheng 已提交
1038
	}
1039 1040 1041 1042
	mutex_unlock(&uuid_mutex);
	return ret;
}

/*
 * Release a super block page obtained from btrfs_read_disk_super(): drop the
 * kmap() mapping and then the page reference.
 */
static void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}

1049 1050 1051
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				 struct page **page,
				 struct btrfs_super_block **disk_super)
1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093
{
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return 1;

	/* make sure our super fits in the page */
	if (sizeof(**disk_super) > PAGE_SIZE)
		return 1;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
		return 1;

	/* pull in the page with our super */
	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_KERNEL);

	if (IS_ERR_OR_NULL(*page))
		return 1;

	p = kmap(*page);

	/* align our pointer to the offset of the super block */
	*disk_super = p + (bytenr & ~PAGE_MASK);

	if (btrfs_super_bytenr(*disk_super) != bytenr ||
	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(*page);
		return 1;
	}

	if ((*disk_super)->label[0] &&
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

	return 0;
}

1094 1095 1096 1097 1098
/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
1099
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1100 1101 1102 1103
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
1104 1105
	struct page *page;
	int ret = -EINVAL;
1106
	u64 devid;
1107
	u64 transid;
J
Josef Bacik 已提交
1108
	u64 total_devices;
1109
	u64 bytenr;
1110

1111 1112 1113 1114 1115 1116 1117
	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
1118
	flags |= FMODE_EXCL;
1119
	mutex_lock(&uuid_mutex);
1120 1121 1122 1123

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
1124
		goto error;
1125 1126
	}

1127
	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
1128 1129
		goto error_bdev_put;

1130
	devid = btrfs_stack_device_id(&disk_super->dev_item);
1131
	transid = btrfs_super_generation(disk_super);
J
Josef Bacik 已提交
1132
	total_devices = btrfs_super_num_devices(disk_super);
1133

1134
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
1135 1136
	if (ret > 0) {
		if (disk_super->label[0]) {
1137
			pr_info("BTRFS: device label %s ", disk_super->label);
1138
		} else {
1139
			pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
1140 1141
		}

1142
		pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
1143 1144
		ret = 0;
	}
J
Josef Bacik 已提交
1145 1146
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;
1147

1148
	btrfs_release_disk_super(page);
1149 1150

error_bdev_put:
1151
	blkdev_put(bdev, flags);
1152
error:
1153
	mutex_unlock(&uuid_mutex);
1154 1155
	return ret;
}
1156

1157 1158 1159 1160 1161
/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
1162
	struct btrfs_root *root = device->fs_info->dev_root;
1163 1164 1165 1166 1167 1168 1169 1170 1171
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

1172
	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
1173 1174 1175 1176 1177
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1178
	path->reada = READA_FORWARD;
1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

1213
		if (key.type != BTRFS_DEV_EXTENT_KEY)
1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

1241
static int contains_pending_extent(struct btrfs_transaction *transaction,
1242 1243 1244
				   struct btrfs_device *device,
				   u64 *start, u64 len)
{
1245
	struct btrfs_fs_info *fs_info = device->fs_info;
1246
	struct extent_map *em;
1247
	struct list_head *search_list = &fs_info->pinned_chunks;
1248
	int ret = 0;
1249
	u64 physical_start = *start;
1250

1251 1252
	if (transaction)
		search_list = &transaction->pending_chunks;
1253 1254
again:
	list_for_each_entry(em, search_list, list) {
1255 1256 1257
		struct map_lookup *map;
		int i;

1258
		map = em->map_lookup;
1259
		for (i = 0; i < map->num_stripes; i++) {
1260 1261
			u64 end;

1262 1263
			if (map->stripes[i].dev != device)
				continue;
1264
			if (map->stripes[i].physical >= physical_start + len ||
1265
			    map->stripes[i].physical + em->orig_block_len <=
1266
			    physical_start)
1267
				continue;
1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284
			/*
			 * Make sure that while processing the pinned list we do
			 * not override our *start with a lower value, because
			 * we can have pinned chunks that fall within this
			 * device hole and that have lower physical addresses
			 * than the pending chunks we processed before. If we
			 * do not take this special care we can end up getting
			 * 2 pending chunks that start at the same physical
			 * device offsets because the end offset of a pinned
			 * chunk can be equal to the start offset of some
			 * pending chunk.
			 */
			end = map->stripes[i].physical + em->orig_block_len;
			if (end > *start) {
				*start = end;
				ret = 1;
			}
1285 1286
		}
	}
1287 1288
	if (search_list != &fs_info->pinned_chunks) {
		search_list = &fs_info->pinned_chunks;
1289 1290
		goto again;
	}
1291 1292 1293 1294 1295

	return ret;
}


1296
/*
1297 1298 1299 1300 1301 1302 1303
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space. that we find, or the size
 *		  of the max free space if we don't find suitable free space
1304
 *
1305 1306 1307
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
1308 1309 1310 1311 1312 1313 1314 1315
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
1316
 */
1317 1318 1319
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
			       struct btrfs_device *device, u64 num_bytes,
			       u64 search_start, u64 *start, u64 *len)
1320
{
1321 1322
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
1323
	struct btrfs_key key;
1324
	struct btrfs_dev_extent *dev_extent;
Y
Yan Zheng 已提交
1325
	struct btrfs_path *path;
1326 1327 1328 1329
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
1330 1331
	u64 search_end = device->total_bytes;
	int ret;
1332
	int slot;
1333
	struct extent_buffer *l;
1334 1335 1336 1337 1338 1339

	/*
	 * We don't want to overwrite the superblock on the drive nor any area
	 * used by the boot loader (grub for example), so we make sure to start
	 * at an offset of at least 1MB.
	 */
1340
	search_start = max_t(u64, search_start, SZ_1M);
1341

1342 1343 1344
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1345

1346 1347 1348
	max_hole_start = search_start;
	max_hole_size = 0;

1349
again:
1350
	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1351
		ret = -ENOSPC;
1352
		goto out;
1353 1354
	}

1355
	path->reada = READA_FORWARD;
1356 1357
	path->search_commit_root = 1;
	path->skip_locking = 1;
1358

1359 1360 1361
	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;
1362

1363
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1364
	if (ret < 0)
1365
		goto out;
1366 1367 1368
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
1369
			goto out;
1370
	}
1371

1372 1373 1374 1375 1376 1377 1378 1379
	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
1380 1381 1382
				goto out;

			break;
1383 1384 1385 1386 1387 1388 1389
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
1390
			break;
1391

1392
		if (key.type != BTRFS_DEV_EXTENT_KEY)
1393
			goto next;
1394

1395 1396
		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
1397

1398 1399 1400 1401
			/*
			 * Have to check before we set max_hole_start, otherwise
			 * we could end up sending back this offset anyway.
			 */
1402
			if (contains_pending_extent(transaction, device,
1403
						    &search_start,
1404 1405 1406 1407 1408 1409 1410 1411
						    hole_size)) {
				if (key.offset >= search_start) {
					hole_size = key.offset - search_start;
				} else {
					WARN_ON_ONCE(1);
					hole_size = 0;
				}
			}
1412

1413 1414 1415 1416
			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}
1417

1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429
			/*
			 * If this free space is greater than which we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
1430 1431 1432 1433
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1434 1435 1436 1437
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
1438 1439 1440 1441 1442
next:
		path->slots[0]++;
		cond_resched();
	}

1443 1444 1445 1446 1447
	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
1448
	if (search_end > search_start) {
1449 1450
		hole_size = search_end - search_start;

1451
		if (contains_pending_extent(transaction, device, &search_start,
1452 1453 1454 1455
					    hole_size)) {
			btrfs_release_path(path);
			goto again;
		}
1456

1457 1458 1459 1460
		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
1461 1462
	}

1463
	/* See above. */
1464
	if (max_hole_size < num_bytes)
1465 1466 1467 1468 1469
		ret = -ENOSPC;
	else
		ret = 0;

out:
Y
Yan Zheng 已提交
1470
	btrfs_free_path(path);
1471
	*start = max_hole_start;
1472
	if (len)
1473
		*len = max_hole_size;
1474 1475 1476
	return ret;
}

1477 1478 1479 1480 1481 1482
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(trans->transaction, device,
1483
					  num_bytes, 0, start, len);
1484 1485
}

1486
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1487
			  struct btrfs_device *device,
M
Miao Xie 已提交
1488
			  u64 start, u64 *dev_extent_len)
1489
{
1490 1491
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
1492 1493 1494
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
1495 1496 1497
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;
1498 1499 1500 1501 1502 1503 1504 1505

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
M
Miao Xie 已提交
1506
again:
1507
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1508 1509 1510
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
1511 1512
		if (ret)
			goto out;
1513 1514 1515 1516 1517 1518
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
M
Miao Xie 已提交
1519 1520 1521
		key = found_key;
		btrfs_release_path(path);
		goto again;
1522 1523 1524 1525
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
1526
	} else {
1527
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1528
		goto out;
1529
	}
1530

M
Miao Xie 已提交
1531 1532
	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

1533
	ret = btrfs_del_item(trans, root, path);
1534
	if (ret) {
1535 1536
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
Z
Zhao Lei 已提交
1537
	} else {
1538
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1539
	}
1540
out:
1541 1542 1543 1544
	btrfs_free_path(path);
	return ret;
}

1545 1546 1547
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
1548 1549 1550
{
	int ret;
	struct btrfs_path *path;
1551 1552
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
1553 1554 1555 1556
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

1557
	WARN_ON(!device->in_fs_metadata);
1558
	WARN_ON(device->is_tgtdev_for_dev_replace);
1559 1560 1561 1562 1563
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
Y
Yan Zheng 已提交
1564
	key.offset = start;
1565 1566 1567
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
1568 1569
	if (ret)
		goto out;
1570 1571 1572 1573

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
1574 1575
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
1576 1577
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1578 1579
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

1580 1581
	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
1582
out:
1583 1584 1585 1586
	btrfs_free_path(path);
	return ret;
}

1587
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1588
{
1589 1590 1591 1592
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;
1593

1594 1595 1596 1597 1598 1599
	em_tree = &fs_info->mapping_tree.map_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
1600
	}
1601 1602
	read_unlock(&em_tree->lock);

1603 1604 1605
	return ret;
}

1606 1607
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
1608 1609 1610 1611
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
Y
Yan Zheng 已提交
1612 1613 1614 1615 1616
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1617 1618 1619 1620 1621

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

1622
	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1623 1624 1625
	if (ret < 0)
		goto error;

1626
	BUG_ON(ret == 0); /* Corruption */
1627

1628 1629
	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
1630 1631
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
1632
		*devid_ret = 1;
1633 1634 1635
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
1636
		*devid_ret = found_key.offset + 1;
1637 1638 1639
	}
	ret = 0;
error:
Y
Yan Zheng 已提交
1640
	btrfs_free_path(path);
1641 1642 1643 1644 1645 1646 1647
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
1648
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1649
			    struct btrfs_fs_info *fs_info,
1650
			    struct btrfs_device *device)
1651
{
1652
	struct btrfs_root *root = fs_info->chunk_root;
1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
Y
Yan Zheng 已提交
1666
	key.offset = device->devid;
1667 1668

	ret = btrfs_insert_empty_item(trans, root, path, &key,
1669
				      sizeof(*dev_item));
1670 1671 1672 1673 1674 1675 1676
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
Y
Yan Zheng 已提交
1677
	btrfs_set_device_generation(leaf, dev_item, 0);
1678 1679 1680 1681
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1682 1683 1684 1685
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
1686 1687 1688
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1689
	btrfs_set_device_start_offset(leaf, dev_item, 0);
1690

1691
	ptr = btrfs_device_uuid(dev_item);
1692
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1693
	ptr = btrfs_device_fsid(dev_item);
1694
	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1695 1696
	btrfs_mark_buffer_dirty(leaf);

Y
Yan Zheng 已提交
1697
	ret = 0;
1698 1699 1700 1701
out:
	btrfs_free_path(path);
	return ret;
}
1702

1703 1704 1705 1706
/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 */
1707
static void update_dev_time(const char *path_name)
1708 1709 1710 1711
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
1712
	if (IS_ERR(filp))
1713 1714 1715 1716 1717
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

1718
static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1719 1720
			     struct btrfs_device *device)
{
1721
	struct btrfs_root *root = fs_info->chunk_root;
1722 1723 1724 1725 1726 1727 1728 1729 1730
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

1731
	trans = btrfs_start_transaction(root, 0);
1732 1733 1734 1735
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
1736 1737 1738 1739 1740
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1741 1742 1743 1744 1745
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
1746 1747 1748 1749
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
1750 1751 1752 1753 1754
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

1755 1756
out:
	btrfs_free_path(path);
1757 1758
	if (!ret)
		ret = btrfs_commit_transaction(trans);
1759 1760 1761
	return ret;
}

1762 1763 1764 1765 1766 1767 1768
/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
1769 1770
{
	u64 all_avail;
1771
	unsigned seq;
1772
	int i;
1773

1774
	do {
1775
		seq = read_seqbegin(&fs_info->profiles_lock);
1776

1777 1778 1779 1780
		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));
1781

1782 1783 1784
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_group[i]))
			continue;
1785

1786 1787
		if (num_devices < btrfs_raid_array[i].devs_min) {
			int ret = btrfs_raid_mindev_error[i];
1788

1789 1790 1791
			if (ret)
				return ret;
		}
D
David Woodhouse 已提交
1792 1793
	}

1794
	return 0;
1795 1796
}

1797 1798
static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1799
{
Y
Yan Zheng 已提交
1800
	struct btrfs_device *next_device;
1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
			!next_device->missing && next_device->bdev)
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device, in the context
 * where this function called, there should be always be another device (or
 * this_dev) which is active.
 */
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
		struct btrfs_device *device, struct btrfs_device *this_dev)
{
	struct btrfs_device *next_device;

	if (this_dev)
		next_device = this_dev;
	else
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
								device);
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

	if (fs_info->fs_devices->latest_bdev == device->bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
}

1837 1838
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		u64 devid)
1839 1840
{
	struct btrfs_device *device;
1841
	struct btrfs_fs_devices *cur_devices;
Y
Yan Zheng 已提交
1842
	u64 num_devices;
1843 1844
	int ret = 0;

1845
	mutex_lock(&fs_info->volume_mutex);
1846 1847
	mutex_lock(&uuid_mutex);

1848 1849 1850
	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1851 1852 1853
		WARN_ON(num_devices < 1);
		num_devices--;
	}
1854
	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
1855

1856
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1857
	if (ret)
1858 1859
		goto out;

1860 1861
	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
					   &device);
1862
	if (ret)
D
David Woodhouse 已提交
1863
		goto out;
1864

1865
	if (device->is_tgtdev_for_dev_replace) {
1866
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1867
		goto out;
1868 1869
	}

1870
	if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
1871
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1872
		goto out;
Y
Yan Zheng 已提交
1873 1874 1875
	}

	if (device->writeable) {
1876
		mutex_lock(&fs_info->chunk_mutex);
Y
Yan Zheng 已提交
1877
		list_del_init(&device->dev_alloc_list);
1878
		device->fs_devices->rw_devices--;
1879
		mutex_unlock(&fs_info->chunk_mutex);
1880
	}
1881

1882
	mutex_unlock(&uuid_mutex);
1883
	ret = btrfs_shrink_device(device, 0);
1884
	mutex_lock(&uuid_mutex);
1885
	if (ret)
1886
		goto error_undo;
1887

1888 1889 1890 1891 1892
	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
1893
	ret = btrfs_rm_dev_item(fs_info, device);
1894
	if (ret)
1895
		goto error_undo;
1896

Y
Yan Zheng 已提交
1897
	device->in_fs_metadata = 0;
1898
	btrfs_scrub_cancel_dev(fs_info, device);
1899 1900 1901 1902

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
1903 1904 1905 1906 1907
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
1908
	 */
1909 1910

	cur_devices = device->fs_devices;
1911
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
1912
	list_del_rcu(&device->dev_list);
1913

Y
Yan Zheng 已提交
1914
	device->fs_devices->num_devices--;
J
Josef Bacik 已提交
1915
	device->fs_devices->total_devices--;
Y
Yan Zheng 已提交
1916

1917
	if (device->missing)
1918
		device->fs_devices->missing_devices--;
1919

1920
	btrfs_assign_next_active_device(fs_info, device, NULL);
Y
Yan Zheng 已提交
1921

1922
	if (device->bdev) {
Y
Yan Zheng 已提交
1923
		device->fs_devices->open_devices--;
1924
		/* remove sysfs entry */
1925
		btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
1926
	}
1927

1928 1929 1930
	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Y
Yan Zheng 已提交
1931

1932 1933 1934 1935 1936 1937 1938 1939 1940
	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (device->writeable)
		btrfs_scratch_superblocks(device->bdev, device->name->str);

	btrfs_close_bdev(device);
1941
	call_rcu(&device->rcu, free_device_rcu);
1942

1943
	if (cur_devices->open_devices == 0) {
Y
Yan Zheng 已提交
1944
		struct btrfs_fs_devices *fs_devices;
1945
		fs_devices = fs_info->fs_devices;
Y
Yan Zheng 已提交
1946
		while (fs_devices) {
1947 1948
			if (fs_devices->seed == cur_devices) {
				fs_devices->seed = cur_devices->seed;
Y
Yan Zheng 已提交
1949
				break;
1950
			}
Y
Yan Zheng 已提交
1951
			fs_devices = fs_devices->seed;
Y
Yan Zheng 已提交
1952
		}
1953 1954 1955
		cur_devices->seed = NULL;
		__btrfs_close_devices(cur_devices);
		free_fs_devices(cur_devices);
Y
Yan Zheng 已提交
1956 1957
	}

1958 1959
out:
	mutex_unlock(&uuid_mutex);
1960
	mutex_unlock(&fs_info->volume_mutex);
1961
	return ret;
1962

1963 1964
error_undo:
	if (device->writeable) {
1965
		mutex_lock(&fs_info->chunk_mutex);
1966
		list_add(&device->dev_alloc_list,
1967
			 &fs_info->fs_devices->alloc_list);
1968
		device->fs_devices->rw_devices++;
1969
		mutex_unlock(&fs_info->chunk_mutex);
1970
	}
1971
	goto out;
1972 1973
}

1974 1975
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
					struct btrfs_device *srcdev)
1976
{
1977 1978
	struct btrfs_fs_devices *fs_devices;

1979
	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1980

1981 1982 1983 1984 1985 1986 1987
	/*
	 * in case of fs with no seed, srcdev->fs_devices will point
	 * to fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both the cases.
	 */
	fs_devices = srcdev->fs_devices;
1988

1989
	list_del_rcu(&srcdev->dev_list);
1990
	list_del(&srcdev->dev_alloc_list);
1991
	fs_devices->num_devices--;
1992
	if (srcdev->missing)
1993
		fs_devices->missing_devices--;
1994

1995
	if (srcdev->writeable)
1996
		fs_devices->rw_devices--;
1997

1998
	if (srcdev->bdev)
1999
		fs_devices->open_devices--;
2000 2001 2002 2003 2004 2005
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2006

2007 2008 2009 2010
	if (srcdev->writeable) {
		/* zero out the old super if it is writable */
		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
	}
2011 2012

	btrfs_close_bdev(srcdev);
2013
	call_rcu(&srcdev->rcu, free_device_rcu);
2014 2015 2016 2017 2018

	/* if this is no devs we rather delete the fs_devices */
	if (!fs_devices->num_devices) {
		struct btrfs_fs_devices *tmp_fs_devices;

2019 2020 2021 2022 2023 2024 2025 2026
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

2027 2028 2029 2030 2031 2032 2033 2034 2035
		tmp_fs_devices = fs_info->fs_devices;
		while (tmp_fs_devices) {
			if (tmp_fs_devices->seed == fs_devices) {
				tmp_fs_devices->seed = fs_devices->seed;
				break;
			}
			tmp_fs_devices = tmp_fs_devices->seed;
		}
		fs_devices->seed = NULL;
2036 2037
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
2038
	}
2039 2040 2041 2042 2043
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
2044
	mutex_lock(&uuid_mutex);
2045 2046
	WARN_ON(!tgtdev);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2047

2048
	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2049

2050
	if (tgtdev->bdev)
2051
		fs_info->fs_devices->open_devices--;
2052

2053 2054
	fs_info->fs_devices->num_devices--;

2055
	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2056 2057 2058 2059

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2060
	mutex_unlock(&uuid_mutex);
2061 2062 2063 2064 2065 2066 2067 2068 2069

	/*
	 * The update_dev_time() with in btrfs_scratch_superblocks()
	 * may lead to a call to btrfs_show_devname() which will try
	 * to hold device_list_mutex. And here this device
	 * is already out of device list, so we don't have to hold
	 * the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2070 2071

	btrfs_close_bdev(tgtdev);
2072
	call_rcu(&tgtdev->rcu, free_device_rcu);
2073 2074
}

2075
static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2076
				     const char *device_path,
2077
				     struct btrfs_device **device)
2078 2079 2080 2081 2082 2083 2084 2085 2086 2087
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct buffer_head *bh;

	*device = NULL;
	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2088
				    fs_info->bdev_holder, 0, &bdev, &bh);
2089 2090 2091 2092 2093
	if (ret)
		return ret;
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
2094
	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2095 2096 2097 2098 2099 2100 2101
	brelse(bh);
	if (!*device)
		ret = -ENOENT;
	blkdev_put(bdev, FMODE_READ);
	return ret;
}

2102
int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2103
					 const char *device_path,
2104 2105 2106 2107 2108 2109 2110
					 struct btrfs_device **device)
{
	*device = NULL;
	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

2111
		devices = &fs_info->fs_devices->devices;
2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held by the caller.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				*device = tmp;
				break;
			}
		}

2123 2124
		if (!*device)
			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2125 2126 2127

		return 0;
	} else {
2128
		return btrfs_find_device_by_path(fs_info, device_path, device);
2129 2130 2131
	}
}

2132 2133 2134
/*
 * Lookup a device given by device id, or the path if the id is 0.
 */
2135
int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2136 2137
				 const char *devpath,
				 struct btrfs_device **device)
2138 2139 2140
{
	int ret;

2141
	if (devid) {
2142
		ret = 0;
2143
		*device = btrfs_find_device(fs_info, devid, NULL, NULL);
2144 2145 2146
		if (!*device)
			ret = -ENOENT;
	} else {
2147
		if (!devpath || !devpath[0])
2148 2149
			return -EINVAL;

2150
		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2151 2152 2153 2154 2155
							   device);
	}
	return ret;
}

Y
Yan Zheng 已提交
2156 2157 2158
/*
 * Does all the dirty work required for changing the file system's UUID.
 *
 * The current (seeding) fs_devices of @fs_info is turned into a sprout:
 * its devices are moved to a freshly allocated seed fs_devices that is
 * chained in through ->seed, its counters are reset, and a new random
 * fsid is generated and copied into @fs_info and the in-memory super
 * block copy.  The SEEDING flag is cleared in the super block copy.
 *
 * The caller must hold uuid_mutex (enforced with BUG_ON).
 *
 * Returns 0 on success, -EINVAL if the filesystem is not seeding, or the
 * error from alloc_fs_devices() / clone_fs_devices().
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = alloc_fs_devices(NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	/* A clone of the current fs_devices is what gets listed in fs_uuids. */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	/*
	 * Start the seed fs_devices as a byte copy of the current one, then
	 * re-initialize the members that must not be shared (list heads and
	 * the device list mutex).
	 */
	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	/* Hand every device over to the seed fs_devices. */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	mutex_lock(&fs_info->chunk_mutex);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	mutex_unlock(&fs_info->chunk_mutex);

	/* The sprout starts out empty, with the seed chained behind it. */
	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	/* Give the sprout its own fsid, mirrored in fs_info and the super. */
	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * Store the expected generation for seed devices in device items.
 *
 * Walks every BTRFS_DEV_ITEM_KEY item of the chunk tree.  For each item
 * the matching in-memory device is looked up by devid, device uuid and
 * fsid; if that device belongs to a seeding fs_devices, the generation
 * field of the on-disk item is set to device->generation and the leaf is
 * marked dirty.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative error from btrfs_search_slot() / btrfs_next_leaf().
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_FSID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			/* Ran off the end of this leaf: move to the next one. */
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			/*
			 * Remember the first key of the new leaf, drop the
			 * path and search again from that key.
			 */
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		/* Stop at the first item that is not a device item. */
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				   BTRFS_FSID_SIZE);
		device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
		BUG_ON(!device); /* Logic error */

		/* Only items of devices owned by a seed fs_devices are updated. */
		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

2293
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2294
{
2295
	struct btrfs_root *root = fs_info->dev_root;
2296
	struct request_queue *q;
2297 2298 2299 2300
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
2301
	struct super_block *sb = fs_info->sb;
2302
	struct rcu_string *name;
2303
	u64 tmp;
Y
Yan Zheng 已提交
2304
	int seeding_dev = 0;
2305
	int ret = 0;
2306
	bool unlocked = false;
2307

2308
	if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2309
		return -EROFS;
2310

2311
	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2312
				  fs_info->bdev_holder);
2313 2314
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
2315

2316
	if (fs_info->fs_devices->seeding) {
Y
Yan Zheng 已提交
2317 2318 2319 2320 2321
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

2322
	filemap_write_and_wait(bdev->bd_inode->i_mapping);
2323

2324
	devices = &fs_info->fs_devices->devices;
2325

2326
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
Q
Qinghuang Feng 已提交
2327
	list_for_each_entry(device, devices, dev_list) {
2328 2329
		if (device->bdev == bdev) {
			ret = -EEXIST;
2330
			mutex_unlock(
2331
				&fs_info->fs_devices->device_list_mutex);
Y
Yan Zheng 已提交
2332
			goto error;
2333 2334
		}
	}
2335
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2336

2337
	device = btrfs_alloc_device(fs_info, NULL, NULL);
2338
	if (IS_ERR(device)) {
2339
		/* we can safely leave the fs_devices entry around */
2340
		ret = PTR_ERR(device);
Y
Yan Zheng 已提交
2341
		goto error;
2342 2343
	}

2344
	name = rcu_string_strdup(device_path, GFP_KERNEL);
2345
	if (!name) {
2346
		bio_put(device->flush_bio);
2347
		kfree(device);
Y
Yan Zheng 已提交
2348 2349
		ret = -ENOMEM;
		goto error;
2350
	}
2351
	rcu_assign_pointer(device->name, name);
Y
Yan Zheng 已提交
2352

2353
	trans = btrfs_start_transaction(root, 0);
2354
	if (IS_ERR(trans)) {
2355
		rcu_string_free(device->name);
2356
		bio_put(device->flush_bio);
2357 2358 2359 2360 2361
		kfree(device);
		ret = PTR_ERR(trans);
		goto error;
	}

2362 2363 2364
	q = bdev_get_queue(bdev);
	if (blk_queue_discard(q))
		device->can_discard = 1;
Y
Yan Zheng 已提交
2365 2366
	device->writeable = 1;
	device->generation = trans->transid;
2367 2368 2369
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
2370 2371
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
2372
	device->disk_total_bytes = device->total_bytes;
2373
	device->commit_total_bytes = device->total_bytes;
2374
	device->fs_info = fs_info;
2375
	device->bdev = bdev;
2376
	device->in_fs_metadata = 1;
2377
	device->is_tgtdev_for_dev_replace = 0;
2378
	device->mode = FMODE_EXCL;
2379
	device->dev_stats_valid = 1;
2380
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2381

Y
Yan Zheng 已提交
2382
	if (seeding_dev) {
2383
		sb->s_flags &= ~SB_RDONLY;
2384
		ret = btrfs_prepare_sprout(fs_info);
2385 2386 2387 2388
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
Y
Yan Zheng 已提交
2389
	}
2390

2391
	device->fs_devices = fs_info->fs_devices;
2392

2393
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2394
	mutex_lock(&fs_info->chunk_mutex);
2395
	list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
Y
Yan Zheng 已提交
2396
	list_add(&device->dev_alloc_list,
2397 2398 2399 2400 2401 2402
		 &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	fs_info->fs_devices->rw_devices++;
	fs_info->fs_devices->total_devices++;
	fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2403

2404
	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2405

2406
	if (!blk_queue_nonrot(q))
2407
		fs_info->fs_devices->rotating = 1;
C
Chris Mason 已提交
2408

2409 2410
	tmp = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
2411
		round_down(tmp + device->total_bytes, fs_info->sectorsize));
2412

2413 2414
	tmp = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2415 2416

	/* add sysfs device entry */
2417
	btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2418

M
Miao Xie 已提交
2419 2420 2421 2422
	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
2423
	btrfs_clear_space_info_full(fs_info);
M
Miao Xie 已提交
2424

2425
	mutex_unlock(&fs_info->chunk_mutex);
2426
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2427

Y
Yan Zheng 已提交
2428
	if (seeding_dev) {
2429
		mutex_lock(&fs_info->chunk_mutex);
2430
		ret = init_first_rw_device(trans, fs_info);
2431
		mutex_unlock(&fs_info->chunk_mutex);
2432
		if (ret) {
2433
			btrfs_abort_transaction(trans, ret);
2434
			goto error_sysfs;
2435
		}
M
Miao Xie 已提交
2436 2437
	}

2438
	ret = btrfs_add_dev_item(trans, fs_info, device);
M
Miao Xie 已提交
2439
	if (ret) {
2440
		btrfs_abort_transaction(trans, ret);
2441
		goto error_sysfs;
M
Miao Xie 已提交
2442 2443 2444 2445 2446
	}

	if (seeding_dev) {
		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];

2447
		ret = btrfs_finish_sprout(trans, fs_info);
2448
		if (ret) {
2449
			btrfs_abort_transaction(trans, ret);
2450
			goto error_sysfs;
2451
		}
2452 2453 2454 2455 2456

		/* Sprouting would change fsid of the mounted root,
		 * so rename the fsid on the sysfs
		 */
		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2457 2458 2459 2460
						fs_info->fsid);
		if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
			btrfs_warn(fs_info,
				   "sysfs: failed to create fsid for sprout");
Y
Yan Zheng 已提交
2461 2462
	}

2463
	ret = btrfs_commit_transaction(trans);
2464

Y
Yan Zheng 已提交
2465 2466 2467
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
2468
		unlocked = true;
2469

2470 2471 2472
		if (ret) /* transaction commit */
			return ret;

2473
		ret = btrfs_relocate_sys_chunks(fs_info);
2474
		if (ret < 0)
2475
			btrfs_handle_fs_error(fs_info, ret,
J
Jeff Mahoney 已提交
2476
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2477 2478 2479 2480
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
2481 2482 2483
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
2484
		}
2485
		ret = btrfs_commit_transaction(trans);
Y
Yan Zheng 已提交
2486
	}
2487

2488 2489
	/* Update ctime/mtime for libblkid */
	update_dev_time(device_path);
Y
Yan Zheng 已提交
2490
	return ret;
2491

2492 2493
error_sysfs:
	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2494
error_trans:
2495
	if (seeding_dev)
2496
		sb->s_flags |= SB_RDONLY;
2497 2498
	if (trans)
		btrfs_end_transaction(trans);
2499
	rcu_string_free(device->name);
2500
	bio_put(device->flush_bio);
2501
	kfree(device);
Y
Yan Zheng 已提交
2502
error:
2503
	blkdev_put(bdev, FMODE_EXCL);
2504
	if (seeding_dev && !unlocked) {
Y
Yan Zheng 已提交
2505 2506 2507
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
2508
	return ret;
2509 2510
}

2511
int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2512
				  const char *device_path,
2513
				  struct btrfs_device *srcdev,
2514 2515 2516 2517 2518 2519 2520
				  struct btrfs_device **device_out)
{
	struct request_queue *q;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct rcu_string *name;
2521
	u64 devid = BTRFS_DEV_REPLACE_DEVID;
2522 2523 2524
	int ret = 0;

	*device_out = NULL;
2525 2526
	if (fs_info->fs_devices->seeding) {
		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2527
		return -EINVAL;
2528
	}
2529 2530 2531

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
2532 2533
	if (IS_ERR(bdev)) {
		btrfs_err(fs_info, "target device %s is invalid!", device_path);
2534
		return PTR_ERR(bdev);
2535
	}
2536 2537 2538 2539 2540 2541

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
J
Jeff Mahoney 已提交
2542 2543
			btrfs_err(fs_info,
				  "target device is in the filesystem!");
2544 2545 2546 2547 2548
			ret = -EEXIST;
			goto error;
		}
	}

2549

2550 2551
	if (i_size_read(bdev->bd_inode) <
	    btrfs_device_get_total_bytes(srcdev)) {
J
Jeff Mahoney 已提交
2552 2553
		btrfs_err(fs_info,
			  "target device is smaller than source device!");
2554 2555 2556 2557 2558
		ret = -EINVAL;
		goto error;
	}


2559 2560 2561
	device = btrfs_alloc_device(NULL, &devid, NULL);
	if (IS_ERR(device)) {
		ret = PTR_ERR(device);
2562 2563 2564
		goto error;
	}

2565
	name = rcu_string_strdup(device_path, GFP_KERNEL);
2566
	if (!name) {
2567
		bio_put(device->flush_bio);
2568 2569 2570 2571 2572 2573 2574 2575 2576
		kfree(device);
		ret = -ENOMEM;
		goto error;
	}
	rcu_assign_pointer(device->name, name);

	q = bdev_get_queue(bdev);
	if (blk_queue_discard(q))
		device->can_discard = 1;
2577
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2578 2579
	device->writeable = 1;
	device->generation = 0;
2580 2581 2582
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
2583 2584 2585
	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2586 2587
	ASSERT(list_empty(&srcdev->resized_list));
	device->commit_total_bytes = srcdev->commit_total_bytes;
2588
	device->commit_bytes_used = device->bytes_used;
2589
	device->fs_info = fs_info;
2590 2591 2592 2593
	device->bdev = bdev;
	device->in_fs_metadata = 1;
	device->is_tgtdev_for_dev_replace = 1;
	device->mode = FMODE_EXCL;
2594
	device->dev_stats_valid = 1;
2595
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2596 2597 2598 2599
	device->fs_devices = fs_info->fs_devices;
	list_add(&device->dev_list, &fs_info->fs_devices->devices);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
2600
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612

	*device_out = device;
	return ret;

error:
	blkdev_put(bdev, FMODE_EXCL);
	return ret;
}

void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
					      struct btrfs_device *tgtdev)
{
2613 2614
	u32 sectorsize = fs_info->sectorsize;

2615
	WARN_ON(fs_info->fs_devices->rw_devices == 0);
2616 2617 2618
	tgtdev->io_width = sectorsize;
	tgtdev->io_align = sectorsize;
	tgtdev->sector_size = sectorsize;
2619
	tgtdev->fs_info = fs_info;
2620 2621 2622
	tgtdev->in_fs_metadata = 1;
}

C
Chris Mason 已提交
2623 2624
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
2625 2626 2627
{
	int ret;
	struct btrfs_path *path;
2628
	struct btrfs_root *root = device->fs_info->chunk_root;
2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2658 2659 2660 2661
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
2662 2663 2664 2665 2666 2667 2668
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

M
Miao Xie 已提交
2669
/*
 * Grow @device to @new_size (rounded down to the sector size).
 *
 * Under chunk_mutex the super block total, the fs_devices read-write
 * total and the device's own total/disk_total sizes are raised, the space
 * info full flags are cleared and the device is queued on the
 * resized_devices list if it is not queued already.  The device item is
 * then updated on disk.
 *
 * Returns -EACCES if the device is not writeable, -EINVAL if the rounded
 * size does not exceed the current total_bytes or the device is a replace
 * target, otherwise the result of btrfs_update_device().
 */
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *sb = fs_info->super_copy;
	u64 sb_total;
	u64 grow_by;

	if (!device->writeable)
		return -EACCES;

	new_size = round_down(new_size, fs_info->sectorsize);

	mutex_lock(&fs_info->chunk_mutex);

	if (device->is_tgtdev_for_dev_replace ||
	    new_size <= device->total_bytes) {
		mutex_unlock(&fs_info->chunk_mutex);
		return -EINVAL;
	}

	sb_total = btrfs_super_total_bytes(sb);
	grow_by = round_down(new_size - device->total_bytes,
			     fs_info->sectorsize);

	btrfs_set_super_total_bytes(sb,
			round_down(sb_total + grow_by, fs_info->sectorsize));
	device->fs_devices->total_rw_bytes += grow_by;

	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
	btrfs_clear_space_info_full(fs_info);

	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_info->fs_devices->resized_devices);

	mutex_unlock(&fs_info->chunk_mutex);

	return btrfs_update_device(trans, device);
}

/*
 * Delete the chunk item at @chunk_offset from the chunk tree.
 *
 * A missing item or a failed deletion is reported through
 * btrfs_handle_fs_error().
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
 * the chunk item was not found, or the negative error from the tree
 * search or the item deletion.
 */
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	err = btrfs_search_slot(trans, chunk_root, &key, path, -1, 1);
	if (err == 0) {
		err = btrfs_del_item(trans, chunk_root, path);
		if (err < 0)
			btrfs_handle_fs_error(fs_info, err,
					      "Failed to delete chunk item.");
	} else if (err > 0) {
		/* Logic error or corruption */
		btrfs_handle_fs_error(fs_info, -ENOENT,
				      "Failed lookup while freeing chunk.");
		err = -ENOENT;
	}

	btrfs_free_path(path);
	return err;
}

2745
/*
 * Remove every entry for the chunk at @chunk_offset from the
 * sys_chunk_array kept in the in-memory super block copy.
 *
 * The array is a packed sequence of (disk key, chunk item) pairs.  It is
 * walked under chunk_mutex; a matching entry is cut out by moving the
 * rest of the array down and shrinking the recorded array size.
 *
 * Returns 0 on success or -EIO if an entry whose key type is not
 * BTRFS_CHUNK_ITEM_KEY is encountered.
 */
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_key key;
	u8 *entry;
	u32 remaining;
	u32 pos = 0;
	int err = 0;

	mutex_lock(&fs_info->chunk_mutex);
	remaining = btrfs_super_sys_array_size(sb);
	entry = sb->sys_chunk_array;

	while (pos < remaining) {
		struct btrfs_disk_key *dkey = (struct btrfs_disk_key *)entry;
		struct btrfs_chunk *chunk;
		u32 entry_len;

		btrfs_disk_key_to_cpu(&key, dkey);
		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
			err = -EIO;
			break;
		}

		/* The chunk item directly follows its disk key. */
		chunk = (struct btrfs_chunk *)(entry + sizeof(*dkey));
		entry_len = sizeof(*dkey);
		entry_len += btrfs_chunk_item_size(
				btrfs_stack_chunk_num_stripes(chunk));

		if (key.objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID ||
		    key.offset != chunk_offset) {
			/* Not ours: step over it. */
			entry += entry_len;
			pos += entry_len;
			continue;
		}

		/*
		 * Cut the entry out; do not advance, the next entry now
		 * sits at the current position.
		 */
		memmove(entry, entry + entry_len,
			remaining - (pos + entry_len));
		remaining -= entry_len;
		btrfs_set_super_sys_array_size(sb, remaining);
	}

	mutex_unlock(&fs_info->chunk_mutex);
	return err;
}

2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820
/*
 * Look up the chunk mapping for the range starting at @logical with
 * @length bytes in the filesystem's mapping tree.
 *
 * Returns the extent map with a reference held (callers are responsible
 * for dropping it), or ERR_PTR(-EINVAL) if nothing was found or the
 * mapping found does not satisfy start <= @logical <= start + len.
 */
static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
					u64 logical, u64 length)
{
	struct extent_map_tree *tree = &fs_info->mapping_tree.map_tree;
	struct extent_map *found;

	read_lock(&tree->lock);
	found = lookup_extent_mapping(tree, logical, length);
	read_unlock(&tree->lock);

	if (!found) {
		btrfs_crit(fs_info, "unable to find logical %llu length %llu",
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

	if (found->start <= logical &&
	    found->start + found->len >= logical)
		return found;

	btrfs_crit(fs_info,
		   "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
		   logical, length, found->start, found->start + found->len);
	free_extent_map(found);
	return ERR_PTR(-EINVAL);
}

2821
/*
 * Remove the chunk at @chunk_offset inside transaction @trans.
 *
 * For every stripe of the chunk the device extent is freed, the device's
 * bytes_used and the filesystem's free_chunk_space are adjusted, and the
 * device item is updated.  Then the chunk item is deleted, a SYSTEM chunk
 * is also dropped from the super block's sys_chunk_array, and finally the
 * block group is removed.
 *
 * Any failure after the chunk map lookup aborts the transaction.
 *
 * Returns 0 on success or a negative errno from the failing step.
 */
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, fs_info, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		/* dev_extent_len receives the length of the freed extent. */
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		/* Give the freed extent back to the space accounting. */
		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		/*
		 * NOTE(review): 'device' (the same pointer) has already been
		 * dereferenced above, so this NULL check looks redundant.
		 */
		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			if (ret) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	/* System chunks are additionally recorded in the super block. */
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}
Y
Yan Zheng 已提交
2909

2910
/*
 * Relocate all extents of the chunk at @chunk_offset and then remove the
 * chunk itself.
 *
 * The caller must hold fs_info->delete_unused_bgs_mutex.
 *
 * Returns 0 on success, -ENOSPC if btrfs_can_relocate() reports that the
 * chunk cannot be relocated, or the negative error of the failing step.
 */
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	int err;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));

	if (btrfs_can_relocate(fs_info, chunk_offset))
		return -ENOSPC;

	/* Step one: move every extent out of this chunk, with scrub paused. */
	btrfs_scrub_pause(fs_info);
	err = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (err)
		return err;

	trans = btrfs_start_trans_remove_block_group(chunk_root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		btrfs_handle_fs_error(chunk_root->fs_info, err, NULL);
		return err;
	}

	/* Step two: delete the device extents and the chunk tree entries. */
	err = btrfs_remove_chunk(trans, fs_info, chunk_offset);
	btrfs_end_transaction(trans);
	return err;
}

2958
/*
 * Relocate every SYSTEM chunk of the filesystem.
 *
 * The chunk tree is walked backwards from the highest chunk offset.  Each
 * chunk whose type has BTRFS_BLOCK_GROUP_SYSTEM set is passed to
 * btrfs_relocate_chunk().  Chunks that fail with -ENOSPC are counted and
 * the whole walk is repeated once; any other relocation error hits
 * BUG_ON().
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOSPC if
 * chunks still failed with -ENOSPC on the second pass, or the negative
 * error from the tree search.
 */
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	/* Start from the largest possible chunk offset and walk down. */
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/*
		 * Taken before the tree search and held across
		 * btrfs_relocate_chunk(), which asserts that it is locked.
		 */
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		/* Both exits below leave the loop, so drop the mutex first. */
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		/* The path is released before the chunk is relocated. */
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		/* Continue with the chunk below the one just handled. */
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	/* Give chunks that failed with -ENOSPC exactly one more pass. */
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

3032
static int insert_balance_item(struct btrfs_fs_info *fs_info,
3033 3034
			       struct btrfs_balance_control *bctl)
{
3035
	struct btrfs_root *root = fs_info->tree_root;
3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
3055
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3056 3057 3058 3059 3060 3061 3062 3063 3064 3065
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

3066
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
3080
	err = btrfs_commit_transaction(trans);
3081 3082 3083 3084 3085
	if (err && !ret)
		ret = err;
	return ret;
}

3086
static int del_balance_item(struct btrfs_fs_info *fs_info)
3087
{
3088
	struct btrfs_root *root = fs_info->tree_root;
3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
3105
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
3119
	err = btrfs_commit_transaction(trans);
3120 3121 3122 3123 3124
	if (err && !ret)
		ret = err;
	return ret;
}

I
Ilya Dryomov 已提交
3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148
/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3149
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
I
Ilya Dryomov 已提交
3150 3151 3152 3153 3154
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3155
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
I
Ilya Dryomov 已提交
3156 3157 3158 3159 3160
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3161
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
I
Ilya Dryomov 已提交
3162 3163 3164 3165 3166 3167
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196
/*
 * Install @bctl as the active balance control of its filesystem.
 *
 * Should be called with both balance and volume mutexes held to
 * serialize other volume operations (add_dev/rm_dev/resize) with
 * restriper.  Same goes for unset_balance_control.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
	struct btrfs_fs_info *info = bctl->fs_info;

	/* Installing over an existing balance control is a fatal error. */
	BUG_ON(info->balance_ctl);

	/* The pointer itself is published under balance_lock. */
	spin_lock(&info->balance_lock);
	info->balance_ctl = bctl;
	spin_unlock(&info->balance_lock);
}

/*
 * Detach the active balance control from @fs_info and free it.
 * It is a fatal error to call this when no balance control is installed.
 */
static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *old = fs_info->balance_ctl;

	BUG_ON(!old);

	/* Clear the published pointer under balance_lock ... */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	/* ... and only then release the structure it pointed to. */
	kfree(old);
}

I
Ilya Dryomov 已提交
3197 3198 3199 3200
/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
3201
static int chunk_profiles_filter(u64 chunk_type,
I
Ilya Dryomov 已提交
3202 3203
				 struct btrfs_balance_args *bargs)
{
3204 3205
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;
I
Ilya Dryomov 已提交
3206

3207
	if (bargs->profiles & chunk_type)
I
Ilya Dryomov 已提交
3208 3209 3210 3211 3212
		return 0;

	return 1;
}

3213
/*
 * Usage range filter: keep the chunk (return 0) when the used bytes of its
 * block group fall in [usage_min, usage_max) of the block group size, with
 * both bounds given as percentages; otherwise filter it out (return 1).
 *
 * A usage_max of 0 selects only block groups with no used bytes (threshold
 * of one byte); a usage_max above 100 is treated as the full block group
 * size.
 */
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	/* NOTE(review): the lookup result is used unchecked - confirm it cannot be NULL here. */
	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
		user_thresh_min = div_factor_fine(cache->key.offset,
					bargs->usage_min);

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
		user_thresh_max = cache->key.offset;
	else
		user_thresh_max = div_factor_fine(cache->key.offset,
					bargs->usage_max);

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	/* Drop the reference taken by the lookup. */
	btrfs_put_block_group(cache);
	return ret;
}

3246
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3247
		u64 chunk_offset, struct btrfs_balance_args *bargs)
I
Ilya Dryomov 已提交
3248 3249 3250 3251 3252 3253 3254 3255
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

3256
	if (bargs->usage_min == 0)
3257
		user_thresh = 1;
3258 3259 3260 3261 3262 3263
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;
	else
		user_thresh = div_factor_fine(cache->key.offset,
					      bargs->usage);

I
Ilya Dryomov 已提交
3264 3265 3266 3267 3268 3269 3270
	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

I
Ilya Dryomov 已提交
3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287
/*
 * Devid filter: keep the chunk (return 0) if any of its stripes is on the
 * device whose id is @bargs->devid; otherwise filter it out (return 1).
 */
static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	int nr_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int idx = 0;

	while (idx < nr_stripes) {
		struct btrfs_stripe *cur = btrfs_stripe_nr(chunk, idx);

		if (btrfs_stripe_devid(leaf, cur) == bargs->devid)
			return 0;
		idx++;
	}

	/* No stripe of this chunk is on the requested device. */
	return 1;
}

I
Ilya Dryomov 已提交
3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
D
David Woodhouse 已提交
3304 3305 3306 3307 3308 3309 3310 3311 3312
	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
		factor = num_stripes / 2;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
		factor = num_stripes - 1;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
		factor = num_stripes - 2;
	} else {
		factor = num_stripes;
	}
I
Ilya Dryomov 已提交
3313 3314 3315 3316 3317 3318 3319 3320

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
3321
		stripe_length = div_u64(stripe_length, factor);
I
Ilya Dryomov 已提交
3322 3323 3324 3325 3326 3327 3328 3329 3330

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344
/*
 * Virtual (logical) range filter, [vstart, vend).
 *
 * Keep the chunk (return 0) when at least part of it lies inside the range;
 * otherwise filter it out (return 1).
 */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	u64 chunk_end;

	/* The chunk starts at or beyond the end of the range. */
	if (chunk_offset >= bargs->vend)
		return 1;

	chunk_end = chunk_offset + btrfs_chunk_length(leaf, chunk);

	/* The chunk ends at or before the start of the range. */
	if (chunk_end <= bargs->vstart)
		return 1;

	return 0;
}

3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357
/*
 * Stripes filter: keep the chunk (return 0) when its number of stripes is
 * within [stripes_min, stripes_max], both ends inclusive; otherwise filter
 * it out (return 1).
 */
static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int nr_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (nr_stripes < bargs->stripes_min)
		return 1;
	if (nr_stripes > bargs->stripes_max)
		return 1;

	return 0;
}

3358
/*
 * Soft convert filter: filter the chunk out (return 1) when a conversion is
 * requested and the chunk's extended profile already equals the conversion
 * target.  Returns 0 otherwise, including when no conversion is requested.
 */
static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	/* Reduce the chunk type to its extended profile bits only. */
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

3373
/*
 * Decide whether a chunk should be balanced.
 *
 * The chunk type selects the data, system or metadata arguments of the
 * active balance control, and every filter enabled in those arguments is
 * applied in turn.
 *
 * Returns 1 if the chunk passed all enabled filters, 0 if any of them
 * rejected it.
 *
 * Side effect: when the limit (or limit range) filter is enabled and the
 * chunk is accepted, the remaining count in the arguments (->limit or
 * ->limit_max) is decremented.
 */
static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	/*
	 * Pick the argument set matching the chunk type.
	 * NOTE(review): bargs stays NULL if none of the three type bits is
	 * set; this relies on the type filter above having rejected such
	 * chunks - confirm BTRFS_BLOCK_GROUP_TYPE_MASK covers exactly these.
	 */
	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter; the range variant is only tried if the single-value one did not reject */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

3462
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3463
{
3464
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3465 3466 3467
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct list_head *devices;
3468 3469 3470
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
3471
	u64 chunk_type;
3472
	struct btrfs_chunk *chunk;
3473
	struct btrfs_path *path = NULL;
3474 3475
	struct btrfs_key key;
	struct btrfs_key found_key;
3476
	struct btrfs_trans_handle *trans;
3477 3478
	struct extent_buffer *leaf;
	int slot;
3479 3480
	int ret;
	int enospc_errors = 0;
3481
	bool counting = true;
3482
	/* The single value limit and min/max limits use the same bytes in the */
3483 3484 3485
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
3486 3487 3488
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
3489
	int chunk_reserved = 0;
3490
	u64 bytes_used = 0;
3491 3492

	/* step one make some room on all the devices */
3493
	devices = &fs_info->fs_devices->devices;
Q
Qinghuang Feng 已提交
3494
	list_for_each_entry(device, devices, dev_list) {
3495
		old_size = btrfs_device_get_total_bytes(device);
3496
		size_to_free = div_factor(old_size, 1);
3497
		size_to_free = min_t(u64, size_to_free, SZ_1M);
Y
Yan Zheng 已提交
3498
		if (!device->writeable ||
3499 3500
		    btrfs_device_get_total_bytes(device) -
		    btrfs_device_get_bytes_used(device) > size_to_free ||
3501
		    device->is_tgtdev_for_dev_replace)
3502 3503 3504
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
3505 3506
		if (ret == -ENOSPC)
			break;
3507 3508 3509 3510 3511
		if (ret) {
			/* btrfs_shrink_device never returns ret > 0 */
			WARN_ON(ret > 0);
			goto error;
		}
3512

3513
		trans = btrfs_start_transaction(dev_root, 0);
3514 3515 3516 3517 3518 3519 3520 3521
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}
3522 3523

		ret = btrfs_grow_device(trans, device, old_size);
3524
		if (ret) {
3525
			btrfs_end_transaction(trans);
3526 3527 3528 3529 3530 3531 3532 3533
			/* btrfs_grow_device never returns ret > 0 */
			WARN_ON(ret > 0);
			btrfs_info_in_rcu(fs_info,
		 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
					  rcu_str_deref(device->name), ret,
					  old_size, old_size - size_to_free);
			goto error;
		}
3534

3535
		btrfs_end_transaction(trans);
3536 3537 3538 3539
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
3540 3541 3542 3543
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
3544 3545 3546 3547 3548 3549

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
3550
	if (!counting) {
3551 3552 3553 3554
		/*
		 * The single value limit and min/max limits use the same bytes
		 * in the
		 */
3555 3556 3557 3558
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
3559 3560 3561 3562
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

C
Chris Mason 已提交
3563
	while (1) {
3564
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3565
		    atomic_read(&fs_info->balance_cancel_req)) {
3566 3567 3568 3569
			ret = -ECANCELED;
			goto error;
		}

3570
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
3571
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3572 3573
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3574
			goto error;
3575
		}
3576 3577 3578 3579 3580 3581

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
3582
			BUG(); /* FIXME break ? */
3583 3584 3585

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
3586
		if (ret) {
3587
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3588
			ret = 0;
3589
			break;
3590
		}
3591

3592 3593 3594
		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3595

3596 3597
		if (found_key.objectid != key.objectid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3598
			break;
3599
		}
3600

3601
		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3602
		chunk_type = btrfs_chunk_type(leaf, chunk);
3603

3604 3605 3606 3607 3608 3609
		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

3610
		ret = should_balance_chunk(fs_info, leaf, chunk,
3611
					   found_key.offset);
3612

3613
		btrfs_release_path(path);
3614 3615
		if (!ret) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3616
			goto loop;
3617
		}
3618

3619
		if (counting) {
3620
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3621 3622 3623
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);
3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3646 3647 3648
			goto loop;
		}

3649 3650 3651 3652 3653 3654 3655
		ASSERT(fs_info->data_sinfo);
		spin_lock(&fs_info->data_sinfo->lock);
		bytes_used = fs_info->data_sinfo->bytes_used;
		spin_unlock(&fs_info->data_sinfo->lock);

		if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
		    !chunk_reserved && !bytes_used) {
3656 3657 3658 3659 3660 3661 3662
			trans = btrfs_start_transaction(chunk_root, 0);
			if (IS_ERR(trans)) {
				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				ret = PTR_ERR(trans);
				goto error;
			}

3663
			ret = btrfs_force_chunk_alloc(trans, fs_info,
3664
						      BTRFS_BLOCK_GROUP_DATA);
3665
			btrfs_end_transaction(trans);
3666 3667 3668 3669 3670 3671 3672
			if (ret < 0) {
				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				goto error;
			}
			chunk_reserved = 1;
		}

3673
		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3674
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3675 3676
		if (ret && ret != -ENOSPC)
			goto error;
3677
		if (ret == -ENOSPC) {
3678
			enospc_errors++;
3679 3680 3681 3682 3683
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
3684
loop:
3685 3686
		if (found_key.offset == 0)
			break;
3687
		key.offset = found_key.offset - 1;
3688
	}
3689

3690 3691 3692 3693 3694
	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
3695 3696
error:
	btrfs_free_path(path);
3697
	if (enospc_errors) {
3698
		btrfs_info(fs_info, "%d enospc errors during balance",
J
Jeff Mahoney 已提交
3699
			   enospc_errors);
3700 3701 3702 3703
		if (!ret)
			ret = -ENOSPC;
	}

3704 3705 3706
	return ret;
}

3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730
/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 *
 * The block group type bits of @flags are ignored.  Returns 1 when the
 * remaining bits lie within the applicable profile mask and at most one of
 * them is set (no bit at all is accepted only for non-extended profiles),
 * 0 otherwise.
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 profile = flags & ~BTRFS_BLOCK_GROUP_TYPE_MASK;
	u64 permitted;

	if (extended)
		permitted = BTRFS_EXTENDED_PROFILE_MASK;
	else
		permitted = BTRFS_BLOCK_GROUP_PROFILE_MASK;

	/* 1) every bit outside the mask has to be clear */
	if (profile & ~permitted)
		return 0;

	/* 2) "0" is a valid reduced profile only in the usual encoding */
	if (!profile)
		return !extended;

	/* 3) reduced means exactly one bit set */
	return !(profile & (profile - 1));
}

3731 3732
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
3733 3734 3735 3736
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
3737 3738
}

3739 3740
static void __cancel_balance(struct btrfs_fs_info *fs_info)
{
3741 3742
	int ret;

3743
	unset_balance_control(fs_info);
3744
	ret = del_balance_item(fs_info);
3745
	if (ret)
3746
		btrfs_handle_fs_error(fs_info, ret, NULL);
3747

3748
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
3749 3750
}

3751 3752 3753 3754 3755 3756 3757 3758 3759
/*
 * Check the conversion target of one set of balance arguments.
 *
 * Non-zero return value signifies invalidity: a conversion is requested and
 * the target is either not a valid extended profile or contains bits that
 * are not in @allowed.  Arguments without a conversion are always valid.
 */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
		u64 allowed)
{
	if (!(bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	if (!alloc_profile_is_valid(bctl_arg->target, 1))
		return 1;

	return (bctl_arg->target & ~allowed) ? 1 : 0;
}

3760 3761 3762 3763 3764 3765 3766
/*
 * Should be called with both balance and volume mutexes held
 */
int btrfs_balance(struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;
3767
	u64 meta_target, data_target;
3768
	u64 allowed;
3769
	int mixed = 0;
3770
	int ret;
3771
	u64 num_devices;
3772
	unsigned seq;
3773

3774
	if (btrfs_fs_closing(fs_info) ||
3775 3776
	    atomic_read(&fs_info->balance_pause_req) ||
	    atomic_read(&fs_info->balance_cancel_req)) {
3777 3778 3779 3780
		ret = -EINVAL;
		goto out;
	}

3781 3782 3783 3784
	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

3785 3786 3787 3788
	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
3789 3790
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
3791 3792 3793
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
J
Jeff Mahoney 已提交
3794 3795
			btrfs_err(fs_info,
				  "with mixed groups data and metadata balance options must be the same");
3796 3797 3798 3799 3800
			ret = -EINVAL;
			goto out;
		}
	}

3801
	num_devices = fs_info->fs_devices->num_devices;
3802
	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3803 3804 3805 3806
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		BUG_ON(num_devices < 1);
		num_devices--;
	}
3807
	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3808 3809
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
	if (num_devices > 1)
3810
		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3811 3812 3813 3814 3815
	if (num_devices > 2)
		allowed |= BTRFS_BLOCK_GROUP_RAID5;
	if (num_devices > 3)
		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
			    BTRFS_BLOCK_GROUP_RAID6);
3816
	if (validate_convert_profile(&bctl->data, allowed)) {
J
Jeff Mahoney 已提交
3817 3818 3819
		btrfs_err(fs_info,
			  "unable to start balance with target data profile %llu",
			  bctl->data.target);
3820 3821 3822
		ret = -EINVAL;
		goto out;
	}
3823
	if (validate_convert_profile(&bctl->meta, allowed)) {
3824
		btrfs_err(fs_info,
J
Jeff Mahoney 已提交
3825 3826
			  "unable to start balance with target metadata profile %llu",
			  bctl->meta.target);
3827 3828 3829
		ret = -EINVAL;
		goto out;
	}
3830
	if (validate_convert_profile(&bctl->sys, allowed)) {
3831
		btrfs_err(fs_info,
J
Jeff Mahoney 已提交
3832 3833
			  "unable to start balance with target system profile %llu",
			  bctl->sys.target);
3834 3835 3836 3837 3838 3839
		ret = -EINVAL;
		goto out;
	}

	/* allow to reduce meta or sys integrity only if force set */
	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
D
David Woodhouse 已提交
3840 3841 3842
			BTRFS_BLOCK_GROUP_RAID10 |
			BTRFS_BLOCK_GROUP_RAID5 |
			BTRFS_BLOCK_GROUP_RAID6;
3843 3844 3845 3846 3847 3848 3849 3850 3851 3852
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed))) {
			if (bctl->flags & BTRFS_BALANCE_FORCE) {
J
Jeff Mahoney 已提交
3853 3854
				btrfs_info(fs_info,
					   "force reducing metadata integrity");
3855
			} else {
J
Jeff Mahoney 已提交
3856 3857
				btrfs_err(fs_info,
					  "balance will reduce metadata integrity, use force if you want this");
3858 3859 3860
				ret = -EINVAL;
				goto out;
			}
3861
		}
3862
	} while (read_seqretry(&fs_info->profiles_lock, seq));
3863

3864 3865 3866 3867 3868 3869 3870
	/* if we're not converting, the target field is uninitialized */
	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->meta.target : fs_info->avail_metadata_alloc_bits;
	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
		bctl->data.target : fs_info->avail_data_alloc_bits;
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
3871
		btrfs_warn(fs_info,
J
Jeff Mahoney 已提交
3872
			   "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
3873
			   meta_target, data_target);
3874 3875
	}

3876
	ret = insert_balance_item(fs_info, bctl);
I
Ilya Dryomov 已提交
3877
	if (ret && ret != -EEXIST)
3878 3879
		goto out;

I
Ilya Dryomov 已提交
3880 3881 3882 3883 3884 3885 3886 3887 3888
	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		set_balance_control(bctl);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}
3889

3890
	atomic_inc(&fs_info->balance_running);
3891 3892 3893 3894 3895
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
3896
	atomic_dec(&fs_info->balance_running);
3897 3898 3899

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
3900
		update_ioctl_balance_args(fs_info, 0, bargs);
3901 3902
	}

3903 3904 3905 3906 3907
	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		__cancel_balance(fs_info);
	}

3908
	wake_up(&fs_info->balance_wait_q);
3909 3910 3911

	return ret;
out:
I
Ilya Dryomov 已提交
3912 3913
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		__cancel_balance(fs_info);
3914
	else {
I
Ilya Dryomov 已提交
3915
		kfree(bctl);
3916
		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
3917
	}
I
Ilya Dryomov 已提交
3918 3919 3920 3921 3922
	return ret;
}

static int balance_kthread(void *data)
{
3923
	struct btrfs_fs_info *fs_info = data;
3924
	int ret = 0;
I
Ilya Dryomov 已提交
3925 3926 3927 3928

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

3929
	if (fs_info->balance_ctl) {
3930
		btrfs_info(fs_info, "continuing balance");
3931
		ret = btrfs_balance(fs_info->balance_ctl, NULL);
3932
	}
I
Ilya Dryomov 已提交
3933 3934 3935

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);
3936

I
Ilya Dryomov 已提交
3937 3938 3939
	return ret;
}

3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	spin_lock(&fs_info->balance_lock);
	if (!fs_info->balance_ctl) {
		spin_unlock(&fs_info->balance_lock);
		return 0;
	}
	spin_unlock(&fs_info->balance_lock);

3951
	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
3952
		btrfs_info(fs_info, "force skipping balance");
3953 3954 3955 3956
		return 0;
	}

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3957
	return PTR_ERR_OR_ZERO(tsk);
3958 3959
}

3960
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
I
Ilya Dryomov 已提交
3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
3975
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
I
Ilya Dryomov 已提交
3976 3977
	key.offset = 0;

3978
	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
I
Ilya Dryomov 已提交
3979
	if (ret < 0)
3980
		goto out;
I
Ilya Dryomov 已提交
3981 3982
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
3983 3984 3985 3986 3987 3988 3989
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
I
Ilya Dryomov 已提交
3990 3991 3992 3993 3994
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

3995 3996 3997
	bctl->fs_info = fs_info;
	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;
I
Ilya Dryomov 已提交
3998 3999 4000 4001 4002 4003 4004 4005

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

4006
	WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
4007

4008 4009
	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);
I
Ilya Dryomov 已提交
4010

4011 4012 4013 4014
	set_balance_control(bctl);

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);
I
Ilya Dryomov 已提交
4015 4016
out:
	btrfs_free_path(path);
4017 4018 4019
	return ret;
}

4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048
/*
 * Ask a running balance to pause and wait until it has stopped.
 *
 * Returns 0 once the balance is no longer running, or -ENOTCONN when there
 * is no balance control attached or the balance is not currently running.
 */
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int err = -ENOTCONN;

	mutex_lock(&fs_info->balance_mutex);

	/* Nothing to pause: no balance set up, or it is not running. */
	if (!fs_info->balance_ctl ||
	    !atomic_read(&fs_info->balance_running))
		goto unlock;

	/*
	 * Post the pause request, then drop the mutex while sleeping until
	 * balance_running reads zero.
	 */
	atomic_inc(&fs_info->balance_pause_req);
	mutex_unlock(&fs_info->balance_mutex);

	wait_event(fs_info->balance_wait_q,
		   atomic_read(&fs_info->balance_running) == 0);

	mutex_lock(&fs_info->balance_mutex);
	/* we are good with balance_ctl ripped off from under us */
	BUG_ON(atomic_read(&fs_info->balance_running));
	atomic_dec(&fs_info->balance_pause_req);
	err = 0;

unlock:
	mutex_unlock(&fs_info->balance_mutex);
	return err;
}

4049 4050
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
4051
	if (sb_rdonly(fs_info->sb))
4052 4053
		return -EROFS;

4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (atomic_read(&fs_info->balance_running)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);
		mutex_lock(&fs_info->balance_mutex);
	} else {
		/* __cancel_balance needs volume_mutex */
		mutex_unlock(&fs_info->balance_mutex);
		mutex_lock(&fs_info->volume_mutex);
		mutex_lock(&fs_info->balance_mutex);

		if (fs_info->balance_ctl)
			__cancel_balance(fs_info);

		mutex_unlock(&fs_info->volume_mutex);
	}

	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

S
Stefan Behrens 已提交
4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098
static int btrfs_uuid_scan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	int ret = 0;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_root_item root_item;
	u32 item_size;
4099
	struct btrfs_trans_handle *trans = NULL;
S
Stefan Behrens 已提交
4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;

	while (1) {
4112
		ret = btrfs_search_forward(root, &key, path, 0);
S
Stefan Behrens 已提交
4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}

		if (key.type != BTRFS_ROOT_ITEM_KEY ||
		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
			goto skip;

		eb = path->nodes[0];
		slot = path->slots[0];
		item_size = btrfs_item_size_nr(eb, slot);
		if (item_size < sizeof(root_item))
			goto skip;

		read_extent_buffer(eb, &root_item,
				   btrfs_item_ptr_offset(eb, slot),
				   (int)sizeof(root_item));
		if (btrfs_root_refs(&root_item) == 0)
			goto skip;
4136 4137 4138 4139 4140 4141 4142

		if (!btrfs_is_empty_uuid(root_item.uuid) ||
		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
			if (trans)
				goto update_tree;

			btrfs_release_path(path);
S
Stefan Behrens 已提交
4143 4144 4145 4146 4147 4148 4149 4150 4151
			/*
			 * 1 - subvol uuid item
			 * 1 - received_subvol uuid item
			 */
			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
4152 4153 4154 4155 4156 4157
			continue;
		} else {
			goto skip;
		}
update_tree:
		if (!btrfs_is_empty_uuid(root_item.uuid)) {
4158
			ret = btrfs_uuid_tree_add(trans, fs_info,
S
Stefan Behrens 已提交
4159 4160 4161 4162
						  root_item.uuid,
						  BTRFS_UUID_KEY_SUBVOL,
						  key.objectid);
			if (ret < 0) {
4163
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
S
Stefan Behrens 已提交
4164 4165 4166 4167 4168 4169
					ret);
				break;
			}
		}

		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4170
			ret = btrfs_uuid_tree_add(trans, fs_info,
S
Stefan Behrens 已提交
4171 4172 4173 4174
						  root_item.received_uuid,
						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
						  key.objectid);
			if (ret < 0) {
4175
				btrfs_warn(fs_info, "uuid_tree_add failed %d",
S
Stefan Behrens 已提交
4176 4177 4178 4179 4180
					ret);
				break;
			}
		}

4181
skip:
S
Stefan Behrens 已提交
4182
		if (trans) {
4183
			ret = btrfs_end_transaction(trans);
4184
			trans = NULL;
S
Stefan Behrens 已提交
4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206
			if (ret)
				break;
		}

		btrfs_release_path(path);
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = BTRFS_ROOT_ITEM_KEY;
			key.objectid++;
		} else {
			break;
		}
		cond_resched();
	}

out:
	btrfs_free_path(path);
4207
	if (trans && !IS_ERR(trans))
4208
		btrfs_end_transaction(trans);
S
Stefan Behrens 已提交
4209
	if (ret)
4210
		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4211
	else
4212
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
S
Stefan Behrens 已提交
4213 4214 4215 4216
	up(&fs_info->uuid_tree_rescan_sem);
	return 0;
}

4217 4218 4219 4220
/*
 * Callback for btrfs_uuid_tree_iterate().
 *
 * Entries whose type is neither BTRFS_UUID_KEY_SUBVOL nor
 * BTRFS_UUID_KEY_RECEIVED_SUBVOL are accepted without any check.
 *
 * returns:
 * 0	check succeeded, the entry is not outdated.
 * < 0	if an error occurred.
 * > 0	if the check failed, which means the caller shall remove the entry.
 */
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				       u8 *uuid, u8 type, u64 subid)
{
	struct btrfs_root *subvol_root;
	struct btrfs_key key;
	const u8 *stored;

	if (type != BTRFS_UUID_KEY_SUBVOL &&
	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
		return 0;

	key.objectid = subid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(subvol_root)) {
		int err = PTR_ERR(subvol_root);

		/* A root that no longer exists means the entry is stale. */
		return err == -ENOENT ? 1 : err;
	}

	/* type is one of the two subvolume kinds here (checked above). */
	if (type == BTRFS_UUID_KEY_SUBVOL)
		stored = subvol_root->root_item.uuid;
	else
		stored = subvol_root->root_item.received_uuid;

	return memcmp(uuid, stored, BTRFS_UUID_SIZE) ? 1 : 0;
}

static int btrfs_uuid_rescan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
	int ret;

	/*
	 * 1st step is to iterate through the existing UUID tree and
	 * to delete all entries that contain outdated data.
	 * 2nd step is to add all missing entries to the UUID tree.
	 */
	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
	if (ret < 0) {
4274
		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
4275 4276 4277 4278 4279 4280
		up(&fs_info->uuid_tree_rescan_sem);
		return ret;
	}
	return btrfs_uuid_scan_kthread(data);
}

4281 4282 4283 4284 4285
/*
 * Create the UUID tree, commit it, and start the "btrfs-uuid" kthread that
 * populates it.
 *
 * Returns 0 on success or a negative errno from transaction start, tree
 * creation, transaction commit, or kthread creation.
 */
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *new_root;
	struct btrfs_trans_handle *trans;
	struct task_struct *scanner;
	int ret;

	/*
	 * 1 - root node
	 * 1 - root item
	 */
	trans = btrfs_start_transaction(tree_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	new_root = btrfs_create_tree(trans, fs_info,
				     BTRFS_UUID_TREE_OBJECTID);
	if (IS_ERR(new_root)) {
		ret = PTR_ERR(new_root);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	fs_info->uuid_root = new_root;

	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	/* Taken here; the scan kthread releases it when it finishes. */
	down(&fs_info->uuid_tree_rescan_sem);
	scanner = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
	if (!IS_ERR(scanner))
		return 0;

	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
	btrfs_warn(fs_info, "failed to start uuid_scan task");
	up(&fs_info->uuid_tree_rescan_sem);
	return PTR_ERR(scanner);
}
S
Stefan Behrens 已提交
4323

4324 4325 4326 4327 4328 4329 4330 4331
/*
 * Start the "btrfs-uuid" kthread that verifies and repopulates the UUID
 * tree (btrfs_uuid_rescan_kthread).
 *
 * Returns 0 if the thread was started, or the kthread_run() error.
 */
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct task_struct *rescanner;

	/* Taken here; the rescan kthread releases it when it finishes. */
	down(&fs_info->uuid_tree_rescan_sem);
	rescanner = kthread_run(btrfs_uuid_rescan_kthread, fs_info,
				"btrfs-uuid");
	if (!IS_ERR(rescanner))
		return 0;

	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
	btrfs_warn(fs_info, "failed to start uuid_rescan task");
	up(&fs_info->uuid_tree_rescan_sem);
	return PTR_ERR(rescanner);
}

4340 4341 4342 4343 4344 4345 4346
/*
 * Shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent.
 *
 * @device:   device to shrink
 * @new_size: requested size in bytes; rounded down to the sector size
 *
 * The in-memory total_bytes (and, for writeable devices, total_rw_bytes and
 * free_chunk_space) are lowered up front so that no new allocations land in
 * the range being removed.  If anything fails afterwards, exactly those
 * adjustments are undone at "done".
 *
 * Returns 0 on success, -EINVAL for a dev-replace target device, -ENOMEM,
 * -ENOSPC if chunks could not be relocated even after one retry, or another
 * negative errno from the tree / transaction helpers.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	bool checked_pending_chunks = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;

	new_size = round_down(new_size, fs_info->sectorsize);
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	if (device->is_tgtdev_for_dev_replace)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;

	mutex_lock(&fs_info->chunk_mutex);

	btrfs_device_set_total_bytes(device, new_size);
	if (device->writeable) {
		device->fs_devices->total_rw_bytes -= diff;
		atomic64_sub(diff, &fs_info->free_chunk_space);
	}
	mutex_unlock(&fs_info->chunk_mutex);

again:
	/* Walk this device's extents backwards, starting from the highest. */
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto done;
		if (ret) {
			/* No earlier item of this type: nothing left to move. */
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		/* This extent ends inside the new size: we are finished. */
		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret && ret != -ENOSPC)
			goto done;
		/* Remember ENOSPC failures; one more full pass is attempted. */
		if (ret == -ENOSPC)
			failed++;
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * We checked in the above loop all device extents that were already in
	 * the device tree. However before we have updated the device's
	 * total_bytes to the new size, we might have had chunk allocations that
	 * have not completed yet (new block groups attached to transaction
	 * handles), and therefore their device extents were not yet in the
	 * device tree and we missed them in the loop above. So if we have any
	 * pending chunk using a device extent that overlaps the device range
	 * that we can not use anymore, commit the current transaction and
	 * repeat the search on the device tree - this way we guarantee we will
	 * not have chunks using device extents that end beyond 'new_size'.
	 */
	if (!checked_pending_chunks) {
		u64 start = new_size;
		u64 len = old_size - new_size;

		if (contains_pending_extent(trans->transaction, device,
					    &start, len)) {
			mutex_unlock(&fs_info->chunk_mutex);
			checked_pending_chunks = true;
			failed = 0;
			retried = false;
			ret = btrfs_commit_transaction(trans);
			if (ret)
				goto done;
			goto again;
		}
	}

	btrfs_device_set_disk_total_bytes(device, new_size);
	if (list_empty(&device->resized_list))
		list_add_tail(&device->resized_list,
			      &fs_info->fs_devices->resized_devices);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_end_transaction(trans);
done:
	btrfs_free_path(path);
	if (ret) {
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		/*
		 * Undo exactly what was done before the relocation loop:
		 * both counters were only lowered for a writeable device, so
		 * both are only restored for a writeable device.
		 */
		if (device->writeable) {
			device->fs_devices->total_rw_bytes += diff;
			atomic64_add(diff, &fs_info->free_chunk_space);
		}
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

4514
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4515 4516 4517
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
4518
	struct btrfs_super_block *super_copy = fs_info->super_copy;
4519 4520 4521 4522
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

4523
	mutex_lock(&fs_info->chunk_mutex);
4524
	array_size = btrfs_super_sys_array_size(super_copy);
4525
	if (array_size + item_size + sizeof(disk_key)
4526
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4527
		mutex_unlock(&fs_info->chunk_mutex);
4528
		return -EFBIG;
4529
	}
4530 4531 4532 4533 4534 4535 4536 4537

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4538
	mutex_unlock(&fs_info->chunk_mutex);
4539

4540 4541 4542
	return 0;
}

4543 4544 4545 4546
/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
4547
{
4548 4549
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;
4550

4551
	if (di_a->max_avail > di_b->max_avail)
4552
		return -1;
4553
	if (di_a->max_avail < di_b->max_avail)
4554
		return 1;
4555 4556 4557 4558 4559
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
4560
}
4561

D
David Woodhouse 已提交
4562 4563
/*
 * Set the RAID56 incompat feature flag when @type carries any of the
 * BTRFS_BLOCK_GROUP_RAID56_MASK bits; otherwise do nothing.
 */
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		btrfs_set_fs_incompat(info, RAID56);
}

4570
/*
 * Upper bound on the number of stripes that fit in one chunk item of root
 * @r: the maximum item size minus the chunk header, divided by the size of
 * one stripe, plus 1.  The argument is parenthesized so that expressions
 * such as "&root" expand correctly.
 */
#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE((r)->fs_info)		\
			- sizeof(struct btrfs_chunk))		\
			/ sizeof(struct btrfs_stripe) + 1)

/*
 * The same bound for system chunks, computed from the superblock's system
 * chunk array size after subtracting two disk keys and two chunk headers.
 */
#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
				- 2 * sizeof(struct btrfs_disk_key)	\
				- 2 * sizeof(struct btrfs_chunk))	\
				/ sizeof(struct btrfs_stripe) + 1)

4579
/*
 * Build the in-memory side of a new chunk of profile @type at logical
 * address @start: choose devices and a stripe size, create the chunk's
 * extent map, queue it on the transaction's pending_chunks list, create the
 * block group and charge the used space to each device.
 *
 * Returns 0 on success, -ENOSPC when there are not enough usable devices,
 * -ENOMEM, or an error propagated from the helpers.
 */
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	const struct btrfs_raid_attr *attr;
	struct btrfs_device *device;
	struct btrfs_device_info *devices_info = NULL;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	u64 total_avail;
	u64 max_stripe_size;
	u64 max_chunk_size;
	u64 stripe_size;
	u64 num_bytes;
	int num_stripes;	/* total number of stripes to allocate */
	int data_stripes;	/* number of stripes that count for
				   block group size */
	int sub_stripes;	/* sub_stripes info for map */
	int dev_stripes;	/* stripes per dev */
	int devs_max;		/* max devs to use */
	int devs_min;		/* min devs needed */
	int devs_increment;	/* ndevs has to be a multiple of this */
	int ncopies;		/* how many copies to data has */
	int ndevs;
	int index;
	int ret;
	int i;
	int j;

	BUG_ON(!alloc_profile_is_valid(type, 0));

	if (list_empty(&fs_devices->alloc_list))
		return -ENOSPC;

	/* Pull the layout parameters for this profile from the raid table. */
	index = __get_raid_index(type);
	attr = &btrfs_raid_array[index];

	sub_stripes = attr->sub_stripes;
	dev_stripes = attr->dev_stripes;
	devs_max = attr->devs_max;
	devs_min = attr->devs_min;
	devs_increment = attr->devs_increment;
	ncopies = attr->ncopies;

	/* Size limits, and the device cap if the table left it as 0. */
	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_stripe_size = SZ_1G;
		max_chunk_size = 10 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* for larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			max_stripe_size = SZ_1G;
		else
			max_stripe_size = SZ_256M;
		max_chunk_size = max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		max_stripe_size = SZ_32M;
		max_chunk_size = 2 * max_stripe_size;
		if (!devs_max)
			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
	} else {
		btrfs_err(info, "invalid chunk type 0x%llx requested",
		       type);
		BUG_ON(1);
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return -ENOMEM;

	/*
	 * First pass over the alloc list: record, for every usable device,
	 * the free extent that was found and how much space is available.
	 */
	ndevs = 0;
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 max_avail;
		u64 dev_offset;

		if (!device->writeable) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!device->in_fs_metadata ||
		    device->is_tgtdev_for_dev_replace)
			continue;

		total_avail = device->total_bytes > device->bytes_used ?
			      device->total_bytes - device->bytes_used : 0;

		/* If there is no space on this device, skip it. */
		if (!total_avail)
			continue;

		ret = find_free_dev_extent(trans, device,
					   max_stripe_size * dev_stripes,
					   &dev_offset, &max_avail);
		if (ret && ret != -ENOSPC)
			goto error;

		/* The full request was satisfied: cap at the requested size. */
		if (ret == 0)
			max_avail = max_stripe_size * dev_stripes;

		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
			continue;

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}

		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	/* round down to number of usable stripes */
	ndevs = round_down(ndevs, devs_increment);

	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
		ret = -ENOSPC;
		goto error;
	}

	ndevs = min(ndevs, devs_max);

	/*
	 * the primary goal is to maximize the number of stripes, so use as many
	 * devices as possible, even if the stripes are not maximum sized.
	 */
	stripe_size = devices_info[ndevs - 1].max_avail;
	num_stripes = ndevs * dev_stripes;

	/*
	 * this will have to be fixed for RAID1 and RAID10 over
	 * more drives
	 */
	data_stripes = num_stripes / ncopies;

	if (type & BTRFS_BLOCK_GROUP_RAID5)
		data_stripes = num_stripes - 1;

	if (type & BTRFS_BLOCK_GROUP_RAID6)
		data_stripes = num_stripes - 2;

	/*
	 * Use the number of data stripes to figure out how big this chunk
	 * is really going to be in terms of logical address space,
	 * and compare that answer with the max chunk size
	 */
	if (stripe_size * data_stripes > max_chunk_size) {
		u64 mask = (1ULL << 24) - 1;

		stripe_size = div_u64(max_chunk_size, data_stripes);

		/* bump the answer up to a 16MB boundary */
		stripe_size = (stripe_size + mask) & ~mask;

		/*
		 * but don't go higher than the limits we found
		 * while searching for free extents
		 */
		if (stripe_size > devices_info[ndevs - 1].max_avail)
			stripe_size = devices_info[ndevs - 1].max_avail;
	}

	stripe_size = div_u64(stripe_size, dev_stripes);

	/* align to BTRFS_STRIPE_LEN */
	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		ret = -ENOMEM;
		goto error;
	}
	map->num_stripes = num_stripes;

	/* Lay out dev_stripes consecutive stripes on each chosen device. */
	for (i = 0; i < ndevs; ++i) {
		for (j = 0; j < dev_stripes; ++j) {
			int s = i * dev_stripes + j;

			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * stripe_size;
		}
	}
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->type = type;
	map->sub_stripes = sub_stripes;

	num_bytes = stripe_size * data_stripes;

	trace_btrfs_chunk_alloc(info, map, start, num_bytes);

	em = alloc_extent_map();
	if (!em) {
		kfree(map);
		ret = -ENOMEM;
		goto error;
	}
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	em->map_lookup = map;
	em->start = start;
	em->len = num_bytes;
	em->block_start = 0;
	em->block_len = em->len;
	/* orig_block_len carries the per-device stripe size. */
	em->orig_block_len = stripe_size;

	em_tree = &info->mapping_tree.map_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret) {
		write_unlock(&em_tree->lock);
		free_extent_map(em);
		goto error;
	}

	/* The pending_chunks list takes its own reference on the map. */
	list_add_tail(&em->list, &trans->transaction->pending_chunks);
	refcount_inc(&em->refs);
	write_unlock(&em_tree->lock);

	ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
	if (ret)
		goto error_del_extent;

	/* Charge one stripe's worth of space to every stripe's device. */
	for (i = 0; i < map->num_stripes; i++) {
		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
	}

	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);

	free_extent_map(em);
	check_raid56_incompat_flag(info, type);

	kfree(devices_info);
	return 0;

error_del_extent:
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);
	/* One for the pending_chunks list reference */
	free_extent_map(em);
error:
	kfree(devices_info);
	return ret;
}

4854
/*
 * Second half of chunk allocation: for the chunk mapped at @chunk_offset,
 * update each stripe's device item, insert its device extent, then insert
 * the chunk item into the chunk tree.  System chunks are additionally added
 * to the superblock's system chunk array.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info,
				u64 chunk_offset, u64 chunk_size)
{
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct btrfs_device *device;
	struct btrfs_key key;
	size_t item_size;
	u64 dev_offset;
	u64 stripe_size;
	int i = 0;
	int ret = 0;

	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	item_size = btrfs_chunk_item_size(map->num_stripes);
	stripe_size = em->orig_block_len;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with the map's stripes, because the device object's id can change
	 * at any time during that final phase of the device replace operation
	 * (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);

	/* Pass 1: device items and device extents; stop at the first error. */
	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		ret = btrfs_update_device(trans, device);
		if (!ret)
			ret = btrfs_alloc_dev_extent(trans, device,
						     chunk_offset,
						     dev_offset, stripe_size);
		if (ret) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			goto out;
		}
	}

	/* Pass 2: fill in the stripe array of the chunk item. */
	for (i = 0; i < map->num_stripes; i++) {
		stripe = &chunk->stripe + i;
		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		/*
		 * TODO: Cleanup of inserted chunk root in case of
		 * failure.
		 */
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
	}

out:
	kfree(chunk);
	free_extent_map(em);
	return ret;
}
4951

Y
Yan Zheng 已提交
4952 4953 4954 4955 4956 4957 4958 4959
/*
 * Chunk allocation is split into two parts. The first part does the work
 * that makes the newly allocated chunk usable, without performing any
 * operation that modifies the chunk tree. The second part does the work
 * that requires modifying the chunk tree. This division is important for
 * the bootstrap process of adding storage to a seed btrfs.
 *
 * This is the entry point for the first part; the caller must hold
 * fs_info->chunk_mutex.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info, u64 type)
{
	ASSERT(mutex_is_locked(&fs_info->chunk_mutex));

	return __btrfs_alloc_chunk(trans, find_next_chunk(fs_info), type);
}

C
Chris Mason 已提交
4969
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4970
					 struct btrfs_fs_info *fs_info)
Y
Yan Zheng 已提交
4971 4972 4973 4974 4975 4976
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 alloc_profile;
	int ret;

4977
	chunk_offset = find_next_chunk(fs_info);
4978
	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
4979
	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
4980 4981
	if (ret)
		return ret;
Y
Yan Zheng 已提交
4982

4983
	sys_chunk_offset = find_next_chunk(fs_info);
4984
	alloc_profile = btrfs_system_alloc_profile(fs_info);
4985
	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
4986
	return ret;
Y
Yan Zheng 已提交
4987 4988
}

4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001
/*
 * Return the max_errors value for a chunk mapping: 1 for RAID1, RAID10,
 * RAID5 and DUP, 2 for RAID6, and 0 for anything else.
 */
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
			 BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID5 |
			 BTRFS_BLOCK_GROUP_DUP))
		return 1;

	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return 2;

	return 0;
}

5007
/*
 * Decide whether the chunk at @chunk_offset must be treated as read-only.
 *
 * Returns 1 when the chunk map cannot be looked up, when any non-missing
 * stripe device is not writeable, or when more devices are missing than
 * btrfs_chunk_max_errors() allows for the chunk's profile.  Returns 0
 * otherwise.
 */
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;
	int missing = 0;
	int ro = 0;
	int i;

	em = get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return 1;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = map->stripes[i].dev;

		if (dev->missing) {
			missing++;
		} else if (!dev->writeable) {
			ro = 1;
			break;
		}
	}

	/*
	 * If the number of missing devices is larger than max errors, we can
	 * not write the data into that chunk successfully, so set it
	 * readonly.
	 */
	if (!ro && missing > btrfs_chunk_max_errors(map))
		ro = 1;

	free_extent_map(em);
	return ro;
}

void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
5046
	extent_map_tree_init(&tree->map_tree);
5047 5048 5049 5050 5051 5052
}

void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
	struct extent_map *em;

C
Chris Mason 已提交
5053
	while (1) {
5054
		write_lock(&tree->map_tree.lock);
5055 5056 5057
		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(&tree->map_tree, em);
5058
		write_unlock(&tree->map_tree.lock);
5059 5060 5061 5062 5063 5064 5065 5066 5067
		if (!em)
			break;
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

5068
/*
 * Report how many copies (mirrors) can be tried for the given logical range.
 *
 * DUP and RAID1 report the chunk's num_stripes, RAID10 its sub_stripes,
 * RAID5 reports 2, RAID6 reports 3 and any other profile 1.  One extra copy
 * is added while a device replace is ongoing and has a target device.
 *
 * If the chunk map lookup fails, 1 is returned rather than an error.
 */
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct extent_map *em;
	struct map_lookup *map;
	int copies;

	em = get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em)) {
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not
		 * do anything else and exit, so return 1 so the callers don't
		 * try to use other copies.
		 */
		return 1;
	}

	map = em->map_lookup;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) {
		copies = map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		copies = map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID5) {
		copies = 2;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
		copies = 3;
	} else {
		copies = 1;
	}
	free_extent_map(em);

	/* the replace target counts as one more copy while it exists */
	btrfs_dev_replace_lock(dev_replace, 0);
	if (btrfs_dev_replace_is_ongoing(dev_replace) && dev_replace->tgtdev)
		copies++;
	btrfs_dev_replace_unlock(dev_replace, 0);

	return copies;
}

5106
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
D
David Woodhouse 已提交
5107 5108 5109 5110
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
5111
	unsigned long len = fs_info->sectorsize;
D
David Woodhouse 已提交
5112

5113
	em = get_chunk_map(fs_info, logical, len);
D
David Woodhouse 已提交
5114

5115 5116 5117 5118 5119 5120
	if (!WARN_ON(IS_ERR(em))) {
		map = em->map_lookup;
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = map->stripe_len * nr_data_stripes(map);
		free_extent_map(em);
	}
D
David Woodhouse 已提交
5121 5122 5123
	return len;
}

5124
/*
 * Return 1 if the chunk covering the given logical range uses a RAID5/6
 * profile, 0 otherwise.  A failed chunk lookup triggers a WARN and also
 * yields 0.
 */
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct extent_map *em;
	struct map_lookup *map;
	int is_parity = 0;

	em = get_chunk_map(fs_info, logical, len);
	if (WARN_ON(IS_ERR(em)))
		return is_parity;

	map = em->map_lookup;
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		is_parity = 1;
	free_extent_map(em);

	return is_parity;
}

5141 5142 5143
static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first, int num,
			    int optimal, int dev_replace_is_ongoing)
5144 5145
{
	int i;
5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169
	int tolerance;
	struct btrfs_device *srcdev;

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[optimal].dev->bdev &&
		    (tolerance || map->stripes[optimal].dev != srcdev))
			return optimal;
		for (i = first; i < first + num; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
5170
	}
5171

5172 5173 5174 5175 5176 5177
	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return optimal;
}

D
David Woodhouse 已提交
5178 5179 5180 5181 5182 5183
/* Ordering predicate for sort_parity_stripes(): non-zero when @a > @b. */
static inline int parity_smaller(u64 a, u64 b)
{
	return b < a;
}

/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5184
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
D
David Woodhouse 已提交
5185 5186 5187 5188 5189 5190 5191 5192
{
	struct btrfs_bio_stripe s;
	int i;
	u64 l;
	int again = 1;

	while (again) {
		again = 0;
5193
		for (i = 0; i < num_stripes - 1; i++) {
5194 5195
			if (parity_smaller(bbio->raid_map[i],
					   bbio->raid_map[i+1])) {
D
David Woodhouse 已提交
5196
				s = bbio->stripes[i];
5197
				l = bbio->raid_map[i];
D
David Woodhouse 已提交
5198
				bbio->stripes[i] = bbio->stripes[i+1];
5199
				bbio->raid_map[i] = bbio->raid_map[i+1];
D
David Woodhouse 已提交
5200
				bbio->stripes[i+1] = s;
5201
				bbio->raid_map[i+1] = l;
5202

D
David Woodhouse 已提交
5203 5204 5205 5206 5207 5208
				again = 1;
			}
		}
	}
}

5209 5210 5211
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
	struct btrfs_bio *bbio = kzalloc(
5212
		 /* the size of the btrfs_bio */
5213
		sizeof(struct btrfs_bio) +
5214
		/* plus the variable array for the stripes */
5215
		sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5216
		/* plus the variable array for the tgt dev */
5217
		sizeof(int) * (real_stripes) +
5218 5219 5220 5221 5222
		/*
		 * plus the raid_map, which includes both the tgt dev
		 * and the stripes
		 */
		sizeof(u64) * (total_stripes),
5223
		GFP_NOFS|__GFP_NOFAIL);
5224 5225

	atomic_set(&bbio->error, 0);
5226
	refcount_set(&bbio->refs, 1);
5227 5228 5229 5230 5231 5232

	return bbio;
}

/* Take an extra reference on @bbio; warns if the refcount reads as zero. */
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
	WARN_ON(!refcount_read(&bbio->refs));
	refcount_inc(&bbio->refs);
}

void btrfs_put_bbio(struct btrfs_bio *bbio)
{
	if (!bbio)
		return;
5241
	if (refcount_dec_and_test(&bbio->refs))
5242 5243 5244
		kfree(bbio);
}

5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
 * Map a logical range to the per-device physical ranges a discard has to
 * cover.
 *
 * On success 0 is returned and *bbio_ret points at a new btrfs_bio holding
 * one (device, physical, length) entry per stripe.  The range is clamped to
 * the end of the chunk map.  RAID5/6 chunks are rejected with -EOPNOTSUPP;
 * a failed chunk lookup returns the error from get_chunk_map().
 *
 * Please note that discard won't be sent to the target device of a device
 * replace.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 struct btrfs_bio **bbio_ret)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_bio *bbio;
	u64 offset;
	u64 stripe_nr;
	u64 stripe_nr_end;
	u64 stripe_end_offset;
	u64 stripe_cnt;
	u64 stripe_len;
	u64 stripe_offset;
	u64 num_stripes;
	u64 stripes_per_dev = 0;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	bool striped;
	int ret = 0;
	int i;

	/* discard always return a bbio */
	ASSERT(bbio_ret);

	em = get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	/* RAID0 and RAID10 spread the range over several stripes */
	striped = map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			       BTRFS_BLOCK_GROUP_RAID10);

	offset = logical - em->start;
	length = min_t(u64, em->len - offset, length);

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_nr * stripe_len;

	/* index of the first stripe past the end of the range */
	stripe_nr_end = round_up(offset + length, map->stripe_len);
	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
	stripe_cnt = stripe_nr_end - stripe_nr;
	/* bytes between the end of the range and the end of its last stripe */
	stripe_end_offset = stripe_nr_end * map->stripe_len -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	num_stripes = 1;
	stripe_index = 0;
	if (striped) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= sub_stripes;
		stripes_per_dev = div_u64_rem(stripe_cnt, factor,
					      &remaining_stripes);
		div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
		last_stripe *= sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = map->num_stripes;
	} else {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
					&stripe_index);
	}

	bbio = alloc_btrfs_bio(num_stripes, 0);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < num_stripes; i++) {
		struct btrfs_bio_stripe *stripe = &bbio->stripes[i];

		stripe->physical = map->stripes[stripe_index].physical +
				   stripe_offset + stripe_nr * map->stripe_len;
		stripe->dev = map->stripes[stripe_index].dev;

		if (striped) {
			stripe->length = stripes_per_dev * map->stripe_len;

			if (i / sub_stripes < remaining_stripes)
				stripe->length += map->stripe_len;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				stripe->length -= stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				stripe->length -= stripe_end_offset;

			/* only the first sub_stripes entries start mid-stripe */
			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			stripe->length = length;
		}

		/* advance to the next device, wrapping to the next row */
		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
out:
	free_extent_map(em);
	return ret;
}

5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473
/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.
 *
 * On success *mirror_num is rewritten to the 1-based index of the source
 * device's stripe and *physical to that stripe's physical address.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
					 u64 logical, u64 length,
					 u64 srcdev_devid, int *mirror_num,
					 u64 *physical)
{
	struct btrfs_bio *bbio = NULL;
	u64 lowest_physical = 0;
	int src_index = 0;
	int have_src = 0;
	int nr_stripes;
	int err;
	int i;

	err = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				logical, &length, &bbio, 0, 0);
	if (err) {
		ASSERT(bbio == NULL);
		return err;
	}

	nr_stripes = bbio->num_stripes;
	if (*mirror_num > nr_stripes) {
		/*
		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
		 * that means that the requested area is not left of the left
		 * cursor
		 */
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	/*
	 * process the rest of the function using the mirror_num of the source
	 * drive. Therefore look it up first.  At the end, patch the device
	 * pointer to the one of the target drive.
	 */
	for (i = 0; i < nr_stripes; i++) {
		struct btrfs_bio_stripe *stripe = &bbio->stripes[i];

		if (stripe->dev->devid != srcdev_devid)
			continue;

		/*
		 * In case of DUP, in order to keep it simple, only add the
		 * mirror with the lowest physical address
		 */
		if (have_src && lowest_physical <= stripe->physical)
			continue;

		src_index = i;
		have_src = 1;
		lowest_physical = stripe->physical;
	}

	btrfs_put_bbio(bbio);

	ASSERT(have_src);
	if (!have_src)
		return -EIO;

	*mirror_num = src_index + 1;
	*physical = lowest_physical;
	return err;
}

5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567
/*
 * Extend an already filled-in bbio with stripes for the dev-replace target.
 *
 * BTRFS_MAP_WRITE: every stripe on the source device gets a twin stripe on
 * the target device appended after the existing stripes, and max_errors is
 * raised by one per twin.
 * BTRFS_MAP_GET_READ_MIRRORS: at most one extra stripe is appended, mirroring
 * the source-device stripe with the lowest physical address.
 * Other ops leave the stripe array untouched.
 *
 * The updated stripe count and error tolerance are written back through
 * @num_stripes_ret / @max_errors_ret, and bbio->num_tgtdevs records how many
 * target stripes were added.
 */
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				      struct btrfs_bio **bbio_ret,
				      struct btrfs_dev_replace *dev_replace,
				      int *num_stripes_ret, int *max_errors_ret)
{
	struct btrfs_bio *bbio = *bbio_ret;
	u64 srcdev_devid = dev_replace->srcdev->devid;
	int num_stripes = *num_stripes_ret;
	int max_errors = *max_errors_ret;
	int added = 0;
	int i;

	if (op == BTRFS_MAP_WRITE) {
		int next_slot = num_stripes;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk to
		 * the new disk takes place at run time while the filesystem is
		 * mounted writable, the regular write operations to the old
		 * disk have to be duplicated to go to the new disk as well.
		 *
		 * Note that device->missing is handled by the caller, and that
		 * the write to the old disk is already set up in the stripes
		 * array.
		 */
		for (i = 0; i < num_stripes; i++) {
			struct btrfs_bio_stripe *src = bbio->stripes + i;
			struct btrfs_bio_stripe *twin;

			if (src->dev->devid != srcdev_devid)
				continue;

			/* write to new disk, too */
			twin = bbio->stripes + next_slot;
			twin->physical = src->physical;
			twin->length = src->length;
			twin->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[i] = next_slot;
			next_slot++;
			max_errors++;
			added++;
		}
		num_stripes = next_slot;
	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
		u64 lowest_physical = 0;
		int src_index = 0;
		int have_src = 0;

		/*
		 * During the dev-replace procedure, the target drive can also
		 * be used to read data in case it is needed to repair a corrupt
		 * block elsewhere. This is possible if the requested area is
		 * left of the left cursor. In this area, the target drive is a
		 * full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid != srcdev_devid)
				continue;

			/*
			 * In case of DUP, in order to keep it simple,
			 * only add the mirror with the lowest physical
			 * address
			 */
			if (have_src &&
			    lowest_physical <= bbio->stripes[i].physical)
				continue;

			src_index = i;
			have_src = 1;
			lowest_physical = bbio->stripes[i].physical;
		}

		if (have_src) {
			struct btrfs_bio_stripe *tgt =
				bbio->stripes + num_stripes;

			tgt->physical = lowest_physical;
			tgt->length = bbio->stripes[src_index].length;
			tgt->dev = dev_replace->tgtdev;
			bbio->tgtdev_map[src_index] = num_stripes;

			added++;
			num_stripes++;
		}
	}

	*num_stripes_ret = num_stripes;
	*max_errors_ret = max_errors;
	bbio->num_tgtdevs = added;
	*bbio_ret = bbio;
}

5568 5569 5570 5571 5572
static bool need_full_stripe(enum btrfs_map_op op)
{
	return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}

5573 5574
/*
 * Translate a logical address into the device stripes that back it.
 *
 * @op:            mapping operation; BTRFS_MAP_DISCARD is handed off to
 *                 __btrfs_map_block_for_discard()
 * @logical:       logical start address
 * @length:        in: requested length; out: length that can be handled in
 *                 one go (clamped to the stripe / full stripe / chunk)
 * @bbio_ret:      if non-NULL, receives the newly allocated btrfs_bio; if
 *                 NULL only *length is computed
 * @mirror_num:    0 to let this function choose, otherwise the 1-based mirror
 *                 to use
 * @need_raid_map: non-zero to build bbio->raid_map for RAID5/6 chunks
 *
 * Returns 0 on success, the get_chunk_map() error if the chunk cannot be
 * found, -EINVAL if the stripe arithmetic yields an impossible result,
 * -ENOMEM if the bbio allocation reports failure, or the error from
 * get_extra_mirror_from_replace().
 */
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_bio *bbio = NULL;
	struct extent_map *em;
	struct map_lookup *map;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u64 physical_to_patch_in_first_stripe = 0;
	u64 raid56_full_stripe_start = (u64)-1;
	u32 stripe_index;
	int i;
	int ret = 0;
	int num_stripes;
	int num_alloc_stripes;
	int max_errors = 0;
	int tgtdev_indexes = 0;
	int dev_replace_is_ongoing = 0;
	int patch_the_first_stripe_for_dev_replace = 0;

	if (op == BTRFS_MAP_DISCARD)
		return __btrfs_map_block_for_discard(fs_info, logical,
						     *length, bbio_ret);

	em = get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	offset = logical - em->start;

	stripe_len = map->stripe_len;
	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = div64_u64(offset, stripe_len);

	/* byte offset, within the chunk, of the stripe containing the block */
	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
			   "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
			   stripe_offset, offset, em->start, logical,
			   stripe_len);
		free_extent_map(em);
		return -EINVAL;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;

	/* if we're here for raid56, we need to know the stripe aligned start */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);

		/*
		 * allow a write of a full stripe, but make sure we don't
		 * allow straddling of stripes
		 */
		raid56_full_stripe_start = div64_u64(offset, full_stripe_len);
		raid56_full_stripe_start *= full_stripe_len;
	}

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len;

		/*
		 * For writes to RAID[56], allow a full stripeset across all
		 * disks.  For other RAID types and for RAID[56] reads, just
		 * allow a single stripe (on a single disk).
		 */
		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
		    (op == BTRFS_MAP_WRITE)) {
			max_len = stripe_len * nr_data_stripes(map) -
				(offset - raid56_full_stripe_start);
		} else {
			/* we limit the length of each bio to what fits in a stripe */
			max_len = stripe_len - stripe_offset;
		}
		*length = min_t(u64, em->len - offset, max_len);
	} else {
		*length = em->len - offset;
	}

	/*
	 * This is for when we're called from btrfs_merge_bio_hook() and all
	 * it cares about is the length
	 */
	if (!bbio_ret)
		goto out;

	/*
	 * Keep the dev-replace lock (switched to blocking) for the rest of
	 * the function only while a replace is running; it is released at
	 * the out label.
	 */
	btrfs_dev_replace_lock(dev_replace, 0);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (!dev_replace_is_ongoing)
		btrfs_dev_replace_unlock(dev_replace, 0);
	else
		btrfs_dev_replace_set_lock_blocking(dev_replace);

	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
		/* the extra mirror number addresses the replace target */
		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
						    dev_replace->srcdev->devid,
						    &mirror_num,
					    &physical_to_patch_in_first_stripe);
		if (ret)
			goto out;
		patch_the_first_stripe_for_dev_replace = 1;
	} else if (mirror_num > map->num_stripes) {
		mirror_num = 0;
	}

	num_stripes = 1;
	stripe_index = 0;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		if (!need_full_stripe(op))
			mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		if (need_full_stripe(op)) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			stripe_index = find_live_mirror(fs_info, map, 0,
					    map->num_stripes,
					    current->pid % map->num_stripes,
					    dev_replace_is_ongoing);
			mirror_num = stripe_index + 1;
		}
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		if (need_full_stripe(op)) {
			num_stripes = map->num_stripes;
		} else if (mirror_num) {
			stripe_index = mirror_num - 1;
		} else {
			mirror_num = 1;
		}
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
		stripe_index *= map->sub_stripes;

		if (need_full_stripe(op)) {
			num_stripes = map->sub_stripes;
		} else if (mirror_num) {
			stripe_index += mirror_num - 1;
		} else {
			int old_stripe_index = stripe_index;

			stripe_index = find_live_mirror(fs_info, map,
					      stripe_index,
					      map->sub_stripes, stripe_index +
					      current->pid % map->sub_stripes,
					      dev_replace_is_ongoing);
			mirror_num = stripe_index - old_stripe_index + 1;
		}
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
			/* push stripe_nr back to the start of the full stripe */
			stripe_nr = div64_u64(raid56_full_stripe_start,
					stripe_len * nr_data_stripes(map));

			/* RAID[56] write or recovery. Return all stripes */
			num_stripes = map->num_stripes;
			max_errors = nr_parity_stripes(map);

			*length = map->stripe_len;
			stripe_index = 0;
			stripe_offset = 0;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_nr = div_u64_rem(stripe_nr,
					nr_data_stripes(map), &stripe_index);
			if (mirror_num > 1)
				stripe_index = nr_data_stripes(map) +
						mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
					&stripe_index);
			if (!need_full_stripe(op) && mirror_num <= 1)
				mirror_num = 1;
		}
	} else {
		/*
		 * after this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				&stripe_index);
		mirror_num = stripe_index + 1;
	}

	if (stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Reserve room for the replace target: writes may need a twin for
	 * every stripe, read-mirror lookups at most one extra stripe.
	 */
	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
		if (op == BTRFS_MAP_WRITE)
			num_alloc_stripes <<= 1;
		if (op == BTRFS_MAP_GET_READ_MIRRORS)
			num_alloc_stripes++;
		tgtdev_indexes = num_stripes;
	}

	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}

	/* the tgtdev map sits right behind the stripe array */
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);

	/* build raid_map */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
		u64 tmp;
		unsigned rot;

		/* the raid_map sits behind the stripes and the tgtdev map */
		bbio->raid_map = (u64 *)((void *)bbio->stripes +
				 sizeof(struct btrfs_bio_stripe) *
				 num_alloc_stripes +
				 sizeof(int) * tgtdev_indexes);

		/* Work out the disk rotation on this stripe-set */
		div_u64_rem(stripe_nr, num_stripes, &rot);

		/* Fill in the logical address of each stripe */
		tmp = stripe_nr * nr_data_stripes(map);
		for (i = 0; i < nr_data_stripes(map); i++)
			bbio->raid_map[(i+rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bbio->raid_map[(i+rot+1) % num_stripes] =
				RAID6_Q_STRIPE;
	}

	/* fill in num_stripes consecutive entries starting at stripe_index */
	for (i = 0; i < num_stripes; i++) {
		bbio->stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset +
			stripe_nr * map->stripe_len;
		bbio->stripes[i].dev =
			map->stripes[stripe_index].dev;
		stripe_index++;
	}

	if (need_full_stripe(op))
		max_errors = btrfs_chunk_max_errors(map);

	if (bbio->raid_map)
		sort_parity_stripes(bbio, num_stripes);

	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    need_full_stripe(op)) {
		handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
					  &max_errors);
	}

	*bbio_ret = bbio;
	bbio->map_type = map->type;
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
out:
	if (dev_replace_is_ongoing) {
		btrfs_dev_replace_clear_lock_blocking(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 0);
	}
	free_extent_map(em);
	return ret;
}

5878
/*
 * Map a logical range to device stripes without requesting a raid_map
 * (need_raid_map is passed as 0).  See __btrfs_map_block() for the
 * parameters and return value.
 */
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		      u64 logical, u64 *length,
		      struct btrfs_bio **bbio_ret, int mirror_num)
{
	const int need_raid_map = 0;

	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				 mirror_num, need_raid_map);
}

5886
/* For Scrub/replace */
5887
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5888
		     u64 logical, u64 *length,
5889
		     struct btrfs_bio **bbio_ret)
5890
{
5891
	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
5892 5893
}

5894
/*
 * Reverse-map a physical address to the logical addresses that reference it.
 *
 * @chunk_start: logical start of the chunk to search
 * @physical:    physical byte address on the device
 * @devid:       device to match, or 0 to consider every stripe's device
 * @logical:     out: kcalloc()ed array of matching logical addresses
 * @naddrs:      out: number of valid entries in *logical
 * @stripe_len:  out: stripe length the addresses refer to (for RAID5/6 this
 *               is stripe_len multiplied by the number of data stripes)
 *
 * Returns 0 on success, -EIO if the chunk map cannot be found, or -ENOMEM if
 * the result array cannot be allocated.  The output parameters are written
 * only on success.
 */
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	u64 rmap_len;
	int i, j, nr = 0;
	int ret = 0;

	em = get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	length = em->len;
	rmap_len = map->stripe_len;

	/* reduce the chunk length to the span covered on a single device */
	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		length = div_u64(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		length = div_u64(length, map->num_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		length = div_u64(length, nr_data_stripes(map));
		rmap_len = map->stripe_len * nr_data_stripes(map);
	}

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	if (!buf) {
		/* report the failure instead of crashing the kernel */
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (devid && map->stripes[i].dev->devid != devid)
			continue;
		/* skip stripes whose device range does not hold @physical */
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		} /* else if RAID[56], multiply by nr_data_stripes().
		   * Alternatively, just use rmap_len below instead of
		   * map->stripe_len */

		bytenr = chunk_start + stripe_nr * rmap_len;
		WARN_ON(nr >= map->num_stripes);
		/* record each logical address only once */
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = rmap_len;
out:
	free_extent_map(em);
	return ret;
}

5966
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
5967
{
5968 5969
	bio->bi_private = bbio->private;
	bio->bi_end_io = bbio->end_io;
5970
	bio_endio(bio);
5971

5972
	btrfs_put_bbio(bbio);
5973 5974
}

5975
static void btrfs_end_bio(struct bio *bio)
5976
{
5977
	struct btrfs_bio *bbio = bio->bi_private;
5978
	int is_orig_bio = 0;
5979

5980
	if (bio->bi_status) {
5981
		atomic_inc(&bbio->error);
5982 5983
		if (bio->bi_status == BLK_STS_IOERR ||
		    bio->bi_status == BLK_STS_TARGET) {
5984
			unsigned int stripe_index =
5985
				btrfs_io_bio(bio)->stripe_index;
5986
			struct btrfs_device *dev;
5987 5988 5989

			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
5990
			if (dev->bdev) {
M
Mike Christie 已提交
5991
				if (bio_op(bio) == REQ_OP_WRITE)
5992
					btrfs_dev_stat_inc_and_print(dev,
5993 5994
						BTRFS_DEV_STAT_WRITE_ERRS);
				else
5995
					btrfs_dev_stat_inc_and_print(dev,
5996
						BTRFS_DEV_STAT_READ_ERRS);
5997
				if (bio->bi_opf & REQ_PREFLUSH)
5998
					btrfs_dev_stat_inc_and_print(dev,
5999 6000
						BTRFS_DEV_STAT_FLUSH_ERRS);
			}
6001 6002
		}
	}
6003

6004
	if (bio == bbio->orig_bio)
6005 6006
		is_orig_bio = 1;

6007 6008
	btrfs_bio_counter_dec(bbio->fs_info);

6009
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
6010 6011
		if (!is_orig_bio) {
			bio_put(bio);
6012
			bio = bbio->orig_bio;
6013
		}
6014

6015
		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6016
		/* only send an error to the higher layers if it is
D
David Woodhouse 已提交
6017
		 * beyond the tolerance of the btrfs bio
6018
		 */
6019
		if (atomic_read(&bbio->error) > bbio->max_errors) {
6020
			bio->bi_status = BLK_STS_IOERR;
6021
		} else {
6022 6023 6024 6025
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
6026
			bio->bi_status = BLK_STS_OK;
6027
		}
6028

6029
		btrfs_end_bbio(bbio, bio);
6030
	} else if (!is_orig_bio) {
6031 6032 6033 6034
		bio_put(bio);
	}
}

6035 6036 6037 6038 6039 6040 6041
/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
6042
static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6043
					struct bio *bio)
6044
{
6045
	struct btrfs_fs_info *fs_info = device->fs_info;
6046
	int should_queue = 1;
6047
	struct btrfs_pending_bios *pending_bios;
6048

D
David Woodhouse 已提交
6049
	if (device->missing || !device->bdev) {
6050
		bio_io_error(bio);
D
David Woodhouse 已提交
6051 6052 6053
		return;
	}

6054
	/* don't bother with additional async steps for reads, right now */
M
Mike Christie 已提交
6055
	if (bio_op(bio) == REQ_OP_READ) {
6056
		bio_get(bio);
6057
		btrfsic_submit_bio(bio);
6058
		bio_put(bio);
6059
		return;
6060 6061
	}

6062
	WARN_ON(bio->bi_next);
6063 6064 6065
	bio->bi_next = NULL;

	spin_lock(&device->io_lock);
6066
	if (op_is_sync(bio->bi_opf))
6067 6068 6069
		pending_bios = &device->pending_sync_bios;
	else
		pending_bios = &device->pending_bios;
6070

6071 6072
	if (pending_bios->tail)
		pending_bios->tail->bi_next = bio;
6073

6074 6075 6076
	pending_bios->tail = bio;
	if (!pending_bios->head)
		pending_bios->head = bio;
6077 6078 6079 6080 6081 6082
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
6083
		btrfs_queue_work(fs_info->submit_workers, &device->work);
6084 6085
}

6086 6087
/*
 * Aim @bio at stripe @dev_nr of @bbio and submit it.
 *
 * The bio gets btrfs_end_bio() as completion handler with @bbio as private
 * data, its start sector is derived from @physical, and its target is the
 * stripe's block device.  With @async set the bio goes through
 * btrfs_schedule_bio(), otherwise it is submitted directly.
 */
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
			      u64 physical, int dev_nr, int async)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;

	bio->bi_end_io = btrfs_end_bio;
	bio->bi_private = bbio;
	btrfs_io_bio(bio)->stripe_index = dev_nr;
	/* byte offset -> sector number (>> 9) */
	bio->bi_iter.bi_sector = physical >> 9;
#ifdef DEBUG
	{
		struct rcu_string *name;

		rcu_read_lock();
		name = rcu_dereference(dev->name);
		btrfs_debug(fs_info,
			"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
			bio_op(bio), bio->bi_opf,
			(u64)bio->bi_iter.bi_sector,
			(u_long)dev->bdev->bd_dev, name->str, dev->devid,
			bio->bi_iter.bi_size);
		rcu_read_unlock();
	}
#endif
	bio_set_dev(bio, dev->bdev);

	btrfs_bio_counter_inc_noblocked(fs_info);

	if (!async) {
		btrfsic_submit_bio(bio);
		return;
	}
	btrfs_schedule_bio(dev, bio);
}

/*
 * Record a failed stripe of @bbio without any bio having been submitted
 * for it.  If this was the last pending stripe, complete @bio here: its
 * sector is set back to @logical and its status reflects whether the
 * error count exceeds bbio->max_errors.
 */
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (!atomic_dec_and_test(&bbio->stripes_pending))
		return;

	/* Should be the original bio. */
	WARN_ON(bio != bbio->orig_bio);

	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
	bio->bi_iter.bi_sector = logical >> 9;

	if (atomic_read(&bbio->error) <= bbio->max_errors)
		bio->bi_status = BLK_STS_OK;
	else
		bio->bi_status = BLK_STS_IOERR;

	btrfs_end_bbio(bbio, bio);
}

6138 6139
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
			   int mirror_num, int async_submit)
6140 6141
{
	struct btrfs_device *dev;
6142
	struct bio *first_bio = bio;
6143
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6144 6145 6146
	u64 length = 0;
	u64 map_length;
	int ret;
6147 6148
	int dev_nr;
	int total_devs;
6149
	struct btrfs_bio *bbio = NULL;
6150

6151
	length = bio->bi_iter.bi_size;
6152
	map_length = length;
6153

6154
	btrfs_bio_counter_inc_blocked(fs_info);
6155
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
M
Mike Christie 已提交
6156
				&map_length, &bbio, mirror_num, 1);
6157
	if (ret) {
6158
		btrfs_bio_counter_dec(fs_info);
6159
		return errno_to_blk_status(ret);
6160
	}
6161

6162
	total_devs = bbio->num_stripes;
D
David Woodhouse 已提交
6163 6164 6165
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
6166
	bbio->fs_info = fs_info;
D
David Woodhouse 已提交
6167 6168
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

6169
	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
M
Mike Christie 已提交
6170
	    ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
D
David Woodhouse 已提交
6171 6172
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
M
Mike Christie 已提交
6173
		if (bio_op(bio) == REQ_OP_WRITE) {
6174 6175
			ret = raid56_parity_write(fs_info, bio, bbio,
						  map_length);
D
David Woodhouse 已提交
6176
		} else {
6177 6178
			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
D
David Woodhouse 已提交
6179
		}
6180

6181
		btrfs_bio_counter_dec(fs_info);
6182
		return errno_to_blk_status(ret);
D
David Woodhouse 已提交
6183 6184
	}

6185
	if (map_length < length) {
6186
		btrfs_crit(fs_info,
J
Jeff Mahoney 已提交
6187 6188
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
6189 6190
		BUG();
	}
6191

6192
	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6193
		dev = bbio->stripes[dev_nr].dev;
M
Mike Christie 已提交
6194
		if (!dev || !dev->bdev ||
6195
		    (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
6196 6197 6198 6199
			bbio_error(bbio, first_bio, logical);
			continue;
		}

6200
		if (dev_nr < total_devs - 1)
6201
			bio = btrfs_bio_clone(first_bio);
6202
		else
6203
			bio = first_bio;
6204

6205 6206
		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				  dev_nr, async_submit);
6207
	}
6208
	btrfs_bio_counter_dec(fs_info);
6209
	return BLK_STS_OK;
6210 6211
}

6212
/*
 * Find a device by @devid / @uuid, starting at the filesystem's own device
 * set and continuing along the ->seed chain.  When @fsid is non-NULL only
 * device sets whose fsid matches are searched.
 *
 * Return: the device reported by find_device(), or NULL if no set has it.
 */
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_fs_devices *cur;

	for (cur = fs_info->fs_devices; cur; cur = cur->seed) {
		struct btrfs_device *found;

		if (fsid && memcmp(cur->fsid, fsid, BTRFS_FSID_SIZE))
			continue;

		found = find_device(cur, devid, uuid);
		if (found)
			return found;
	}

	return NULL;
}

6231
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6232 6233 6234 6235
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;

6236 6237
	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
	if (IS_ERR(device))
6238
		return device;
6239 6240

	list_add(&device->dev_list, &fs_devices->devices);
Y
Yan Zheng 已提交
6241
	device->fs_devices = fs_devices;
6242
	fs_devices->num_devices++;
6243 6244

	device->missing = 1;
6245
	fs_devices->missing_devices++;
6246

6247 6248 6249
	return device;
}

6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269
/**
 * btrfs_alloc_device - allocate struct btrfs_device
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and can be
 * destroyed with kfree() right away.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid,
					const u8 *uuid)
{
	struct btrfs_device *dev;
	u64 tmp;

6270
	if (WARN_ON(!devid && !fs_info))
6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283
		return ERR_PTR(-EINVAL);

	dev = __alloc_device();
	if (IS_ERR(dev))
		return dev;

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
6284
			bio_put(dev->flush_bio);
6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295
			kfree(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

6296 6297
	btrfs_init_work(&dev->work, btrfs_submit_helper,
			pending_bios_fn, NULL, NULL);
6298 6299 6300 6301

	return dev;
}

6302
/* Return -EIO if any error, otherwise return 0. */
6303
static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
6304 6305
				   struct extent_buffer *leaf,
				   struct btrfs_chunk *chunk, u64 logical)
6306 6307
{
	u64 length;
6308
	u64 stripe_len;
6309 6310 6311
	u16 num_stripes;
	u16 sub_stripes;
	u64 type;
6312

6313
	length = btrfs_chunk_length(leaf, chunk);
6314 6315
	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6316 6317 6318
	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);

6319
	if (!num_stripes) {
6320
		btrfs_err(fs_info, "invalid chunk num_stripes: %u",
6321 6322 6323
			  num_stripes);
		return -EIO;
	}
6324 6325
	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk logical %llu", logical);
6326 6327
		return -EIO;
	}
6328 6329
	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
		btrfs_err(fs_info, "invalid chunk sectorsize %u",
6330 6331 6332
			  btrfs_chunk_sector_size(leaf, chunk));
		return -EIO;
	}
6333 6334
	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
		btrfs_err(fs_info, "invalid chunk length %llu", length);
6335 6336
		return -EIO;
	}
6337
	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
6338
		btrfs_err(fs_info, "invalid chunk stripe length: %llu",
6339 6340 6341 6342
			  stripe_len);
		return -EIO;
	}
	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
6343
	    type) {
6344
		btrfs_err(fs_info, "unrecognized chunk type: %llu",
6345 6346 6347 6348 6349
			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
			  btrfs_chunk_type(leaf, chunk));
		return -EIO;
	}
6350 6351 6352 6353 6354 6355 6356
	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
	     num_stripes != 1)) {
6357
		btrfs_err(fs_info,
6358 6359 6360 6361 6362 6363 6364 6365 6366
			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
			num_stripes, sub_stripes,
			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
		return -EIO;
	}

	return 0;
}

6367
static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6368
					u64 devid, u8 *uuid, bool error)
6369
{
6370 6371 6372 6373 6374 6375
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
6376 6377
}

6378
static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6379 6380 6381
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
6382
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

6397
	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
6398 6399
	if (ret)
		return ret;
6400

6401
	read_lock(&map_tree->map_tree.lock);
6402
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6403
	read_unlock(&map_tree->map_tree.lock);
6404 6405 6406 6407 6408 6409 6410 6411 6412

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

6413
	em = alloc_extent_map();
6414 6415
	if (!em)
		return -ENOMEM;
6416
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6417 6418 6419 6420 6421
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

6422
	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6423
	em->map_lookup = map;
6424 6425
	em->start = logical;
	em->len = length;
6426
	em->orig_start = 0;
6427
	em->block_start = 0;
C
Chris Mason 已提交
6428
	em->block_len = em->len;
6429

6430 6431 6432 6433 6434
	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
C
Chris Mason 已提交
6435
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6436 6437 6438 6439
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6440 6441 6442
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
6443
		map->stripes[i].dev = btrfs_find_device(fs_info, devid,
6444
							uuid, NULL);
6445
		if (!map->stripes[i].dev &&
6446
		    !btrfs_test_opt(fs_info, DEGRADED)) {
6447
			free_extent_map(em);
6448
			btrfs_report_missing_device(fs_info, devid, uuid, true);
6449
			return -ENOENT;
6450
		}
6451 6452
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
6453 6454
				add_missing_dev(fs_info->fs_devices, devid,
						uuid);
6455
			if (IS_ERR(map->stripes[i].dev)) {
6456
				free_extent_map(em);
6457 6458 6459 6460
				btrfs_err(fs_info,
					"failed to init missing dev %llu: %ld",
					devid, PTR_ERR(map->stripes[i].dev));
				return PTR_ERR(map->stripes[i].dev);
6461
			}
6462
			btrfs_report_missing_device(fs_info, devid, uuid, false);
6463 6464
		}
		map->stripes[i].dev->in_fs_metadata = 1;
6465 6466
	}

6467
	write_lock(&map_tree->map_tree.lock);
J
Josef Bacik 已提交
6468
	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6469
	write_unlock(&map_tree->map_tree.lock);
6470
	BUG_ON(ret); /* Tree corruption */
6471 6472 6473 6474 6475
	free_extent_map(em);

	return 0;
}

6476
static void fill_device_from_item(struct extent_buffer *leaf,
6477 6478 6479 6480 6481 6482
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
6483 6484
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
6485
	device->commit_total_bytes = device->disk_total_bytes;
6486
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6487
	device->commit_bytes_used = device->bytes_used;
6488 6489 6490 6491
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6492
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6493
	device->is_tgtdev_for_dev_replace = 0;
6494

6495
	ptr = btrfs_device_uuid(dev_item);
6496
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6497 6498
}

6499
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6500
						  u8 *fsid)
Y
Yan Zheng 已提交
6501 6502 6503 6504
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

6505
	BUG_ON(!mutex_is_locked(&uuid_mutex));
D
David Sterba 已提交
6506
	ASSERT(fsid);
Y
Yan Zheng 已提交
6507

6508
	fs_devices = fs_info->fs_devices->seed;
Y
Yan Zheng 已提交
6509
	while (fs_devices) {
6510
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6511 6512
			return fs_devices;

Y
Yan Zheng 已提交
6513 6514 6515 6516 6517
		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
6518
		if (!btrfs_test_opt(fs_info, DEGRADED))
6519 6520 6521 6522 6523 6524 6525 6526 6527
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = 1;
		fs_devices->opened = 1;
		return fs_devices;
Y
Yan Zheng 已提交
6528
	}
Y
Yan Zheng 已提交
6529 6530

	fs_devices = clone_fs_devices(fs_devices);
6531 6532
	if (IS_ERR(fs_devices))
		return fs_devices;
Y
Yan Zheng 已提交
6533

6534
	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6535
				   fs_info->bdev_holder);
6536 6537
	if (ret) {
		free_fs_devices(fs_devices);
6538
		fs_devices = ERR_PTR(ret);
Y
Yan Zheng 已提交
6539
		goto out;
6540
	}
Y
Yan Zheng 已提交
6541 6542 6543

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
Y
Yan Zheng 已提交
6544
		free_fs_devices(fs_devices);
6545
		fs_devices = ERR_PTR(-EINVAL);
Y
Yan Zheng 已提交
6546 6547 6548
		goto out;
	}

6549 6550
	fs_devices->seed = fs_info->fs_devices->seed;
	fs_info->fs_devices->seed = fs_devices;
Y
Yan Zheng 已提交
6551
out:
6552
	return fs_devices;
Y
Yan Zheng 已提交
6553 6554
}

6555
static int read_one_dev(struct btrfs_fs_info *fs_info,
6556 6557 6558
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
6559
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6560 6561 6562
	struct btrfs_device *device;
	u64 devid;
	int ret;
6563
	u8 fs_uuid[BTRFS_FSID_SIZE];
6564 6565
	u8 dev_uuid[BTRFS_UUID_SIZE];

6566
	devid = btrfs_device_id(leaf, dev_item);
6567
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6568
			   BTRFS_UUID_SIZE);
6569
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6570
			   BTRFS_FSID_SIZE);
Y
Yan Zheng 已提交
6571

6572
	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
6573
		fs_devices = open_seed_devices(fs_info, fs_uuid);
6574 6575
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
Y
Yan Zheng 已提交
6576 6577
	}

6578
	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
6579
	if (!device) {
6580
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
6581 6582
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
6583
			return -ENOENT;
6584
		}
Y
Yan Zheng 已提交
6585

6586
		device = add_missing_dev(fs_devices, devid, dev_uuid);
6587 6588 6589 6590 6591 6592
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
6593
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6594
	} else {
6595
		if (!device->bdev) {
6596 6597 6598
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
6599
				return -ENOENT;
6600 6601 6602
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
6603
		}
6604 6605

		if(!device->bdev && !device->missing) {
6606 6607 6608 6609 6610 6611
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
6612
			device->fs_devices->missing_devices++;
6613
			device->missing = 1;
Y
Yan Zheng 已提交
6614
		}
6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(device->missing);

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
Y
Yan Zheng 已提交
6629 6630
	}

6631
	if (device->fs_devices != fs_info->fs_devices) {
Y
Yan Zheng 已提交
6632 6633 6634 6635
		BUG_ON(device->writeable);
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
6636
	}
6637 6638

	fill_device_from_item(leaf, dev_item, device);
6639
	device->in_fs_metadata = 1;
6640
	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
Y
Yan Zheng 已提交
6641
		device->fs_devices->total_rw_bytes += device->total_bytes;
6642 6643
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
6644
	}
6645 6646 6647 6648
	ret = 0;
	return ret;
}

6649
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6650
{
6651
	struct btrfs_root *root = fs_info->tree_root;
6652
	struct btrfs_super_block *super_copy = fs_info->super_copy;
6653
	struct extent_buffer *sb;
6654 6655
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
6656 6657
	u8 *array_ptr;
	unsigned long sb_array_offset;
6658
	int ret = 0;
6659 6660 6661
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
6662
	u32 cur_offset;
6663
	u64 type;
6664
	struct btrfs_key key;
6665

6666
	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6667 6668 6669 6670 6671
	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
6672
	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6673 6674
	if (IS_ERR(sb))
		return PTR_ERR(sb);
6675
	set_extent_buffer_uptodate(sb);
6676
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6677
	/*
6678
	 * The sb extent buffer is artificial and just used to read the system array.
6679
	 * set_extent_buffer_uptodate() call does not properly mark all it's
6680 6681 6682 6683 6684 6685 6686 6687 6688
	 * pages up-to-date when the page is larger: extent does not cover the
	 * whole page and consequently check_page_uptodate does not find all
	 * the page's extents up-to-date (the hole beyond sb),
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
	 * but sb spans only this function. Add an explicit SetPageUptodate call
	 * to silence the warning eg. on PowerPC 64.
	 */
6689
	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6690
		SetPageUptodate(sb->pages[0]);
6691

6692
	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6693 6694
	array_size = btrfs_super_sys_array_size(super_copy);

6695 6696 6697
	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;
6698

6699 6700
	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
6701 6702 6703 6704
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

6705 6706
		btrfs_disk_key_to_cpu(&key, disk_key);

6707 6708 6709
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
6710

6711
		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6712
			chunk = (struct btrfs_chunk *)sb_array_offset;
6713 6714 6715 6716 6717 6718 6719 6720 6721
			/*
			 * At least one btrfs_chunk with one stripe must be
			 * present, exact stripe count check comes afterwards
			 */
			len = btrfs_chunk_item_size(1);
			if (cur_offset + len > array_size)
				goto out_short_read;

			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6722
			if (!num_stripes) {
6723 6724
				btrfs_err(fs_info,
					"invalid number of stripes %u in sys_array at offset %u",
6725 6726 6727 6728 6729
					num_stripes, cur_offset);
				ret = -EIO;
				break;
			}

6730 6731
			type = btrfs_chunk_type(sb, chunk);
			if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6732
				btrfs_err(fs_info,
6733 6734 6735 6736 6737 6738
			    "invalid chunk type %llu in sys_array at offset %u",
					type, cur_offset);
				ret = -EIO;
				break;
			}

6739 6740 6741 6742
			len = btrfs_chunk_item_size(num_stripes);
			if (cur_offset + len > array_size)
				goto out_short_read;

6743
			ret = read_one_chunk(fs_info, &key, sb, chunk);
6744 6745
			if (ret)
				break;
6746
		} else {
6747 6748 6749
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
6750 6751
			ret = -EIO;
			break;
6752
		}
6753 6754 6755
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
6756
	}
6757
	clear_extent_buffer_uptodate(sb);
6758
	free_extent_buffer_stale(sb);
6759
	return ret;
6760 6761

out_short_read:
6762
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
6763
			len, cur_offset);
6764
	clear_extent_buffer_uptodate(sb);
6765
	free_extent_buffer_stale(sb);
6766
	return -EIO;
6767 6768
}

6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826
/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * Walks every chunk mapping and counts the stripes whose device is absent,
 * has no bdev, is flagged missing or has a last_flush_error, comparing the
 * count with the tolerance of the chunk's profile.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements,
 * or if there is no chunk at all.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	u64 next_start;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
	read_unlock(&map_tree->map_tree.lock);
	/* No chunk at all? Return false anyway */
	if (!em)
		return false;

	do {
		struct map_lookup *map = em->map_lookup;
		int max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		int missing = 0;
		int i;

		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

			if (!dev || !dev->bdev || dev->missing ||
			    dev->last_flush_error)
				missing++;
		}
		if (missing > max_tolerated) {
			btrfs_warn(fs_info,
	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				   em->start, missing, max_tolerated);
			free_extent_map(em);
			return false;
		}

		/* Continue with the first mapping past this one. */
		next_start = extent_map_end(em);
		free_extent_map(em);

		read_lock(&map_tree->map_tree.lock);
		em = lookup_extent_mapping(&map_tree->map_tree, next_start,
					   (u64)(-1) - next_start);
		read_unlock(&map_tree->map_tree.lock);
	} while (em);

	return true;
}

6827
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
6828
{
6829
	struct btrfs_root *root = fs_info->chunk_root;
6830 6831 6832 6833 6834 6835
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
6836
	u64 total_dev = 0;
6837 6838 6839 6840 6841

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

6842
	mutex_lock(&uuid_mutex);
6843
	mutex_lock(&fs_info->chunk_mutex);
6844

6845 6846 6847 6848 6849
	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6850 6851 6852 6853 6854
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6855 6856
	if (ret < 0)
		goto error;
C
Chris Mason 已提交
6857
	while (1) {
6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6869 6870 6871
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
6872
						  struct btrfs_dev_item);
6873
			ret = read_one_dev(fs_info, leaf, dev_item);
6874 6875
			if (ret)
				goto error;
6876
			total_dev++;
6877 6878 6879
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6880
			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
Y
Yan Zheng 已提交
6881 6882
			if (ret)
				goto error;
6883 6884 6885
		}
		path->slots[0]++;
	}
6886 6887 6888 6889 6890

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
6891 6892
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
6893
	   "super_num_devices %llu mismatch with num_devices %llu found here",
6894
			  btrfs_super_num_devices(fs_info->super_copy),
6895 6896 6897 6898
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
6899 6900 6901
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
6902
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
6903 6904
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
6905 6906 6907
		ret = -EINVAL;
		goto error;
	}
6908 6909
	ret = 0;
error:
6910
	mutex_unlock(&fs_info->chunk_mutex);
6911 6912
	mutex_unlock(&uuid_mutex);

Y
Yan Zheng 已提交
6913
	btrfs_free_path(path);
6914 6915
	return ret;
}
6916

6917 6918 6919 6920 6921
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

6922 6923 6924
	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
6925
			device->fs_info = fs_info;
6926 6927 6928 6929
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
6930 6931
}

6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963
/* Reset each of the BTRFS_DEV_STAT_VALUES_MAX statistics of @dev. */
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int index = 0;

	while (index < BTRFS_DEV_STAT_VALUES_MAX) {
		btrfs_dev_stat_reset(dev, index);
		index++;
	}
}

/*
 * Load the stored statistics of every device on fs_info->fs_devices from
 * the device tree into the in-memory counters.
 *
 * For each device the item keyed (BTRFS_DEV_STATS_OBJECTID,
 * BTRFS_PERSISTENT_ITEM_KEY, devid) is looked up.  When the lookup does not
 * return 0 (item absent or lookup error) all counters of that device are
 * reset.  Otherwise each counter that fits inside the item is loaded and the
 * remaining ones are reset.  In both cases the device is marked
 * dev_stats_valid and the walk continues with the next device.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the first
 * negative error that btrfs_search_slot() reported for any device.
 */
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	struct btrfs_device *device;
	struct btrfs_path *path;
	int slot;
	int ret;
	int err = 0;
	int i;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = BTRFS_DEV_STATS_OBJECTID;
		key.type = BTRFS_PERSISTENT_ITEM_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			/*
			 * Keep the first hard error so that a failure on any
			 * device is reported, not only one on the last device
			 * of the list.
			 */
			if (ret < 0 && !err)
				err = ret;
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		/* Items may hold fewer values than we know about. */
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return err;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7002
				struct btrfs_fs_info *fs_info,
7003 7004
				struct btrfs_device *device)
{
7005
	struct btrfs_root *dev_root = fs_info->dev_root;
7006 7007 7008 7009 7010 7011 7012
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

7013 7014
	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
7015 7016 7017
	key.offset = device->devid;

	path = btrfs_alloc_path();
7018 7019
	if (!path)
		return -ENOMEM;
7020 7021
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
7022
		btrfs_warn_in_rcu(fs_info,
7023
			"error %d while searching for dev_stats item for device %s",
7024
			      ret, rcu_str_deref(device->name));
7025 7026 7027 7028 7029 7030 7031 7032
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
7033
			btrfs_warn_in_rcu(fs_info,
7034
				"delete too small dev_stats item for device %s failed %d",
7035
				      rcu_str_deref(device->name), ret);
7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
7047
			btrfs_warn_in_rcu(fs_info,
7048 7049
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction.  Writes the statistics of every device
 * whose change counter is non-zero to disk.
 *
 * Devices whose stats are not yet valid are skipped.  Returns the status of
 * the last update_dev_stat_item() call that was made, or 0 if none was made.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *dev;
	int pending;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
		pending = atomic_read(&dev->dev_stats_ccnt);
		if (!dev->dev_stats_valid)
			continue;
		if (pending == 0)
			continue;

		/*
		 * The value of dev_stats_ccnt decides whether the in-memory
		 * counters get read for the on-disk update: a LOAD-LOAD
		 * control dependency, which needs an explicit read memory
		 * barrier.
		 *
		 * This barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, fs_info, dev);
		/* Only forget the changes that were observed before writing. */
		if (ret == 0)
			atomic_sub(pending, &dev->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

7106 7107 7108 7109 7110 7111
/*
 * Increment the statistics counter @index of @dev via btrfs_dev_stat_inc()
 * and then call btrfs_dev_stat_print_on_error() to log the device's
 * counters.
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

7112
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7113
{
7114 7115
	if (!dev->dev_stats_valid)
		return;
7116
	btrfs_err_rl_in_rcu(dev->fs_info,
7117
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7118
			   rcu_str_deref(dev->name),
7119 7120 7121
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7122 7123
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7124
}
7125

7126 7127
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
7128 7129 7130 7131 7132 7133 7134 7135
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

7136
	btrfs_info_in_rcu(dev->fs_info,
7137
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7138
	       rcu_str_deref(dev->name),
7139 7140 7141 7142 7143 7144 7145
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

7146
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7147
			struct btrfs_ioctl_get_dev_stats *stats)
7148 7149
{
	struct btrfs_device *dev;
7150
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7151 7152 7153
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
7154
	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
7155 7156 7157
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
7158
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
7159
		return -ENODEV;
7160
	} else if (!dev->dev_stats_valid) {
7161
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7162
		return -ENODEV;
7163
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
7180

7181
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7182 7183 7184
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
7185
	int copy_num;
7186

7187 7188
	if (!bdev)
		return;
7189

7190 7191
	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
		copy_num++) {
7192

7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208
		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
		brelse(bh);
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
7209
}
7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223

/*
 * Update the size of all devices, which is used for writing out the
 * super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
7224
	mutex_lock(&fs_info->chunk_mutex);
7225 7226 7227 7228 7229
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
7230
	mutex_unlock(&fs_info->chunk_mutex);
7231 7232
	mutex_unlock(&fs_devices->device_list_mutex);
}
7233 7234

/* Must be invoked during the transaction commit */
7235
void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246
					struct btrfs_transaction *transaction)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

	if (list_empty(&transaction->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
7247
	mutex_lock(&fs_info->chunk_mutex);
7248
	list_for_each_entry(em, &transaction->pending_chunks, list) {
7249
		map = em->map_lookup;
7250 7251 7252 7253 7254 7255

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
7256
	mutex_unlock(&fs_info->chunk_mutex);
7257
}
7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275

/*
 * Set the fs_info pointer of fs_info->fs_devices and of every fs_devices
 * reachable from it through the ->seed chain to @fs_info.
 */
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *cur;

	for (cur = fs_info->fs_devices; cur; cur = cur->seed)
		cur->fs_info = fs_info;
}

/*
 * Clear the fs_info pointer of fs_info->fs_devices and of every fs_devices
 * reachable from it through the ->seed chain.
 */
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *cur;

	for (cur = fs_info->fs_devices; cur; cur = cur->seed)
		cur->fs_info = NULL;
}