dev-replace.c 31.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2 3 4
/*
 * Copyright (C) STRATO AG 2012.  All rights reserved.
 */
5

6 7 8 9 10 11
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
12
#include "misc.h"
13 14 15 16 17 18 19 20 21 22
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
23
#include "sysfs.h"
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59

static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
				       int scrub_ret);
static void btrfs_dev_replace_update_device_in_mapping_tree(
						struct btrfs_fs_info *fs_info,
						struct btrfs_device *srcdev,
						struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);

int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_path *path = NULL;
	int item_size;
	struct btrfs_dev_replace_item *ptr;
	u64 src_devid;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = 0;
	key.type = BTRFS_DEV_REPLACE_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
	if (ret) {
no_valid_dev_replace_entry_found:
		ret = 0;
		dev_replace->replace_state =
60
			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
		dev_replace->cont_reading_from_srcdev_mode =
		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
		dev_replace->time_started = 0;
		dev_replace->time_stopped = 0;
		atomic64_set(&dev_replace->num_write_errors, 0);
		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
		dev_replace->cursor_left = 0;
		dev_replace->committed_cursor_left = 0;
		dev_replace->cursor_left_last_write_of_item = 0;
		dev_replace->cursor_right = 0;
		dev_replace->srcdev = NULL;
		dev_replace->tgtdev = NULL;
		dev_replace->is_valid = 0;
		dev_replace->item_needs_writeback = 0;
		goto out;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);

	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
83 84
		btrfs_warn(fs_info,
			"dev_replace entry found has unexpected size, ignore entry");
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
		goto no_valid_dev_replace_entry_found;
	}

	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
	dev_replace->cont_reading_from_srcdev_mode =
		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
	dev_replace->time_stopped =
		btrfs_dev_replace_time_stopped(eb, ptr);
	atomic64_set(&dev_replace->num_write_errors,
		     btrfs_dev_replace_num_write_errors(eb, ptr));
	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
	dev_replace->committed_cursor_left = dev_replace->cursor_left;
	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
	dev_replace->is_valid = 1;

	dev_replace->item_needs_writeback = 0;
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		dev_replace->srcdev = NULL;
		dev_replace->tgtdev = NULL;
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
115
		dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
116
						src_devid, NULL, NULL, true);
117
		dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
118
							BTRFS_DEV_REPLACE_DEVID,
119
							NULL, NULL, true);
120 121 122 123 124
		/*
		 * allow 'btrfs dev replace_cancel' if src/tgt device is
		 * missing
		 */
		if (!dev_replace->srcdev &&
125
		    !btrfs_test_opt(fs_info, DEGRADED)) {
126
			ret = -EIO;
127 128 129 130 131
			btrfs_warn(fs_info,
			   "cannot mount because device replace operation is ongoing and");
			btrfs_warn(fs_info,
			   "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
			   src_devid);
132 133
		}
		if (!dev_replace->tgtdev &&
134
		    !btrfs_test_opt(fs_info, DEGRADED)) {
135
			ret = -EIO;
136 137 138 139
			btrfs_warn(fs_info,
			   "cannot mount because device replace operation is ongoing and");
			btrfs_warn(fs_info,
			   "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
140
				BTRFS_DEV_REPLACE_DEVID);
141 142 143 144 145 146 147
		}
		if (dev_replace->tgtdev) {
			if (dev_replace->srcdev) {
				dev_replace->tgtdev->total_bytes =
					dev_replace->srcdev->total_bytes;
				dev_replace->tgtdev->disk_total_bytes =
					dev_replace->srcdev->disk_total_bytes;
148 149
				dev_replace->tgtdev->commit_total_bytes =
					dev_replace->srcdev->commit_total_bytes;
150 151
				dev_replace->tgtdev->bytes_used =
					dev_replace->srcdev->bytes_used;
152 153
				dev_replace->tgtdev->commit_bytes_used =
					dev_replace->srcdev->commit_bytes_used;
154
			}
155 156
			set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				&dev_replace->tgtdev->dev_state);
157 158 159 160 161 162 163 164

			WARN_ON(fs_info->fs_devices->rw_devices == 0);
			dev_replace->tgtdev->io_width = fs_info->sectorsize;
			dev_replace->tgtdev->io_align = fs_info->sectorsize;
			dev_replace->tgtdev->sector_size = fs_info->sectorsize;
			dev_replace->tgtdev->fs_info = fs_info;
			set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&dev_replace->tgtdev->dev_state);
165 166 167 168 169
		}
		break;
	}

out:
170
	btrfs_free_path(path);
171 172 173
	return ret;
}

174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
/*
 * Initialize a new device for device replace target from a given source dev
 * and path.
 *
 * Return 0 and new device in @device_out, otherwise return < 0
 */
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				  const char *device_path,
				  struct btrfs_device *srcdev,
				  struct btrfs_device **device_out)
{
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct rcu_string *name;
	u64 devid = BTRFS_DEV_REPLACE_DEVID;
	int ret = 0;

	*device_out = NULL;
	if (fs_info->fs_devices->seeding) {
		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
		return -EINVAL;
	}

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev)) {
		btrfs_err(fs_info, "target device %s is invalid!", device_path);
		return PTR_ERR(bdev);
	}

205
	sync_blockdev(bdev);
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258

	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			btrfs_err(fs_info,
				  "target device is in the filesystem!");
			ret = -EEXIST;
			goto error;
		}
	}


	if (i_size_read(bdev->bd_inode) <
	    btrfs_device_get_total_bytes(srcdev)) {
		btrfs_err(fs_info,
			  "target device is smaller than source device!");
		ret = -EINVAL;
		goto error;
	}


	device = btrfs_alloc_device(NULL, &devid, NULL);
	if (IS_ERR(device)) {
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		btrfs_free_device(device);
		ret = -ENOMEM;
		goto error;
	}
	rcu_assign_pointer(device->name, name);

	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = 0;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
	device->commit_total_bytes = srcdev->commit_total_bytes;
	device->commit_bytes_used = device->bytes_used;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
	device->fs_devices = fs_info->fs_devices;
259 260

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
261 262 263 264 265 266 267 268 269 270 271 272 273
	list_add(&device->dev_list, &fs_info->fs_devices->devices);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	*device_out = device;
	return 0;

error:
	blkdev_put(bdev, FMODE_EXCL);
	return ret;
}

274 275 276 277
/*
 * called from commit_transaction. Writes changed device replace state to
 * disk.
 */
278
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
279
{
280
	struct btrfs_fs_info *fs_info = trans->fs_info;
281 282 283 284 285 286 287 288
	int ret;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_replace_item *ptr;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

289
	down_read(&dev_replace->rwsem);
290 291
	if (!dev_replace->is_valid ||
	    !dev_replace->item_needs_writeback) {
292
		up_read(&dev_replace->rwsem);
293 294
		return 0;
	}
295
	up_read(&dev_replace->rwsem);
296 297 298 299 300 301 302 303 304 305 306 307

	key.objectid = 0;
	key.type = BTRFS_DEV_REPLACE_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
J
Jeff Mahoney 已提交
308 309 310
		btrfs_warn(fs_info,
			   "error %d while searching for dev_replace item!",
			   ret);
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/*
		 * need to delete old one and insert a new one.
		 * Since no attempt is made to recover any old state, if the
		 * dev_replace state is 'running', the data on the target
		 * drive is lost.
		 * It would be possible to recover the state: just make sure
		 * that the beginning of the item is never changed and always
		 * contains all the essential information. Then read this
		 * minimal set of information and use it as a base for the
		 * new state.
		 */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
J
Jeff Mahoney 已提交
329 330 331
			btrfs_warn(fs_info,
				   "delete too small dev_replace item failed %d!",
				   ret);
332 333 334 335 336 337 338 339 340 341 342
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
J
Jeff Mahoney 已提交
343 344
			btrfs_warn(fs_info,
				   "insert dev_replace item failed %d!", ret);
345 346 347 348 349 350 351 352
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0],
			     struct btrfs_dev_replace_item);

353
	down_write(&dev_replace->rwsem);
354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375
	if (dev_replace->srcdev)
		btrfs_set_dev_replace_src_devid(eb, ptr,
			dev_replace->srcdev->devid);
	else
		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
		dev_replace->cont_reading_from_srcdev_mode);
	btrfs_set_dev_replace_replace_state(eb, ptr,
		dev_replace->replace_state);
	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
	btrfs_set_dev_replace_num_write_errors(eb, ptr,
		atomic64_read(&dev_replace->num_write_errors));
	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
	dev_replace->cursor_left_last_write_of_item =
		dev_replace->cursor_left;
	btrfs_set_dev_replace_cursor_left(eb, ptr,
		dev_replace->cursor_left_last_write_of_item);
	btrfs_set_dev_replace_cursor_right(eb, ptr,
		dev_replace->cursor_right);
	dev_replace->item_needs_writeback = 0;
376
	up_write(&dev_replace->rwsem);
377 378 379 380 381 382 383 384 385

	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);

	return ret;
}

386 387
static char* btrfs_dev_name(struct btrfs_device *device)
{
388
	if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
389 390 391 392 393
		return "<missing disk>";
	else
		return rcu_str_deref(device->name);
}

394
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
395 396
		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
		int read_src)
397
{
398
	struct btrfs_root *root = fs_info->dev_root;
399 400 401 402 403 404
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret;
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_device *src_device = NULL;

405 406 407 408
	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
						  srcdev_name);
	if (IS_ERR(src_device))
		return PTR_ERR(src_device);
409

410 411 412 413 414 415 416
	if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
		btrfs_warn_in_rcu(fs_info,
	  "cannot replace device %s (devid %llu) due to active swapfile",
			btrfs_dev_name(src_device), src_device->devid);
		return -ETXTBSY;
	}

417 418 419 420 421 422
	/*
	 * Here we commit the transaction to make sure commit_total_bytes
	 * of all the devices are updated.
	 */
	trans = btrfs_attach_transaction(root);
	if (!IS_ERR(trans)) {
423
		ret = btrfs_commit_transaction(trans);
424 425 426 427 428 429
		if (ret)
			return ret;
	} else if (PTR_ERR(trans) != -ENOENT) {
		return PTR_ERR(trans);
	}

430 431 432 433 434
	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
					    src_device, &tgt_device);
	if (ret)
		return ret;

435
	down_write(&dev_replace->rwsem);
436 437 438 439 440 441 442
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
443
		ASSERT(0);
444
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
445
		up_write(&dev_replace->rwsem);
446 447 448
		goto leave;
	}

449
	dev_replace->cont_reading_from_srcdev_mode = read_src;
450 451 452
	dev_replace->srcdev = src_device;
	dev_replace->tgtdev = tgt_device;

A
Anand Jain 已提交
453
	btrfs_info_in_rcu(fs_info,
454
		      "dev_replace from %s (devid %llu) to %s started",
455
		      btrfs_dev_name(src_device),
456 457 458 459 460 461 462 463
		      src_device->devid,
		      rcu_str_deref(tgt_device->name));

	/*
	 * from now on, the writes to the srcdev are all duplicated to
	 * go to the tgtdev as well (refer to btrfs_map_block()).
	 */
	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
464
	dev_replace->time_started = ktime_get_real_seconds();
465 466 467 468 469 470
	dev_replace->cursor_left = 0;
	dev_replace->committed_cursor_left = 0;
	dev_replace->cursor_left_last_write_of_item = 0;
	dev_replace->cursor_right = 0;
	dev_replace->is_valid = 1;
	dev_replace->item_needs_writeback = 1;
471 472
	atomic64_set(&dev_replace->num_write_errors, 0);
	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
473
	up_write(&dev_replace->rwsem);
474

475 476
	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
	if (ret)
477
		btrfs_err(fs_info, "kobj add dev failed %d", ret);
478

479
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
480

481 482
	/* Commit dev_replace state and reserve 1 item for it. */
	trans = btrfs_start_transaction(root, 1);
483 484
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
485
		down_write(&dev_replace->rwsem);
486 487 488 489
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
		dev_replace->srcdev = NULL;
		dev_replace->tgtdev = NULL;
490
		up_write(&dev_replace->rwsem);
491 492 493
		goto leave;
	}

494
	ret = btrfs_commit_transaction(trans);
495 496 497 498
	WARN_ON(ret);

	/* the disk copy procedure reuses the scrub code */
	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
499
			      btrfs_device_get_total_bytes(src_device),
500 501
			      &dev_replace->scrub_progress, 0, 1);

A
Anand Jain 已提交
502
	ret = btrfs_dev_replace_finishing(fs_info, ret);
503
	if (ret == -EINPROGRESS) {
504
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
505
	} else if (ret != -ECANCELED) {
506 507
		WARN_ON(ret);
	}
508

509
	return ret;
510 511

leave:
512
	btrfs_destroy_dev_replace_tgtdev(tgt_device);
513 514 515
	return ret;
}

516
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
			    struct btrfs_ioctl_dev_replace_args *args)
{
	int ret;

	switch (args->start.cont_reading_from_srcdev_mode) {
	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
		break;
	default:
		return -EINVAL;
	}

	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
	    args->start.tgtdev_name[0] == '\0')
		return -EINVAL;

533
	ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
534 535 536 537 538
					args->start.srcdevid,
					args->start.srcdev_name,
					args->start.cont_reading_from_srcdev_mode);
	args->result = ret;
	/* don't warn if EINPROGRESS, someone else might be running scrub */
539 540 541
	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
	    ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
		return 0;
542 543 544 545

	return ret;
}

546
/*
547
 * blocked until all in-flight bios operations are finished.
548 549 550 551
 */
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
552 553
	wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
		   &fs_info->dev_replace.bio_counter));
554 555 556 557 558 559 560 561
}

/*
 * we have removed target device, it is safe to allow new bios request.
 */
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
562
	wake_up(&fs_info->dev_replace.replace_wait);
563 564
}

565 566 567 568 569 570 571 572 573 574 575 576 577 578
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
				       int scrub_ret)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *tgt_device;
	struct btrfs_device *src_device;
	struct btrfs_root *root = fs_info->tree_root;
	u8 uuid_tmp[BTRFS_UUID_SIZE];
	struct btrfs_trans_handle *trans;
	int ret = 0;

	/* don't allow cancel or unmount to disturb the finishing procedure */
	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);

579
	down_read(&dev_replace->rwsem);
580 581 582
	/* was the operation canceled, or is it finished? */
	if (dev_replace->replace_state !=
	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
583
		up_read(&dev_replace->rwsem);
584 585 586 587 588 589
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return 0;
	}

	tgt_device = dev_replace->tgtdev;
	src_device = dev_replace->srcdev;
590
	up_read(&dev_replace->rwsem);
591 592 593 594 595

	/*
	 * flush all outstanding I/O and inode extent mappings before the
	 * copy operation is declared as being finished
	 */
596
	ret = btrfs_start_delalloc_roots(fs_info, -1);
597 598 599 600
	if (ret) {
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return ret;
	}
601
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
602

603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627
	/*
	 * We have to use this loop approach because at this point src_device
	 * has to be available for transaction commit to complete, yet new
	 * chunks shouldn't be allocated on the device.
	 */
	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			return PTR_ERR(trans);
		}
		ret = btrfs_commit_transaction(trans);
		WARN_ON(ret);

		/* Prevent write_all_supers() during the finishing procedure */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		/* Prevent new chunks being allocated on the source device */
		mutex_lock(&fs_info->chunk_mutex);

		if (!list_empty(&src_device->post_commit_list)) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			mutex_unlock(&fs_info->chunk_mutex);
		} else {
			break;
		}
628 629
	}

630
	down_write(&dev_replace->rwsem);
631 632 633 634 635
	dev_replace->replace_state =
		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
	dev_replace->tgtdev = NULL;
	dev_replace->srcdev = NULL;
636
	dev_replace->time_stopped = ktime_get_real_seconds();
637 638
	dev_replace->item_needs_writeback = 1;

639 640 641 642 643 644
	/* replace old device with new one in mapping tree */
	if (!scrub_ret) {
		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
								src_device,
								tgt_device);
	} else {
645 646
		if (scrub_ret != -ECANCELED)
			btrfs_err_in_rcu(fs_info,
647
				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
648
				 btrfs_dev_name(src_device),
649 650
				 src_device->devid,
				 rcu_str_deref(tgt_device->name), scrub_ret);
651
		up_write(&dev_replace->rwsem);
652 653
		mutex_unlock(&fs_info->chunk_mutex);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
654
		btrfs_rm_dev_replace_blocked(fs_info);
655
		if (tgt_device)
656
			btrfs_destroy_dev_replace_tgtdev(tgt_device);
657
		btrfs_rm_dev_replace_unblocked(fs_info);
658 659
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

660
		return scrub_ret;
661 662
	}

663 664
	btrfs_info_in_rcu(fs_info,
			  "dev_replace from %s (devid %llu) to %s finished",
665
			  btrfs_dev_name(src_device),
666 667
			  src_device->devid,
			  rcu_str_deref(tgt_device->name));
668
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
669 670 671 672 673
	tgt_device->devid = src_device->devid;
	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
674 675 676 677
	btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
	btrfs_device_set_disk_total_bytes(tgt_device,
					  src_device->disk_total_bytes);
	btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
678
	tgt_device->commit_bytes_used = src_device->bytes_used;
679

680
	btrfs_assign_next_active_device(src_device, tgt_device);
681

682
	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
683
	fs_info->fs_devices->rw_devices++;
684

685
	up_write(&dev_replace->rwsem);
686 687
	btrfs_rm_dev_replace_blocked(fs_info);

688
	btrfs_rm_dev_replace_remove_srcdev(src_device);
689

690 691
	btrfs_rm_dev_replace_unblocked(fs_info);

692 693 694 695 696 697
	/*
	 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
	 * update on-disk dev stats value during commit transaction
	 */
	atomic_inc(&tgt_device->dev_stats_ccnt);

698 699 700 701 702 703 704
	/*
	 * this is again a consistent state where no dev_replace procedure
	 * is running, the target device is part of the filesystem, the
	 * source device is not part of the filesystem anymore and its 1st
	 * superblock is scratched out so that it is no longer marked to
	 * belong to this filesystem.
	 */
705 706
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
707

708
	/* replace the sysfs entry */
709
	btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
710
	btrfs_rm_dev_replace_free_srcdev(src_device);
711

712 713 714
	/* write back the superblocks */
	trans = btrfs_start_transaction(root, 0);
	if (!IS_ERR(trans))
715
		btrfs_commit_transaction(trans);
716 717 718 719 720 721 722 723 724 725 726

	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

	return 0;
}

static void btrfs_dev_replace_update_device_in_mapping_tree(
						struct btrfs_fs_info *fs_info,
						struct btrfs_device *srcdev,
						struct btrfs_device *tgtdev)
{
727
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
728 729 730 731 732 733 734 735 736 737
	struct extent_map *em;
	struct map_lookup *map;
	u64 start = 0;
	int i;

	write_lock(&em_tree->lock);
	do {
		em = lookup_extent_mapping(em_tree, start, (u64)-1);
		if (!em)
			break;
738
		map = em->map_lookup;
739 740 741 742 743 744 745 746 747
		for (i = 0; i < map->num_stripes; i++)
			if (srcdev == map->stripes[i].dev)
				map->stripes[i].dev = tgtdev;
		start = em->start + em->len;
		free_extent_map(em);
	} while (start);
	write_unlock(&em_tree->lock);
}

748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776
/*
 * Read progress of device replace status according to the state and last
 * stored position. The value format is the same as for
 * btrfs_dev_replace::progress_1000
 */
static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 ret = 0;

	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		ret = 0;
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
		ret = 1000;
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		ret = div64_u64(dev_replace->cursor_left,
				div_u64(btrfs_device_get_total_bytes(
						dev_replace->srcdev), 1000));
		break;
	}

	return ret;
}

777 778 779 780 781
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
			      struct btrfs_ioctl_dev_replace_args *args)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

782
	down_read(&dev_replace->rwsem);
783 784 785 786 787 788 789 790 791 792
	/* even if !dev_replace_is_valid, the values are good enough for
	 * the replace_status ioctl */
	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
	args->status.replace_state = dev_replace->replace_state;
	args->status.time_started = dev_replace->time_started;
	args->status.time_stopped = dev_replace->time_stopped;
	args->status.num_write_errors =
		atomic64_read(&dev_replace->num_write_errors);
	args->status.num_uncorrectable_read_errors =
		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
793
	args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
794
	up_read(&dev_replace->rwsem);
795 796
}

797
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
798 799 800
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *tgt_device = NULL;
801
	struct btrfs_device *src_device = NULL;
802 803
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = fs_info->tree_root;
804
	int result;
805 806
	int ret;

807
	if (sb_rdonly(fs_info->sb))
808 809
		return -EROFS;

810
	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
811
	down_write(&dev_replace->rwsem);
812 813 814 815 816
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
817
		up_write(&dev_replace->rwsem);
818
		break;
819
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
820 821
		tgt_device = dev_replace->tgtdev;
		src_device = dev_replace->srcdev;
822
		up_write(&dev_replace->rwsem);
823 824 825 826 827 828 829 830 831 832 833 834 835 836
		ret = btrfs_scrub_cancel(fs_info);
		if (ret < 0) {
			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
		} else {
			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
			/*
			 * btrfs_dev_replace_finishing() will handle the
			 * cleanup part
			 */
			btrfs_info_in_rcu(fs_info,
				"dev_replace from %s (devid %llu) to %s canceled",
				btrfs_dev_name(src_device), src_device->devid,
				btrfs_dev_name(tgt_device));
		}
837
		break;
838
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
839 840 841 842
		/*
		 * Scrub doing the replace isn't running so we need to do the
		 * cleanup step of btrfs_dev_replace_finishing() here
		 */
843 844
		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
		tgt_device = dev_replace->tgtdev;
845
		src_device = dev_replace->srcdev;
846 847
		dev_replace->tgtdev = NULL;
		dev_replace->srcdev = NULL;
848 849 850 851
		dev_replace->replace_state =
				BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
		dev_replace->time_stopped = ktime_get_real_seconds();
		dev_replace->item_needs_writeback = 1;
852

853
		up_write(&dev_replace->rwsem);
854

855 856 857
		/* Scrub for replace must not be running in suspended state */
		ret = btrfs_scrub_cancel(fs_info);
		ASSERT(ret != -ENOTCONN);
858 859 860 861 862 863 864 865

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			return PTR_ERR(trans);
		}
		ret = btrfs_commit_transaction(trans);
		WARN_ON(ret);
866

867 868 869 870 871 872 873 874 875
		btrfs_info_in_rcu(fs_info,
		"suspended dev_replace from %s (devid %llu) to %s canceled",
			btrfs_dev_name(src_device), src_device->devid,
			btrfs_dev_name(tgt_device));

		if (tgt_device)
			btrfs_destroy_dev_replace_tgtdev(tgt_device);
		break;
	default:
876
		up_write(&dev_replace->rwsem);
877 878
		result = -EINVAL;
	}
879 880 881 882 883 884 885 886 887 888

	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
	return result;
}

void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
889 890
	down_write(&dev_replace->rwsem);

891 892 893 894 895 896 897 898 899
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
900
		dev_replace->time_stopped = ktime_get_real_seconds();
901
		dev_replace->item_needs_writeback = 1;
902
		btrfs_info(fs_info, "suspending dev_replace for unmount");
903 904 905
		break;
	}

906
	up_write(&dev_replace->rwsem);
907 908 909 910 911 912 913 914 915
	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}

/* resume dev_replace procedure that was interrupted by unmount */
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

916 917
	down_write(&dev_replace->rwsem);

918 919 920 921
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
922
		up_write(&dev_replace->rwsem);
923 924 925 926 927 928 929 930 931
		return 0;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
		break;
	}
	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
932
		btrfs_info(fs_info,
J
Jeff Mahoney 已提交
933 934 935
			   "cannot continue dev_replace, tgtdev is missing");
		btrfs_info(fs_info,
			   "you may cancel the operation after 'mount -o degraded'");
936 937
		dev_replace->replace_state =
					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
938
		up_write(&dev_replace->rwsem);
939 940
		return 0;
	}
941
	up_write(&dev_replace->rwsem);
942

943 944 945 946 947 948
	/*
	 * This could collide with a paused balance, but the exclusive op logic
	 * should never allow both to start and pause. We don't want to allow
	 * dev-replace to start anyway.
	 */
	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
949
		down_write(&dev_replace->rwsem);
950 951
		dev_replace->replace_state =
					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
952
		up_write(&dev_replace->rwsem);
953 954 955 956 957
		btrfs_info(fs_info,
		"cannot resume dev-replace, other exclusive operation running");
		return 0;
	}

958
	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
959
	return PTR_ERR_OR_ZERO(task);
960 961 962 963 964 965 966
}

static int btrfs_dev_replace_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 progress;
967
	int ret;
968

969 970 971
	progress = btrfs_dev_replace_progress(fs_info);
	progress = div_u64(progress, 10);
	btrfs_info_in_rcu(fs_info,
972 973
		"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
		btrfs_dev_name(dev_replace->srcdev),
974
		dev_replace->srcdev->devid,
975
		btrfs_dev_name(dev_replace->tgtdev),
976 977
		(unsigned int)progress);

978 979
	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
			      dev_replace->committed_cursor_left,
980
			      btrfs_device_get_total_bytes(dev_replace->srcdev),
981 982
			      &dev_replace->scrub_progress, 0, 1);
	ret = btrfs_dev_replace_finishing(fs_info, ret);
983
	WARN_ON(ret && ret != -ECANCELED);
984 985

	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
986 987 988
	return 0;
}

989
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005
{
	if (!dev_replace->is_valid)
		return 0;

	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		return 0;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		/*
		 * return true even if tgtdev is missing (this is
		 * something that can happen if the dev_replace
		 * procedure is suspended by an umount and then
		 * the tgtdev is missing (or "btrfs dev scan") was
1006
		 * not called and the filesystem is remounted
1007 1008
		 * in degraded state. This does not stop the
		 * dev_replace procedure. It needs to be canceled
1009
		 * manually if the cancellation is wanted.
1010 1011 1012 1013 1014 1015
		 */
		break;
	}
	return 1;
}

1016 1017
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
1018
	percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1019 1020
}

1021
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
1022
{
1023 1024
	percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
	cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
1025 1026 1027 1028
}

void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
1029
	while (1) {
1030
		percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1031 1032 1033 1034
		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
				     &fs_info->fs_state)))
			break;

1035
		btrfs_bio_counter_dec(fs_info);
1036
		wait_event(fs_info->dev_replace.replace_wait,
1037 1038 1039 1040
			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
				     &fs_info->fs_state));
	}
}