scrub.c 156.0 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
A
Arne Jansen 已提交
2
/*
3
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
A
Arne Jansen 已提交
4 5 6
 */

#include <linux/blkdev.h>
7
#include <linux/ratelimit.h>
8
#include <linux/sched/mm.h>
9
#include <crypto/hash.h>
A
Arne Jansen 已提交
10
#include "ctree.h"
11
#include "discard.h"
A
Arne Jansen 已提交
12 13 14
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
15
#include "transaction.h"
16
#include "backref.h"
17
#include "extent_io.h"
18
#include "dev-replace.h"
19
#include "check-integrity.h"
D
David Woodhouse 已提交
20
#include "raid56.h"
21
#include "block-group.h"
22
#include "zoned.h"
23
#include "fs.h"
24
#include "accessors.h"
25
#include "file-item.h"
26
#include "scrub.h"
A
Arne Jansen 已提交
27 28 29 30 31 32 33 34 35 36 37 38 39 40

/*
 * This is only the first step towards a full-features scrub. It reads all
 * extent and super block and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

41
struct scrub_block;
42
struct scrub_ctx;
A
Arne Jansen 已提交
43

44
/*
45 46
 * The following three values only influence the performance.
 *
47
 * The last one configures the number of parallel and outstanding I/O
48
 * operations. The first one configures an upper limit for the number
49 50
 * of (dynamically allocated) pages that are added to a bio.
 */
51 52
#define SCRUB_SECTORS_PER_BIO	32	/* 128KiB per bio for 4KiB pages */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MiB per device in flight for 4KiB pages */
53 54

/*
55
 * The following value times PAGE_SIZE needs to be large enough to match the
56 57
 * largest node/leaf/sector size that shall be supported.
 */
58
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
A
Arne Jansen 已提交
59

60 61
#define SCRUB_MAX_PAGES			(DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))

62 63 64 65 66 67 68 69 70 71 72
/*
 * Maximum number of mirrors that can be available for all profiles counting
 * the target device of dev-replace as one. During an active device replace
 * procedure, the target device of the copy operation is a mirror for the
 * filesystem data as well that can be used to read data in order to repair
 * read errors on other disks.
 *
 * Current value is derived from RAID1C4 with 4 copies.
 */
#define BTRFS_MAX_MIRRORS (4 + 1)

73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	bool is_metadata;

	union {
		/*
		 * Csum pointer for data csum verification.  Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification.  All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};

enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,
};

#define SCRUB_STRIPE_PAGES		(BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
108
	struct scrub_ctx *sctx;
109 110 111 112 113 114 115 116 117 118 119 120 121 122
	struct btrfs_block_group *bg;

	struct page *pages[SCRUB_STRIPE_PAGES];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

123 124 125 126 127 128 129
	/*
	 * How many data/meta extents are in this stripe.  Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

130 131
	atomic_t pending_io;
	wait_queue_head_t io_wait;
132
	wait_queue_head_t repair_wait;
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162

	/*
	 * Indicate the states of the stripe.  Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* Indicate which sectors are covered by extent items. */
	unsigned long extent_sector_bitmap;

	/*
	 * The errors hit during the initial read of the stripe.
	 *
	 * Would be utilized for error reporting and repair.
	 */
	unsigned long init_error_bitmap;

	/*
	 * The following error bitmaps are all for the current status.
	 * Every time we submit a new read, these bitmaps may be updated.
	 *
	 * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
	 *
	 * IO and csum errors can happen for both metadata and data.
	 */
	unsigned long error_bitmap;
	unsigned long io_error_bitmap;
	unsigned long csum_error_bitmap;
	unsigned long meta_error_bitmap;

163 164 165 166 167 168
	/* For writeback (repair or replace) error reporting. */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

169 170 171 172 173
	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;
174 175

	struct work_struct work;
176 177
};

178
struct scrub_recover {
179
	refcount_t		refs;
180
	struct btrfs_io_context	*bioc;
181 182 183
	u64			map_length;
};

184
struct scrub_sector {
185
	struct scrub_block	*sblock;
186
	struct list_head	list;
A
Arne Jansen 已提交
187 188
	u64			flags;  /* extent flags */
	u64			generation;
189 190
	/* Offset in bytes to @sblock. */
	u32			offset;
191
	atomic_t		refs;
192 193
	unsigned int		have_csum:1;
	unsigned int		io_error:1;
A
Arne Jansen 已提交
194
	u8			csum[BTRFS_CSUM_SIZE];
195 196

	struct scrub_recover	*recover;
A
Arne Jansen 已提交
197 198 199 200
};

struct scrub_bio {
	int			index;
201
	struct scrub_ctx	*sctx;
202
	struct btrfs_device	*dev;
A
Arne Jansen 已提交
203
	struct bio		*bio;
204
	blk_status_t		status;
A
Arne Jansen 已提交
205 206
	u64			logical;
	u64			physical;
207 208
	struct scrub_sector	*sectors[SCRUB_SECTORS_PER_BIO];
	int			sector_count;
A
Arne Jansen 已提交
209
	int			next_free;
210
	struct work_struct	work;
A
Arne Jansen 已提交
211 212
};

213
struct scrub_block {
214 215 216 217 218
	/*
	 * Each page will have its page::private used to record the logical
	 * bytenr.
	 */
	struct page		*pages[SCRUB_MAX_PAGES];
219
	struct scrub_sector	*sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
220
	struct btrfs_device	*dev;
221 222
	/* Logical bytenr of the sblock */
	u64			logical;
223 224
	u64			physical;
	u64			physical_for_dev_replace;
225 226
	/* Length of sblock in bytes */
	u32			len;
227
	int			sector_count;
228
	int			mirror_num;
229

230
	atomic_t		outstanding_sectors;
231
	refcount_t		refs; /* free mem on transition to zero */
232
	struct scrub_ctx	*sctx;
233
	struct scrub_parity	*sparity;
234 235 236 237
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
238
		unsigned int	generation_error:1; /* also sets header_error */
239 240 241 242

		/* The following is for the data used to check parity */
		/* It is for the data with checksum */
		unsigned int	data_corrected:1;
243
	};
244
	struct work_struct	work;
245 246
};

247 248 249 250 251 252 253 254 255 256 257 258
/* Used for the chunks with parity stripe such RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

259
	u32			stripe_len;
260

261
	refcount_t		refs;
262

263
	struct list_head	sectors_list;
264 265

	/* Work of parity check and repair */
266
	struct work_struct	work;
267 268

	/* Mark the parity blocks which have data */
269
	unsigned long		dbitmap;
270 271 272 273 274

	/*
	 * Mark the parity blocks which have data, but errors happen when
	 * read data or check data
	 */
275
	unsigned long		ebitmap;
276 277
};

278
struct scrub_ctx {
279
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
280
	struct btrfs_fs_info	*fs_info;
A
Arne Jansen 已提交
281 282
	int			first_free;
	int			curr;
283 284
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
A
Arne Jansen 已提交
285 286 287 288
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	struct list_head	csum_list;
	atomic_t		cancel_req;
A
Arne Jansen 已提交
289
	int			readonly;
290
	int			sectors_per_bio;
291

292 293 294 295
	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

296
	int			is_dev_replace;
297
	u64			write_pointer;
298 299 300 301

	struct scrub_bio        *wr_curr_bio;
	struct mutex            wr_lock;
	struct btrfs_device     *wr_tgtdev;
302
	bool                    flush_all_writes;
303

A
Arne Jansen 已提交
304 305 306 307 308
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
309 310 311 312 313 314 315 316

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
317
	refcount_t              refs;
A
Arne Jansen 已提交
318 319
};

320 321 322 323
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
D
David Sterba 已提交
324
	u64			physical;
325 326 327 328
	u64			logical;
	struct btrfs_device	*dev;
};

329 330 331 332 333 334 335
struct full_stripe_lock {
	struct rb_node node;
	u64 logical;
	u64 refs;
	struct mutex mutex;
};

336
#ifndef CONFIG_64BIT
337
/* This structure is for architectures whose (void *) is smaller than u64 */
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373
struct scrub_page_private {
	u64 logical;
};
#endif

static int attach_scrub_page_private(struct page *page, u64 logical)
{
#ifdef CONFIG_64BIT
	attach_page_private(page, (void *)logical);
	return 0;
#else
	struct scrub_page_private *spp;

	spp = kmalloc(sizeof(*spp), GFP_KERNEL);
	if (!spp)
		return -ENOMEM;
	spp->logical = logical;
	attach_page_private(page, (void *)spp);
	return 0;
#endif
}

static void detach_scrub_page_private(struct page *page)
{
#ifdef CONFIG_64BIT
	detach_page_private(page);
	return;
#else
	struct scrub_page_private *spp;

	spp = detach_page_private(page);
	kfree(spp);
	return;
#endif
}

374 375 376 377 378 379 380 381 382 383 384 385 386 387
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
	if (!stripe)
		return;

	for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
		if (stripe->pages[i])
			__free_page(stripe->pages[i]);
		stripe->pages[i] = NULL;
	}
	kfree(stripe->sectors);
	kfree(stripe->csums);
	stripe->sectors = NULL;
	stripe->csums = NULL;
388
	stripe->sctx = NULL;
389 390 391 392 393 394 395 396 397 398 399 400 401
	stripe->state = 0;
}

int init_scrub_stripe(struct btrfs_fs_info *fs_info, struct scrub_stripe *stripe)
{
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	stripe->state = 0;

	init_waitqueue_head(&stripe->io_wait);
402
	init_waitqueue_head(&stripe->repair_wait);
403
	atomic_set(&stripe->pending_io, 0);
404
	spin_lock_init(&stripe->write_error_lock);
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425

	ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
	if (ret < 0)
		goto error;

	stripe->sectors = kcalloc(stripe->nr_sectors,
				  sizeof(struct scrub_sector_verification),
				  GFP_KERNEL);
	if (!stripe->sectors)
		goto error;

	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;
	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}

426
static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
427 428 429 430
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

431 432 433 434 435
static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
					     struct btrfs_device *dev,
					     u64 logical, u64 physical,
					     u64 physical_for_dev_replace,
					     int mirror_num)
436 437 438 439 440 441 442 443
{
	struct scrub_block *sblock;

	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
	if (!sblock)
		return NULL;
	refcount_set(&sblock->refs, 1);
	sblock->sctx = sctx;
444
	sblock->logical = logical;
445 446 447 448
	sblock->physical = physical;
	sblock->physical_for_dev_replace = physical_for_dev_replace;
	sblock->dev = dev;
	sblock->mirror_num = mirror_num;
449
	sblock->no_io_error_seen = 1;
450 451 452 453
	/*
	 * Scrub_block::pages will be allocated at alloc_scrub_sector() when
	 * the corresponding page is not allocated.
	 */
454 455 456
	return sblock;
}

457 458 459 460 461 462
/*
 * Allocate a new scrub sector and attach it to @sblock.
 *
 * Will also allocate new pages for @sblock if needed.
 */
static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
463
					       u64 logical)
464
{
465
	const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
466 467
	struct scrub_sector *ssector;

468 469 470
	/* We must never have scrub_block exceed U32_MAX in size. */
	ASSERT(logical - sblock->logical < U32_MAX);

471
	ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
472 473
	if (!ssector)
		return NULL;
474 475 476 477 478

	/* Allocate a new page if the slot is not allocated */
	if (!sblock->pages[page_index]) {
		int ret;

479
		sblock->pages[page_index] = alloc_page(GFP_KERNEL);
480 481 482 483 484 485 486 487 488 489 490 491
		if (!sblock->pages[page_index]) {
			kfree(ssector);
			return NULL;
		}
		ret = attach_scrub_page_private(sblock->pages[page_index],
				sblock->logical + (page_index << PAGE_SHIFT));
		if (ret < 0) {
			kfree(ssector);
			__free_page(sblock->pages[page_index]);
			sblock->pages[page_index] = NULL;
			return NULL;
		}
492
	}
493

494 495 496 497
	atomic_set(&ssector->refs, 1);
	ssector->sblock = sblock;
	/* The sector to be added should not be used */
	ASSERT(sblock->sectors[sblock->sector_count] == NULL);
498
	ssector->offset = logical - sblock->logical;
499

500 501 502 503 504
	/* The sector count must be smaller than the limit */
	ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);

	sblock->sectors[sblock->sector_count] = ssector;
	sblock->sector_count++;
505
	sblock->len += sblock->sctx->fs_info->sectorsize;
506 507 508 509

	return ssector;
}

510 511 512
static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
{
	struct scrub_block *sblock = ssector->sblock;
513
	pgoff_t index;
514 515 516 517 518 519 520
	/*
	 * When calling this function, ssector must be alreaday attached to the
	 * parent sblock.
	 */
	ASSERT(sblock);

	/* The range should be inside the sblock range */
521
	ASSERT(ssector->offset < sblock->len);
522

523
	index = ssector->offset >> PAGE_SHIFT;
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
	ASSERT(index < SCRUB_MAX_PAGES);
	ASSERT(sblock->pages[index]);
	ASSERT(PagePrivate(sblock->pages[index]));
	return sblock->pages[index];
}

static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
{
	struct scrub_block *sblock = ssector->sblock;

	/*
	 * When calling this function, ssector must be already attached to the
	 * parent sblock.
	 */
	ASSERT(sblock);

	/* The range should be inside the sblock range */
541
	ASSERT(ssector->offset < sblock->len);
542

543
	return offset_in_page(ssector->offset);
544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
}

static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
{
	return page_address(scrub_sector_get_page(ssector)) +
	       scrub_sector_get_page_offset(ssector);
}

static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
				unsigned int len)
{
	return bio_add_page(bio, scrub_sector_get_page(ssector), len,
			    scrub_sector_get_page_offset(ssector));
}

559
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
560
				     struct scrub_block *sblocks_for_recheck[]);
561
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
562 563
				struct scrub_block *sblock,
				int retry_failed_mirror);
564
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
565
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
566
					     struct scrub_block *sblock_good);
567
static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
568
					    struct scrub_block *sblock_good,
569
					    int sector_num, int force_write);
570
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
571 572
static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
					     int sector_num);
573 574 575 576
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
577 578
static void scrub_sector_get(struct scrub_sector *sector);
static void scrub_sector_put(struct scrub_sector *sector);
579 580
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
581 582 583 584
static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
			 u64 physical, struct btrfs_device *dev, u64 flags,
			 u64 gen, int mirror_num, u8 *csum,
			 u64 physical_for_dev_replace);
585
static void scrub_bio_end_io(struct bio *bio);
586
static void scrub_bio_end_io_worker(struct work_struct *work);
587
static void scrub_block_complete(struct scrub_block *sblock);
588 589
static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
				      struct scrub_sector *sector);
590
static void scrub_wr_submit(struct scrub_ctx *sctx);
591
static void scrub_wr_bio_end_io(struct bio *bio);
592
static void scrub_wr_bio_end_io_worker(struct work_struct *work);
593
static void scrub_put_ctx(struct scrub_ctx *sctx);
S
Stefan Behrens 已提交
594

595
static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
596
{
597 598
	return sector->recover &&
	       (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
599
}
S
Stefan Behrens 已提交
600

601 602
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
603
	refcount_inc(&sctx->refs);
604 605 606 607 608 609 610
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
611
	scrub_put_ctx(sctx);
612 613
}

614
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
615 616 617 618 619 620 621 622 623
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
		   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

624
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
625 626 627
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
628
}
629

630 631
static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
632 633 634 635 636 637 638 639
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

640 641 642 643 644 645
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
/*
 * Insert new full stripe lock into full stripe locks tree
 *
 * Return pointer to existing or newly inserted full_stripe_lock structure if
 * everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function
 */
static struct full_stripe_lock *insert_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct full_stripe_lock *entry;
	struct full_stripe_lock *ret;

665
	lockdep_assert_held(&locks_root->lock);
666 667 668 669 670 671 672 673 674 675 676 677 678 679 680

	p = &locks_root->root.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical) {
			p = &(*p)->rb_left;
		} else if (fstripe_logical > entry->logical) {
			p = &(*p)->rb_right;
		} else {
			entry->refs++;
			return entry;
		}
	}

681 682 683
	/*
	 * Insert new lock.
	 */
684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708
	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret)
		return ERR_PTR(-ENOMEM);
	ret->logical = fstripe_logical;
	ret->refs = 1;
	mutex_init(&ret->mutex);

	rb_link_node(&ret->node, parent, p);
	rb_insert_color(&ret->node, &locks_root->root);
	return ret;
}

/*
 * Search for a full stripe lock of a block group
 *
 * Return pointer to existing full stripe lock if found
 * Return NULL if not found
 */
static struct full_stripe_lock *search_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node *node;
	struct full_stripe_lock *entry;

709
	lockdep_assert_held(&locks_root->lock);
710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728

	node = locks_root->root.rb_node;
	while (node) {
		entry = rb_entry(node, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical)
			node = node->rb_left;
		else if (fstripe_logical > entry->logical)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Helper to get full stripe logical from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
729
static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
730 731 732 733 734 735 736 737 738 739 740 741 742
{
	u64 ret;

	/*
	 * Due to chunk item size limit, full stripe length should not be
	 * larger than U32_MAX. Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle power of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
743 744
	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
			cache->full_stripe_len + cache->start;
745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761
	return ret;
}

/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parities (RAID5/6), for other profiles it
 * does nothing.
 *
 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
 * So caller must call unlock_full_stripe() at the same context.
 *
 * Return <0 if encounters error.
 */
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			    bool *locked_ret)
{
762
	struct btrfs_block_group *bg_cache;
763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *existing;
	u64 fstripe_start;
	int ret = 0;

	*locked_ret = false;
	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}

	/* Profiles not based on parity don't need full stripe lock */
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;
	locks_root = &bg_cache->full_stripe_locks_root;

	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	/* Now insert the full stripe lock */
	mutex_lock(&locks_root->lock);
	existing = insert_full_stripe_lock(locks_root, fstripe_start);
	mutex_unlock(&locks_root->lock);
	if (IS_ERR(existing)) {
		ret = PTR_ERR(existing);
		goto out;
	}
	mutex_lock(&existing->mutex);
	*locked_ret = true;
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context calling corresponding
 * lock_full_stripe().
 *
 * Return 0 if we unlock full stripe without problem.
 * Return <0 for error
 */
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			      bool locked)
{
809
	struct btrfs_block_group *bg_cache;
810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *fstripe_lock;
	u64 fstripe_start;
	bool freeit = false;
	int ret = 0;

	/* If we didn't acquire full stripe lock, no need to continue */
	if (!locked)
		return 0;

	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;

	locks_root = &bg_cache->full_stripe_locks_root;
	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	mutex_lock(&locks_root->lock);
	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
	/* Unpaired unlock_full_stripe() detected */
	if (!fstripe_lock) {
		WARN_ON(1);
		ret = -ENOENT;
		mutex_unlock(&locks_root->lock);
		goto out;
	}

	if (fstripe_lock->refs == 0) {
		WARN_ON(1);
		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
			fstripe_lock->logical);
	} else {
		fstripe_lock->refs--;
	}

	if (fstripe_lock->refs == 0) {
		rb_erase(&fstripe_lock->node, &locks_root->root);
		freeit = true;
	}
	mutex_unlock(&locks_root->lock);

	mutex_unlock(&fstripe_lock->mutex);
	if (freeit)
		kfree(fstripe_lock);
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

863
static void scrub_free_csums(struct scrub_ctx *sctx)
A
Arne Jansen 已提交
864
{
865
	while (!list_empty(&sctx->csum_list)) {
A
Arne Jansen 已提交
866
		struct btrfs_ordered_sum *sum;
867
		sum = list_first_entry(&sctx->csum_list,
A
Arne Jansen 已提交
868 869 870 871 872 873
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

874
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
A
Arne Jansen 已提交
875 876 877
{
	int i;

878
	if (!sctx)
A
Arne Jansen 已提交
879 880
		return;

881
	/* this can happen when scrub is cancelled */
882 883
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];
884

885
		for (i = 0; i < sbio->sector_count; i++)
886
			scrub_block_put(sbio->sectors[i]->sblock);
887 888 889
		bio_put(sbio->bio);
	}

890
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
891
		struct scrub_bio *sbio = sctx->bios[i];
A
Arne Jansen 已提交
892 893 894 895 896 897

		if (!sbio)
			break;
		kfree(sbio);
	}

898
	kfree(sctx->wr_curr_bio);
899 900
	scrub_free_csums(sctx);
	kfree(sctx);
A
Arne Jansen 已提交
901 902
}

903 904
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
905
	if (refcount_dec_and_test(&sctx->refs))
906 907 908
		scrub_free_ctx(sctx);
}

909 910
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
A
Arne Jansen 已提交
911
{
912
	struct scrub_ctx *sctx;
A
Arne Jansen 已提交
913 914
	int		i;

915
	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
916
	if (!sctx)
A
Arne Jansen 已提交
917
		goto nomem;
918
	refcount_set(&sctx->refs, 1);
919
	sctx->is_dev_replace = is_dev_replace;
920
	sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
921
	sctx->curr = -1;
922
	sctx->fs_info = fs_info;
923
	INIT_LIST_HEAD(&sctx->csum_list);
924
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
A
Arne Jansen 已提交
925 926
		struct scrub_bio *sbio;

927
		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
A
Arne Jansen 已提交
928 929
		if (!sbio)
			goto nomem;
930
		sctx->bios[i] = sbio;
A
Arne Jansen 已提交
931 932

		sbio->index = i;
933
		sbio->sctx = sctx;
934
		sbio->sector_count = 0;
935
		INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
A
Arne Jansen 已提交
936

937
		if (i != SCRUB_BIOS_PER_SCTX - 1)
938
			sctx->bios[i]->next_free = i + 1;
939
		else
940 941 942
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
943 944
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
945 946 947 948 949
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);
950
	sctx->throttle_deadline = 0;
951

952 953 954
	WARN_ON(sctx->wr_curr_bio != NULL);
	mutex_init(&sctx->wr_lock);
	sctx->wr_curr_bio = NULL;
955
	if (is_dev_replace) {
956 957
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
958
		sctx->flush_all_writes = false;
959
	}
960

961
	return sctx;
A
Arne Jansen 已提交
962 963

nomem:
964
	scrub_free_ctx(sctx);
A
Arne Jansen 已提交
965 966 967
	return ERR_PTR(-ENOMEM);
}

968 969
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
970 971 972 973
{
	u32 nlink;
	int ret;
	int i;
974
	unsigned nofs_flag;
975 976
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
977
	struct scrub_warning *swarn = warn_ctx;
978
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
979 980
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
981
	struct btrfs_key key;
982

D
David Sterba 已提交
983
	local_root = btrfs_get_fs_root(fs_info, root, true);
984 985 986 987 988
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

989 990 991
	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
992 993 994 995 996
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
997
	if (ret) {
998
		btrfs_put_root(local_root);
999 1000 1001 1002 1003 1004 1005 1006 1007 1008
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

1009 1010 1011 1012 1013 1014
	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
1015
	ipath = init_ipath(4096, local_root, swarn->path);
1016
	memalloc_nofs_restore(nofs_flag);
1017
	if (IS_ERR(ipath)) {
1018
		btrfs_put_root(local_root);
1019 1020 1021 1022
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
1023 1024 1025 1026 1027 1028 1029 1030 1031 1032
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
J
Jeff Mahoney 已提交
1033
		btrfs_warn_in_rcu(fs_info,
1034
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
J
Jeff Mahoney 已提交
1035
				  swarn->errstr, swarn->logical,
1036
				  btrfs_dev_name(swarn->dev),
D
David Sterba 已提交
1037
				  swarn->physical,
J
Jeff Mahoney 已提交
1038
				  root, inum, offset,
1039
				  fs_info->sectorsize, nlink,
J
Jeff Mahoney 已提交
1040
				  (char *)(unsigned long)ipath->fspath->val[i]);
1041

1042
	btrfs_put_root(local_root);
1043 1044 1045 1046
	free_ipath(ipath);
	return 0;

err:
J
Jeff Mahoney 已提交
1047
	btrfs_warn_in_rcu(fs_info,
D
David Sterba 已提交
1048
			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
J
Jeff Mahoney 已提交
1049
			  swarn->errstr, swarn->logical,
1050
			  btrfs_dev_name(swarn->dev),
D
David Sterba 已提交
1051
			  swarn->physical,
J
Jeff Mahoney 已提交
1052
			  root, inum, offset, ret);
1053 1054 1055 1056 1057

	free_ipath(ipath);
	return 0;
}

1058 1059
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
				       bool is_super, u64 logical, u64 physical)
1060
{
1061
	struct btrfs_fs_info *fs_info = dev->fs_info;
1062 1063 1064 1065 1066
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
1067 1068
	unsigned long ptr = 0;
	u64 flags = 0;
1069
	u64 ref_root;
1070
	u32 item_size;
1071
	u8 ref_level = 0;
1072
	int ret;
1073

1074
	/* Super block error, no need to search extent tree. */
1075
	if (is_super) {
1076
		btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
1077
				  errstr, btrfs_dev_name(dev), physical);
1078 1079
		return;
	}
1080
	path = btrfs_alloc_path();
1081 1082
	if (!path)
		return;
1083

1084 1085
	swarn.physical = physical;
	swarn.logical = logical;
1086
	swarn.errstr = errstr;
1087
	swarn.dev = NULL;
1088

1089 1090
	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
1091 1092 1093 1094 1095 1096 1097
	if (ret < 0)
		goto out;

	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
1098
	item_size = btrfs_item_size(eb, path->slots[0]);
1099

1100
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1101
		do {
1102 1103 1104
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
1105
			btrfs_warn_in_rcu(fs_info,
D
David Sterba 已提交
1106
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
J
Jeff Mahoney 已提交
1107
				errstr, swarn.logical,
1108
				btrfs_dev_name(dev),
D
David Sterba 已提交
1109
				swarn.physical,
1110 1111 1112 1113
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
1114
		btrfs_release_path(path);
1115
	} else {
1116 1117
		struct btrfs_backref_walk_ctx ctx = { 0 };

1118
		btrfs_release_path(path);
1119 1120 1121 1122 1123

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = swarn.logical - found_key.objectid;
		ctx.fs_info = fs_info;

1124
		swarn.path = path;
1125
		swarn.dev = dev;
1126 1127

		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
1128 1129 1130 1131 1132 1133
	}

out:
	btrfs_free_path(path);
}

1134 1135 1136 1137 1138 1139 1140
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	scrub_print_common_warning(errstr, sblock->dev,
			sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER,
			sblock->logical, sblock->physical);
}

1141 1142
static inline void scrub_get_recover(struct scrub_recover *recover)
{
1143
	refcount_inc(&recover->refs);
1144 1145
}

1146 1147
static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				     struct scrub_recover *recover)
1148
{
1149
	if (refcount_dec_and_test(&recover->refs)) {
1150
		btrfs_bio_counter_dec(fs_info);
1151
		btrfs_put_bioc(recover->bioc);
1152 1153 1154 1155
		kfree(recover);
	}
}

A
Arne Jansen 已提交
1156
/*
1157
 * scrub_handle_errored_block gets called when either verification of the
1158 1159
 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all sectors in the bio, even though only one
1160 1161 1162
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
A
Arne Jansen 已提交
1163
 */
1164
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
A
Arne Jansen 已提交
1165
{
1166
	struct scrub_ctx *sctx = sblock_to_check->sctx;
1167
	struct btrfs_device *dev = sblock_to_check->dev;
1168 1169 1170 1171 1172
	struct btrfs_fs_info *fs_info;
	u64 logical;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
1173 1174
	/* One scrub_block for each mirror */
	struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
1175 1176 1177
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
1178
	int sector_num;
1179
	int success;
1180
	bool full_stripe_locked;
1181
	unsigned int nofs_flag;
1182
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1183 1184
				      DEFAULT_RATELIMIT_BURST);

1185
	BUG_ON(sblock_to_check->sector_count < 1);
1186
	fs_info = sctx->fs_info;
1187
	if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1188
		/*
1189
		 * If we find an error in a super block, we just report it.
1190 1191 1192
		 * They will get written with the next transaction commit
		 * anyway
		 */
1193
		scrub_print_warning("super block error", sblock_to_check);
1194 1195 1196
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
1197
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
1198 1199
		return 0;
	}
1200 1201 1202
	logical = sblock_to_check->logical;
	ASSERT(sblock_to_check->mirror_num);
	failed_mirror_index = sblock_to_check->mirror_num - 1;
1203
	is_metadata = !(sblock_to_check->sectors[0]->flags &
1204
			BTRFS_EXTENT_FLAG_DATA);
1205
	have_csum = sblock_to_check->sectors[0]->have_csum;
1206

1207 1208
	if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
		return 0;
1209

1210 1211 1212 1213 1214 1215
	/*
	 * We must use GFP_NOFS because the scrub task might be waiting for a
	 * worker task executing this function and in turn a transaction commit
	 * might be waiting the scrub task to pause (which needs to wait for all
	 * the worker tasks to complete before pausing).
	 * We do allocations in the workers through insert_full_stripe_lock()
1216
	 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
1217 1218 1219
	 * this function.
	 */
	nofs_flag = memalloc_nofs_save();
1220 1221 1222 1223 1224 1225 1226 1227 1228
	/*
	 * For RAID5/6, race can happen for a different device scrub thread.
	 * For data corruption, Parity and Data threads will both try
	 * to recovery the data.
	 * Race can lead to doubly added csum error, or even unrecoverable
	 * error.
	 */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0) {
1229
		memalloc_nofs_restore(nofs_flag);
1230 1231 1232 1233 1234 1235 1236 1237 1238
		spin_lock(&sctx->stat_lock);
		if (ret == -ENOMEM)
			sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		return ret;
	}

1239 1240 1241 1242
	/*
	 * read all mirrors one after the other. This includes to
	 * re-read the extent or metadata block that failed (that was
	 * the cause that this fixup code is called) another time,
1243
	 * sector by sector this time in order to know which sectors
1244 1245 1246 1247
	 * caused I/O errors and which ones are good (for all mirrors).
	 * It is the goal to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
1248 1249 1250 1251 1252 1253 1254 1255 1256 1257
	 * sectors from those mirrors without I/O error on the
	 * particular sectors. One example (with blocks >= 2 * sectorsize)
	 * would be that mirror #1 has an I/O error on the first sector,
	 * the second sector is good, and mirror #2 has an I/O error on
	 * the second sector, but the first sector is good.
	 * Then the first sector of the first mirror can be repaired by
	 * taking the first sector of the second mirror, and the
	 * second sector of the second mirror can be repaired by
	 * copying the contents of the 2nd sector of the 1st mirror.
	 * One more note: if the sectors of one mirror contain I/O
1258 1259 1260
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
1261
	 * Only if this is not possible, the sectors are picked from
1262 1263 1264 1265 1266
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */
1267
	for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
1268 1269 1270 1271 1272 1273
		/*
		 * Note: the two members refs and outstanding_sectors are not
		 * used in the blocks that are used for the recheck procedure.
		 *
		 * But alloc_scrub_block() will initialize sblock::ref anyway,
		 * so we can use scrub_block_put() to clean them up.
1274 1275 1276
		 *
		 * And here we don't setup the physical/dev for the sblock yet,
		 * they will be correctly initialized in scrub_setup_recheck_block().
1277
		 */
1278 1279
		sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
							logical, 0, 0, mirror_index);
1280 1281 1282 1283 1284 1285 1286 1287 1288
		if (!sblocks_for_recheck[mirror_index]) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.malloc_errors++;
			sctx->stat.read_errors++;
			sctx->stat.uncorrectable_errors++;
			spin_unlock(&sctx->stat_lock);
			btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
			goto out;
		}
A
Arne Jansen 已提交
1289 1290
	}

1291
	/* Setup the context, map the logical blocks and alloc the sectors */
1292
	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1293
	if (ret) {
1294 1295 1296 1297
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
1298
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1299 1300 1301
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1302
	sblock_bad = sblocks_for_recheck[failed_mirror_index];
1303

1304
	/* build and submit the bios for the failed mirror, check checksums */
1305
	scrub_recheck_block(fs_info, sblock_bad, 1);
A
Arne Jansen 已提交
1306

1307 1308 1309
	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
1310
		 * The error disappeared after reading sector by sector, or
1311 1312 1313 1314 1315 1316
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
1317 1318
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
1319
		sblock_to_check->data_corrected = 1;
1320
		spin_unlock(&sctx->stat_lock);
A
Arne Jansen 已提交
1321

1322 1323
		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
1324
		goto out;
A
Arne Jansen 已提交
1325 1326
	}

1327
	if (!sblock_bad->no_io_error_seen) {
1328 1329 1330
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
1331
		if (__ratelimit(&rs))
1332
			scrub_print_warning("i/o error", sblock_to_check);
1333
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1334
	} else if (sblock_bad->checksum_error) {
1335 1336 1337
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
1338
		if (__ratelimit(&rs))
1339
			scrub_print_warning("checksum error", sblock_to_check);
1340
		btrfs_dev_stat_inc_and_print(dev,
1341
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
1342
	} else if (sblock_bad->header_error) {
1343 1344 1345
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
1346
		if (__ratelimit(&rs))
1347 1348
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
1349
		if (sblock_bad->generation_error)
1350
			btrfs_dev_stat_inc_and_print(dev,
1351 1352
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
1353
			btrfs_dev_stat_inc_and_print(dev,
1354
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1355
	}
A
Arne Jansen 已提交
1356

1357 1358 1359 1360
	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}
A
Arne Jansen 已提交
1361

1362 1363
	/*
	 * now build and submit the bios for the other mirrors, check
1364 1365
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
1366 1367 1368 1369 1370
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
1371
	 * checksum is present, only those sectors are rewritten that had
1372
	 * an I/O error in the block to be repaired, since it cannot be
1373 1374
	 * determined, which copy of the other sectors is better (and it
	 * could happen otherwise that a correct sector would be
1375 1376
	 * overwritten by a bad one).
	 */
1377
	for (mirror_index = 0; ;mirror_index++) {
1378
		struct scrub_block *sblock_other;
1379

1380 1381
		if (mirror_index == failed_mirror_index)
			continue;
1382 1383

		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1384
		if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1385 1386
			if (mirror_index >= BTRFS_MAX_MIRRORS)
				break;
1387
			if (!sblocks_for_recheck[mirror_index]->sector_count)
1388 1389
				break;

1390
			sblock_other = sblocks_for_recheck[mirror_index];
1391
		} else {
1392
			struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1393
			int max_allowed = r->bioc->num_stripes - r->bioc->replace_nr_stripes;
1394 1395 1396

			if (mirror_index >= max_allowed)
				break;
1397
			if (!sblocks_for_recheck[1]->sector_count)
1398 1399 1400
				break;

			ASSERT(failed_mirror_index == 0);
1401
			sblock_other = sblocks_for_recheck[1];
1402
			sblock_other->mirror_num = 1 + mirror_index;
1403
		}
1404 1405

		/* build and submit the bios, check checksums */
1406
		scrub_recheck_block(fs_info, sblock_other, 0);
1407 1408

		if (!sblock_other->header_error &&
1409 1410
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
1411 1412
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
1413
				goto corrected_error;
1414 1415
			} else {
				ret = scrub_repair_block_from_good_copy(
1416 1417 1418
						sblock_bad, sblock_other);
				if (!ret)
					goto corrected_error;
1419
			}
1420 1421
		}
	}
A
Arne Jansen 已提交
1422

1423 1424
	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
		goto did_not_correct_error;
1425 1426 1427

	/*
	 * In case of I/O errors in the area that is supposed to be
1428 1429
	 * repaired, continue by picking good copies of those sectors.
	 * Select the good sectors from mirrors to rewrite bad sectors from
1430 1431 1432 1433 1434
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
1435
	 * all possible combinations of sectors from the different mirrors
1436
	 * until the checksum verification succeeds. For example, when
1437
	 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1438
	 * of mirror #2 is readable but the final checksum test fails,
1439
	 * then the 2nd sector of mirror #3 could be tried, whether now
1440
	 * the final checksum succeeds. But this would be a rare
1441 1442 1443 1444
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
1445
	 * disks) instead of on sectorsize. Then maybe 512 byte of one
1446
	 * mirror could be repaired by taking 512 byte of a different
1447
	 * mirror, even if other 512 byte sectors in the same sectorsize
1448
	 * area are unreadable.
A
Arne Jansen 已提交
1449
	 */
1450
	success = 1;
1451 1452
	for (sector_num = 0; sector_num < sblock_bad->sector_count;
	     sector_num++) {
1453
		struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1454
		struct scrub_block *sblock_other = NULL;
1455

1456 1457
		/* Skip no-io-error sectors in scrub */
		if (!sector_bad->io_error && !sctx->is_dev_replace)
A
Arne Jansen 已提交
1458
			continue;
1459

1460
		if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1461 1462 1463 1464 1465 1466 1467 1468
			/*
			 * In case of dev replace, if raid56 rebuild process
			 * didn't work out correct data, then copy the content
			 * in sblock_bad to make sure target device is identical
			 * to source device, instead of writing garbage data in
			 * sblock_for_recheck array to target device.
			 */
			sblock_other = NULL;
1469 1470
		} else if (sector_bad->io_error) {
			/* Try to find no-io-error sector in mirrors */
1471 1472
			for (mirror_index = 0;
			     mirror_index < BTRFS_MAX_MIRRORS &&
1473
			     sblocks_for_recheck[mirror_index]->sector_count > 0;
1474
			     mirror_index++) {
1475
				if (!sblocks_for_recheck[mirror_index]->
1476
				    sectors[sector_num]->io_error) {
1477
					sblock_other = sblocks_for_recheck[mirror_index];
1478
					break;
1479 1480
				}
			}
1481 1482
			if (!sblock_other)
				success = 0;
I
Ilya Dryomov 已提交
1483
		}
A
Arne Jansen 已提交
1484

1485 1486
		if (sctx->is_dev_replace) {
			/*
1487 1488 1489 1490
			 * Did not find a mirror to fetch the sector from.
			 * scrub_write_sector_to_dev_replace() handles this
			 * case (sector->io_error), by filling the block with
			 * zeros before submitting the write request
1491 1492 1493 1494
			 */
			if (!sblock_other)
				sblock_other = sblock_bad;

1495 1496
			if (scrub_write_sector_to_dev_replace(sblock_other,
							      sector_num) != 0) {
1497
				atomic64_inc(
1498
					&fs_info->dev_replace.num_write_errors);
1499 1500 1501
				success = 0;
			}
		} else if (sblock_other) {
1502 1503 1504
			ret = scrub_repair_sector_from_good_copy(sblock_bad,
								 sblock_other,
								 sector_num, 0);
1505
			if (0 == ret)
1506
				sector_bad->io_error = 0;
1507 1508
			else
				success = 0;
1509
		}
A
Arne Jansen 已提交
1510 1511
	}

1512
	if (success && !sctx->is_dev_replace) {
1513 1514 1515 1516 1517 1518 1519 1520 1521 1522
		if (is_metadata || have_csum) {
			/*
			 * need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
1523
			scrub_recheck_block(fs_info, sblock_bad, 1);
1524
			if (!sblock_bad->header_error &&
1525 1526 1527 1528 1529 1530 1531
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
corrected_error:
1532 1533
			spin_lock(&sctx->stat_lock);
			sctx->stat.corrected_errors++;
1534
			sblock_to_check->data_corrected = 1;
1535
			spin_unlock(&sctx->stat_lock);
1536 1537
			btrfs_err_rl_in_rcu(fs_info,
				"fixed up error at logical %llu on dev %s",
1538
				logical, btrfs_dev_name(dev));
A
Arne Jansen 已提交
1539
		}
1540 1541
	} else {
did_not_correct_error:
1542 1543 1544
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
1545 1546
		btrfs_err_rl_in_rcu(fs_info,
			"unable to fixup (regular) error at logical %llu on dev %s",
1547
			logical, btrfs_dev_name(dev));
I
Ilya Dryomov 已提交
1548
	}
A
Arne Jansen 已提交
1549

1550
out:
1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569
	for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
		struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
		struct scrub_recover *recover;
		int sector_index;

		/* Not allocated, continue checking the next mirror */
		if (!sblock)
			continue;

		for (sector_index = 0; sector_index < sblock->sector_count;
		     sector_index++) {
			/*
			 * Here we just cleanup the recover, each sector will be
			 * properly cleaned up by later scrub_block_put()
			 */
			recover = sblock->sectors[sector_index]->recover;
			if (recover) {
				scrub_put_recover(fs_info, recover);
				sblock->sectors[sector_index]->recover = NULL;
1570
			}
1571
		}
1572
		scrub_block_put(sblock);
1573
	}
A
Arne Jansen 已提交
1574

1575
	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1576
	memalloc_nofs_restore(nofs_flag);
1577 1578
	if (ret < 0)
		return ret;
1579 1580
	return 0;
}
A
Arne Jansen 已提交
1581

1582
static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1583
{
1584
	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
Z
Zhao Lei 已提交
1585
		return 2;
1586
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
Z
Zhao Lei 已提交
1587 1588
		return 3;
	else
1589
		return (int)bioc->num_stripes;
1590 1591
}

Z
Zhao Lei 已提交
1592
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1593
						 u64 full_stripe_logical,
1594 1595 1596 1597 1598 1599
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int i;

1600
	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1601 1602 1603
		const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
					    nstripes - 1 : nstripes - 2;

1604
		/* RAID5/6 */
1605 1606 1607
		for (i = 0; i < nr_data_stripes; i++) {
			const u64 data_stripe_start = full_stripe_logical +
						(i * BTRFS_STRIPE_LEN);
1608

1609 1610
			if (logical >= data_stripe_start &&
			    logical < data_stripe_start + BTRFS_STRIPE_LEN)
1611 1612 1613 1614
				break;
		}

		*stripe_index = i;
1615 1616
		*stripe_offset = (logical - full_stripe_logical) &
				 BTRFS_STRIPE_LEN_MASK;
1617 1618 1619 1620 1621 1622 1623
	} else {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
	}
}

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck[])
{
	struct scrub_ctx *sctx = original_sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 logical = original_sblock->logical;
	u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
	u64 generation = original_sblock->sectors[0]->generation;
	u64 flags = original_sblock->sectors[0]->flags;
	u64 have_csum = original_sblock->sectors[0]->have_csum;
	struct scrub_recover *recover;
	struct btrfs_io_context *bioc;
	u64 sublen;
	u64 mapped_length;
	u64 stripe_offset;
	int stripe_index;
	int sector_index = 0;
	int mirror_index;
	int nmirrors;
	int ret;

	while (length > 0) {
		sublen = min_t(u64, length, fs_info->sectorsize);
		mapped_length = sublen;
		bioc = NULL;

		/*
		 * With a length of sectorsize, each returned stripe represents
		 * one mirror
		 */
		btrfs_bio_counter_inc_blocked(fs_info);
		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				       logical, &mapped_length, &bioc);
		if (ret || !bioc || mapped_length < sublen) {
			btrfs_put_bioc(bioc);
			btrfs_bio_counter_dec(fs_info);
			return -EIO;
		}

		recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
		if (!recover) {
			btrfs_put_bioc(bioc);
			btrfs_bio_counter_dec(fs_info);
			return -ENOMEM;
		}

		refcount_set(&recover->refs, 1);
		recover->bioc = bioc;
		recover->map_length = mapped_length;

		ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);

		nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);

		for (mirror_index = 0; mirror_index < nmirrors;
		     mirror_index++) {
			struct scrub_block *sblock;
			struct scrub_sector *sector;

			sblock = sblocks_for_recheck[mirror_index];
			sblock->sctx = sctx;

			sector = alloc_scrub_sector(sblock, logical);
			if (!sector) {
				spin_lock(&sctx->stat_lock);
				sctx->stat.malloc_errors++;
				spin_unlock(&sctx->stat_lock);
				scrub_put_recover(fs_info, recover);
				return -ENOMEM;
			}
			sector->flags = flags;
			sector->generation = generation;
			sector->have_csum = have_csum;
			if (have_csum)
				memcpy(sector->csum,
				       original_sblock->sectors[0]->csum,
				       sctx->fs_info->csum_size);

			scrub_stripe_index_and_offset(logical,
						      bioc->map_type,
						      bioc->full_stripe_logical,
						      bioc->num_stripes -
						      bioc->replace_nr_stripes,
						      mirror_index,
						      &stripe_index,
						      &stripe_offset);
			/*
			 * We're at the first sector, also populate @sblock
			 * physical and dev.
			 */
			if (sector_index == 0) {
				sblock->physical =
					bioc->stripes[stripe_index].physical +
					stripe_offset;
				sblock->dev = bioc->stripes[stripe_index].dev;
				sblock->physical_for_dev_replace =
					original_sblock->physical_for_dev_replace;
			}

			BUG_ON(sector_index >= original_sblock->sector_count);
			scrub_get_recover(recover);
			sector->recover = recover;
		}
		scrub_put_recover(fs_info, recover);
		length -= sublen;
		logical += sublen;
		sector_index++;
	}

	return 0;
}

static void scrub_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
					struct bio *bio,
					struct scrub_sector *sector)
{
	DECLARE_COMPLETION_ONSTACK(done);

	bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
				 SECTOR_SHIFT;
	bio->bi_private = &done;
	bio->bi_end_io = scrub_bio_wait_endio;
	raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);

	wait_for_completion_io(&done);
	return blk_status_to_errno(bio->bi_status);
}

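/*
 * Re-read a whole scrub_block through the RAID56 recovery path and verify
 * its checksum; on failure every sector is marked with an I/O error.
 */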
static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
					  struct scrub_block *sblock)
{
	struct scrub_sector *first_sector = sblock->sectors[0];
	struct bio *bio;
	int i;

	/* All sectors in sblock belong to the same stripe on the same device. */
	ASSERT(sblock->dev);
	if (!sblock->dev->bdev)
		goto out;

	bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);

	for (i = 0; i < sblock->sector_count; i++) {
		struct scrub_sector *sector = sblock->sectors[i];

		bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
	}

	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
		bio_put(bio);
		goto out;
	}

	bio_put(bio);

	scrub_recheck_block_checksum(sblock);

	return;
out:
	for (i = 0; i < sblock->sector_count; i++)
		sblock->sectors[i]->io_error = 1;

	sblock->no_io_error_seen = 0;
}

/*
 * This function will check the on disk data for checksum errors, header errors
 * and read I/O errors. If any I/O errors happen, the exact sectors which are
 * errored are marked as being bad. The goal is to enable scrub to take those
 * sectors that are not errored from all the mirrors so that the sectors that
 * are errored in the just handled mirror can be repaired.
 */
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror)
{
	int i;

	sblock->no_io_error_seen = 1;

	/* short cut for raid56 */
	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
		return scrub_recheck_block_on_raid56(fs_info, sblock);

	for (i = 0; i < sblock->sector_count; i++) {
		struct scrub_sector *sector = sblock->sectors[i];
		struct bio bio;
		struct bio_vec bvec;

		if (sblock->dev->bdev == NULL) {
			sector->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}

		bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
		bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
		bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
					SECTOR_SHIFT;

		btrfsic_check_bio(&bio);
		if (submit_bio_wait(&bio)) {
			sector->io_error = 1;
			sblock->no_io_error_seen = 0;
		}

		bio_uninit(&bio);
	}

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(sblock);
}

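/* Return 1 if @fsid matches the fsid of the device the sector belongs to. */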
static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
	struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
	int ret;

	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	return !ret;
}

static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
	sblock->header_error = 0;
	sblock->checksum_error = 0;
	sblock->generation_error = 0;

	if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
		scrub_checksum_data(sblock);
	else
		scrub_checksum_tree_block(sblock);
}

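/*
 * Rewrite every sector of @sblock_bad with the matching sector from
 * @sblock_good; returns the last error seen, if any.
 */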
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good)
{
	int i;
	int ret = 0;

	for (i = 0; i < sblock_bad->sector_count; i++) {
		int ret_sub;

		ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
							     sblock_good, i, 1);
		if (ret_sub)
			ret = ret_sub;
	}

	return ret;
}

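/*
 * Write one sector of the good copy over the bad copy on disk.  With
 * @force_write the sector is rewritten even if no error was recorded for it.
 */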
static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
					      struct scrub_block *sblock_good,
					      int sector_num, int force_write)
{
	struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
	struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
	const u32 sectorsize = fs_info->sectorsize;

	if (force_write || sblock_bad->header_error ||
	    sblock_bad->checksum_error || sector_bad->io_error) {
		struct bio bio;
		struct bio_vec bvec;
		int ret;

		if (!sblock_bad->dev->bdev) {
			btrfs_warn_rl(fs_info,
				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
			return -EIO;
		}

		bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
		bio.bi_iter.bi_sector = (sblock_bad->physical +
					 sector_bad->offset) >> SECTOR_SHIFT;
		ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);

		btrfsic_check_bio(&bio);
		ret = submit_bio_wait(&bio);
		bio_uninit(&bio);

		if (ret) {
			btrfs_dev_stat_inc_and_print(sblock_bad->dev,
				BTRFS_DEV_STAT_WRITE_ERRS);
			atomic64_inc(&fs_info->dev_replace.num_write_errors);
			return -EIO;
		}
	}

	return 0;
}

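/* Queue all sectors of @sblock for writing to the dev-replace target. */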
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
	int i;

	/*
	 * This block is used for the check of the parity on the source device,
	 * so the data needn't be written into the destination device.
	 */
	if (sblock->sparity)
		return;

	for (i = 0; i < sblock->sector_count; i++) {
		int ret;

		ret = scrub_write_sector_to_dev_replace(sblock, i);
		if (ret)
			atomic64_inc(&fs_info->dev_replace.num_write_errors);
	}
}

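/*
 * Queue one sector for the dev-replace write; sectors that had read errors
 * are zeroed out first so no stale data reaches the target device.
 */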
static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
{
	const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
	struct scrub_sector *sector = sblock->sectors[sector_num];

	if (sector->io_error)
		memset(scrub_sector_get_kaddr(sector), 0, sectorsize);

	return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
}

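/*
 * Zoned devices must be written sequentially: zero out the gap between the
 * current write pointer and @physical before the next write is issued.
 */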
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	int ret = 0;
	u64 length;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer < physical) {
		length = physical - sctx->write_pointer;

		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
						sctx->write_pointer, length);
		if (!ret)
			sctx->write_pointer = physical;
	}
	return ret;
}

static void scrub_block_get(struct scrub_block *sblock)
{
	refcount_inc(&sblock->refs);
}

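/*
 * Add one sector to the in-flight dev-replace write bio, submitting the bio
 * first if it is full or the sector is not physically/logically contiguous.
 */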
static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
				      struct scrub_sector *sector)
{
	struct scrub_block *sblock = sector->sblock;
	struct scrub_bio *sbio;
	int ret;
	const u32 sectorsize = sctx->fs_info->sectorsize;

	mutex_lock(&sctx->wr_lock);
again:
	if (!sctx->wr_curr_bio) {
		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
					      GFP_KERNEL);
		if (!sctx->wr_curr_bio) {
			mutex_unlock(&sctx->wr_lock);
			return -ENOMEM;
		}
		sctx->wr_curr_bio->sctx = sctx;
		sctx->wr_curr_bio->sector_count = 0;
	}
	sbio = sctx->wr_curr_bio;
	if (sbio->sector_count == 0) {
		ret = fill_writer_pointer_gap(sctx, sector->offset +
					      sblock->physical_for_dev_replace);
		if (ret) {
			mutex_unlock(&sctx->wr_lock);
			return ret;
		}

		sbio->physical = sblock->physical_for_dev_replace + sector->offset;
		sbio->logical = sblock->logical + sector->offset;
		sbio->dev = sctx->wr_tgtdev;
		if (!sbio->bio) {
			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
					      REQ_OP_WRITE, GFP_NOFS);
		}
		sbio->bio->bi_private = sbio;
		sbio->bio->bi_end_io = scrub_wr_bio_end_io;
		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
		sbio->status = 0;
	} else if (sbio->physical + sbio->sector_count * sectorsize !=
		   sblock->physical_for_dev_replace + sector->offset ||
		   sbio->logical + sbio->sector_count * sectorsize !=
		   sblock->logical + sector->offset) {
		scrub_wr_submit(sctx);
		goto again;
	}

	ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
	if (ret != sectorsize) {
		if (sbio->sector_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
			mutex_unlock(&sctx->wr_lock);
			return -EIO;
		}
		scrub_wr_submit(sctx);
		goto again;
	}

	sbio->sectors[sbio->sector_count] = sector;
	scrub_sector_get(sector);
	/*
	 * Since the sector no longer holds a page of its own but uses
	 * sblock::pages, we have to ensure the sblock is not freed before
	 * our write bio has finished.
	 */
	scrub_block_get(sector->sblock);

	sbio->sector_count++;
	if (sbio->sector_count == sctx->sectors_per_bio)
		scrub_wr_submit(sctx);
	mutex_unlock(&sctx->wr_lock);

	return 0;
}

static void scrub_wr_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (!sctx->wr_curr_bio)
		return;

	sbio = sctx->wr_curr_bio;
	sctx->wr_curr_bio = NULL;
	scrub_pending_bio_inc(sctx);
	/*
	 * Process all writes in a single worker thread. Then the block layer
	 * orders the requests before sending them to the driver, which
	 * doubled the write performance on spinning disks when measured
	 * with Linux 3.5.
	 */
	btrfsic_check_bio(sbio->bio);
	submit_bio(sbio->bio);

	if (btrfs_is_zoned(sctx->fs_info))
		sctx->write_pointer = sbio->physical + sbio->sector_count *
			sctx->fs_info->sectorsize;
}

static void scrub_wr_bio_end_io(struct bio *bio)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;

	sbio->status = bio->bi_status;
	sbio->bio = bio;

	INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
	queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}

static void scrub_wr_bio_end_io_worker(struct work_struct *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_ctx *sctx = sbio->sctx;
	int i;

	ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
	if (sbio->status) {
		struct btrfs_dev_replace *dev_replace =
			&sbio->sctx->fs_info->dev_replace;

		for (i = 0; i < sbio->sector_count; i++) {
			struct scrub_sector *sector = sbio->sectors[i];

			sector->io_error = 1;
			atomic64_inc(&dev_replace->num_write_errors);
		}
	}

	/*
	 * In scrub_add_sector_to_wr_bio() we grabbed an extra ref on the
	 * sblock; now that the write has completed, put it.
	 */
	for (i = 0; i < sbio->sector_count; i++) {
		scrub_block_put(sbio->sectors[i]->sblock);
		scrub_sector_put(sbio->sectors[i]);
	}

	bio_put(sbio->bio);
	kfree(sbio);
	scrub_pending_bio_dec(sctx);
}

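/*
 * Verify a fully read scrub_block based on its extent flags and start the
 * repair machinery if the verification fails.
 */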
static int scrub_checksum(struct scrub_block *sblock)
{
	u64 flags;
	int ret;

	/*
	 * No need to initialize these stats currently, because this function
	 * only uses the return value instead of these stats values.
	 *
	 * Todo:
	 * always use stats
	 */
	sblock->header_error = 0;
	sblock->generation_error = 0;
	sblock->checksum_error = 0;

	WARN_ON(sblock->sector_count < 1);
	flags = sblock->sectors[0]->flags;
	ret = 0;
	if (flags & BTRFS_EXTENT_FLAG_DATA)
		ret = scrub_checksum_data(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		ret = scrub_checksum_tree_block(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
		ret = scrub_checksum_super(sblock);
	else
		WARN_ON(1);
	if (ret)
		scrub_handle_errored_block(sblock);

	return ret;
}

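/* Verify the csum of a single data sector; returns non-zero on mismatch. */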
static int scrub_checksum_data(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 csum[BTRFS_CSUM_SIZE];
	struct scrub_sector *sector;
	char *kaddr;

	BUG_ON(sblock->sector_count < 1);
	sector = sblock->sectors[0];
	if (!sector->have_csum)
		return 0;

	kaddr = scrub_sector_get_kaddr(sector);

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);

	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);

	if (memcmp(csum, sector->csum, fs_info->csum_size))
		sblock->checksum_error = 1;
	return sblock->checksum_error;
}

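/*
 * Helpers to translate a sector number inside a scrub_stripe into the backing
 * page and the offset within that page.
 */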
static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;

	return stripe->pages[page_index];
}

static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
						 int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

	return offset_in_page(sector_nr << fs_info->sectorsize_bits);
}

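/*
 * Verify one tree block inside a scrub_stripe: bytenr, fsid, chunk tree uuid,
 * checksum and generation are all checked against the on-disk header.
 */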
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
	const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
	const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	struct btrfs_header *header;

	/*
	 * Here we don't have a good way to attach the pages (and subpages)
	 * to a dummy extent buffer, thus we have to directly grab the members
	 * from pages.
	 */
	header = (struct btrfs_header *)(page_address(first_page) + first_off);
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, page_address(first_page) + first_off +
			    BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		struct page *page = scrub_stripe_get_page(stripe, i);
		unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);

		crypto_shash_update(shash, page_address(page) + page_off,
				    fs_info->sectorsize);
	}

	crypto_shash_final(shash, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
			      logical, stripe->mirror_num,
			      CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
		bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
	bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
}

static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	struct page *page = scrub_stripe_get_page(stripe, sector_nr);
	unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
		return;

	/* IO error, no need to check. */
	if (test_bit(sector_nr, &stripe->io_error_bitmap))
		return;

	/* Metadata, verify the full tree block. */
	if (sector->is_metadata) {
		/*
		 * Check if the tree block crosses the stripe boundary.  If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it).  For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		clear_bit(sector_nr, &stripe->error_bitmap);
		return;
	}

	ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
	if (ret < 0) {
		set_bit(sector_nr, &stripe->csum_error_bitmap);
		set_bit(sector_nr, &stripe->error_bitmap);
	} else {
		clear_bit(sector_nr, &stripe->csum_error_bitmap);
		clear_bit(sector_nr, &stripe->error_bitmap);
	}
}

/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		if (stripe->sectors[sector_nr].is_metadata)
			sector_nr += sectors_per_tree - 1;
	}
}

static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
		    scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}

/*
 * Repair read is different from the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		bitmap_set(&stripe->io_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		bitmap_set(&stripe->error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
	} else {
		bitmap_clear(&stripe->io_error_bitmap, sector_nr,
			     bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = stripe->error_bitmap;
	int i;

	ASSERT(stripe->mirror_num >= 1);
	ASSERT(atomic_read(&stripe->pending_io) == 0);

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		struct page *page;
		int pgoff;
		int ret;

		page = scrub_stripe_get_page(stripe, i);
		pgoff = scrub_stripe_get_page_offset(stripe, i);

		/* The current sector cannot be merged, submit the bio. */
		if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		if (!bbio) {
			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
				fs_info, scrub_repair_read_endio, stripe);
			bbio->bio.bi_iter.bi_sector = (stripe->logical +
				(i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
		}

		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
		ASSERT(ret == fs_info->sectorsize);
	}
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}

static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	/*
	 * Init the needed info for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
	 * and thus needs no dev/physical, error reporting still needs dev and physical.
	 */
	if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1);
		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				       stripe->logical, &mapped_len, &bioc);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		if (stripe->sectors[sector_nr].is_metadata) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
		    !test_bit(sector_nr, &stripe->error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing needs to be done. */
		if (!test_bit(sector_nr, &stripe->init_error_bitmap))
			continue;

		/*
		 * Report errors for the corrupted sectors.  If a sector was
		 * repaired, only output the repaired message.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl_in_rcu(fs_info,
			"fixed up error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
			} else {
				btrfs_err_rl_in_rcu(fs_info,
			"fixed up error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining are all for unrepaired. */
		if (dev) {
			btrfs_err_rl_in_rcu(fs_info,
	"unable to fixup (regular) error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
		} else {
			btrfs_err_rl_in_rcu(fs_info,
	"unable to fixup (regular) error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
		}

		if (test_bit(sector_nr, &stripe->io_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
						     stripe->logical, physical);
		if (test_bit(sector_nr, &stripe->csum_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
						     stripe->logical, physical);
		if (test_bit(sector_nr, &stripe->meta_error_bitmap))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
						     stripe->logical, physical);
	}

	spin_lock(&sctx->stat_lock);
	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
	sctx->stat.no_csum += nr_nodatacsum_sectors;
	sctx->stat.read_errors +=
		bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
	sctx->stat.csum_errors +=
		bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
	sctx->stat.verify_errors +=
		bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
	sctx->stat.uncorrectable_errors +=
		bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
	sctx->stat.corrected_errors += nr_repaired_sectors;
	spin_unlock(&sctx->stat_lock);
}

/*
 * The main entrance for all read related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read as large blocksize as
 *   possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 *
 * Writeback does not happen here, it needs extra synchronization.
 */
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
					  stripe->bg->length);
	int mirror;
	int i;

	ASSERT(stripe->mirror_num > 0);

	wait_scrub_stripe_io(stripe);
	scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
	/* Save the initial failed bitmap for later repair and report usage. */
	stripe->init_error_bitmap = stripe->error_bitmap;

	if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
		goto out;

	/*
	 * Try all remaining mirrors.
	 *
	 * Here we still try to read as large block as possible, as this is
	 * faster and we have extra safety nets to rely on.
	 */
	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
	     mirror != stripe->mirror_num;
	     mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = stripe->error_bitmap;

		scrub_stripe_submit_repair_read(stripe, mirror,
						BTRFS_STRIPE_LEN, false);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
			goto out;
	}

	/*
	 * Last safety net, try re-checking all mirrors, including the failed
	 * one, sector-by-sector.
	 *
	 * If one sector fails the drive's internal csum, the whole read
	 * containing the offending sector would be marked as an error.
	 * Thus here we read sector-by-sector.
	 *
	 * This can be slow, thus we only try it as the last resort.
	 */

	for (i = 0, mirror = stripe->mirror_num;
	     i < num_copies;
	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = stripe->error_bitmap;

		scrub_stripe_submit_repair_read(stripe, mirror,
						fs_info->sectorsize, true);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
			goto out;
	}
out:
	scrub_stripe_report_errors(stripe->sctx, stripe);
	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
	wake_up(&stripe->repair_wait);
}

void scrub_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;

	if (bbio->bio.bi_status) {
		bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
		bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
	} else {
		bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}

static void scrub_write_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		unsigned long flags;

		spin_lock_irqsave(&stripe->write_error_lock, flags);
		bitmap_set(&stripe->write_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
	}
	bio_put(&bbio->bio);

	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

/*
 * Submit the write bio(s) for the sectors specified by @write_bitmap.
 *
 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
			 unsigned long write_bitmap, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const bool zoned = btrfs_is_zoned(fs_info);
	int sector_nr;

	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
		struct page *page = scrub_stripe_get_page(stripe, sector_nr);
		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
		int ret;

		/* We should only writeback sectors covered by an extent. */
		ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));

		/* Cannot merge with previous sector, submit the current one. */
		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
			fill_writer_pointer_gap(sctx, stripe->physical +
					(sector_nr << fs_info->sectorsize_bits));
			atomic_inc(&stripe->pending_io);
			btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
			/* For zoned writeback, queue depth must be 1. */
			if (zoned)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}
		if (!bbio) {
			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
					       fs_info, scrub_write_endio, stripe);
			bbio->bio.bi_iter.bi_sector = (stripe->logical +
				(sector_nr << fs_info->sectorsize_bits)) >>
				SECTOR_SHIFT;
		}
		ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
		ASSERT(ret == fs_info->sectorsize);
	}
	if (bbio) {
		fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector <<
					SECTOR_SHIFT);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
		if (zoned)
			wait_scrub_stripe_io(stripe);
	}
}

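/*
 * Verify a metadata block: check the header fields and the checksum computed
 * over all sectors of the tree block.
 */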
static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_header *h;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	/*
	 * This is done in sectorsize steps even for metadata as there's a
	 * constraint for nodesize to be aligned to sectorsize. This will need
	 * to change so we don't misuse data and metadata units like that.
	 */
	const u32 sectorsize = sctx->fs_info->sectorsize;
	const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
	int i;
	struct scrub_sector *sector;
	char *kaddr;

	BUG_ON(sblock->sector_count < 1);

	/* Each member in sectors is just one sector */
	ASSERT(sblock->sector_count == num_sectors);

	sector = sblock->sectors[0];
	kaddr = scrub_sector_get_kaddr(sector);
	h = (struct btrfs_header *)kaddr;
	memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	if (sblock->logical != btrfs_stack_header_bytenr(h)) {
		sblock->header_error = 1;
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      sblock->logical, sblock->mirror_num,
			      btrfs_stack_header_bytenr(h),
			      sblock->logical);
		goto out;
	}

	if (!scrub_check_fsid(h->fsid, sector)) {
		sblock->header_error = 1;
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      sblock->logical, sblock->mirror_num,
			      h->fsid, sblock->dev->fs_devices->fsid);
		goto out;
	}

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) {
		sblock->header_error = 1;
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      sblock->logical, sblock->mirror_num,
			      h->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		goto out;
	}

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
			    sectorsize - BTRFS_CSUM_SIZE);

	for (i = 1; i < num_sectors; i++) {
		kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
		crypto_shash_update(shash, kaddr, sectorsize);
	}

	crypto_shash_final(shash, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) {
		sblock->checksum_error = 1;
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
			      sblock->logical, sblock->mirror_num,
			      CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		goto out;
	}

	if (sector->generation != btrfs_stack_header_generation(h)) {
		sblock->header_error = 1;
		sblock->generation_error = 1;
		btrfs_warn_rl(fs_info,
		"tree block %llu mirror %u has bad generation, has %llu want %llu",
			      sblock->logical, sblock->mirror_num,
			      btrfs_stack_header_generation(h),
			      sector->generation);
	}

out:
	return sblock->header_error || sblock->checksum_error;
}

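/*
 * Verify a super block copy: bytenr, generation, fsid and checksum.  Returns
 * the number of failed checks.
 */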
static int scrub_checksum_super(struct scrub_block *sblock)
{
	struct btrfs_super_block *s;
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	struct scrub_sector *sector;
	char *kaddr;
	int fail_gen = 0;
	int fail_cor = 0;

	BUG_ON(sblock->sector_count < 1);
	sector = sblock->sectors[0];
	kaddr = scrub_sector_get_kaddr(sector);
	s = (struct btrfs_super_block *)kaddr;

	if (sblock->logical != btrfs_super_bytenr(s))
		++fail_cor;

	if (sector->generation != btrfs_super_generation(s))
		++fail_gen;

	if (!scrub_check_fsid(s->fsid, sector))
		++fail_cor;

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
			BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);

	if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
		++fail_cor;

	return fail_cor + fail_gen;
}

static void scrub_block_put(struct scrub_block *sblock)
{
	if (refcount_dec_and_test(&sblock->refs)) {
		int i;

		if (sblock->sparity)
			scrub_parity_put(sblock->sparity);

		for (i = 0; i < sblock->sector_count; i++)
			scrub_sector_put(sblock->sectors[i]);
		for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
			if (sblock->pages[i]) {
				detach_scrub_page_private(sblock->pages[i]);
				__free_page(sblock->pages[i]);
			}
		}
		kfree(sblock);
	}
}

static void scrub_sector_get(struct scrub_sector *sector)
{
	atomic_inc(&sector->refs);
}

static void scrub_sector_put(struct scrub_sector *sector)
{
	if (atomic_dec_and_test(&sector->refs))
		kfree(sector);
}

/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
 */
static void scrub_throttle(struct scrub_ctx *sctx)
{
	const int time_slice = 1000;
	struct scrub_bio *sbio;
	struct btrfs_device *device;
	s64 delta;
	ktime_t now;
	u32 div;
	u64 bwlimit;

	sbio = sctx->bios[sctx->curr];
	device = sbio->dev;
	bwlimit = READ_ONCE(device->scrub_speed_max);
	if (bwlimit == 0)
		return;

	/*
	 * Slice is divided into intervals when the IO is submitted, adjust by
	 * bwlimit and maximum of 64 intervals.
	 */
	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
	div = min_t(u32, 64, div);

	/* Start new epoch, set deadline */
	now = ktime_get();
	if (sctx->throttle_deadline == 0) {
		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
		sctx->throttle_sent = 0;
	}

	/* Still in the time to send? */
	if (ktime_before(now, sctx->throttle_deadline)) {
		/* If current bio is within the limit, send it */
		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
		if (sctx->throttle_sent <= div_u64(bwlimit, div))
			return;

		/* We're over the limit, sleep until the rest of the slice */
		delta = ktime_ms_delta(sctx->throttle_deadline, now);
	} else {
		/* New request after deadline, start new epoch */
		delta = 0;
	}

	if (delta) {
		long timeout;

		timeout = div_u64(delta * HZ, 1000);
		schedule_timeout_interruptible(timeout);
	}

	/* Next call will start the deadline period */
	sctx->throttle_deadline = 0;
}

static void scrub_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (sctx->curr == -1)
		return;

	scrub_throttle(sctx);

	sbio = sctx->bios[sctx->curr];
	sctx->curr = -1;
	scrub_pending_bio_inc(sctx);
	btrfsic_check_bio(sbio->bio);
	submit_bio(sbio->bio);
}

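/*
 * Add one sector to the current read bio, submitting the bio first when it is
 * full or the sector is not contiguous with what is already queued.
 */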
static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
				      struct scrub_sector *sector)
{
	struct scrub_block *sblock = sector->sblock;
	struct scrub_bio *sbio;
	const u32 sectorsize = sctx->fs_info->sectorsize;
	int ret;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sctx->curr == -1) {
		spin_lock(&sctx->list_lock);
		sctx->curr = sctx->first_free;
		if (sctx->curr != -1) {
			sctx->first_free = sctx->bios[sctx->curr]->next_free;
			sctx->bios[sctx->curr]->next_free = -1;
			sctx->bios[sctx->curr]->sector_count = 0;
			spin_unlock(&sctx->list_lock);
		} else {
			spin_unlock(&sctx->list_lock);
			wait_event(sctx->list_wait, sctx->first_free != -1);
		}
	}
	sbio = sctx->bios[sctx->curr];
	if (sbio->sector_count == 0) {
		sbio->physical = sblock->physical + sector->offset;
		sbio->logical = sblock->logical + sector->offset;
		sbio->dev = sblock->dev;
		if (!sbio->bio) {
			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
					      REQ_OP_READ, GFP_NOFS);
		}
		sbio->bio->bi_private = sbio;
		sbio->bio->bi_end_io = scrub_bio_end_io;
		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
		sbio->status = 0;
	} else if (sbio->physical + sbio->sector_count * sectorsize !=
		   sblock->physical + sector->offset ||
		   sbio->logical + sbio->sector_count * sectorsize !=
		   sblock->logical + sector->offset ||
		   sbio->dev != sblock->dev) {
		scrub_submit(sctx);
		goto again;
	}

	sbio->sectors[sbio->sector_count] = sector;
	ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
	if (ret != sectorsize) {
		if (sbio->sector_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
			return -EIO;
		}
		scrub_submit(sctx);
		goto again;
	}

	scrub_block_get(sblock); /* one for the page added to the bio */
	atomic_inc(&sblock->outstanding_sectors);
	sbio->sector_count++;
	if (sbio->sector_count == sctx->sectors_per_bio)
		scrub_submit(sctx);

	return 0;
}

static void scrub_missing_raid56_end_io(struct bio *bio)
{
	struct scrub_block *sblock = bio->bi_private;
	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;

	btrfs_bio_counter_dec(fs_info);
	if (bio->bi_status)
		sblock->no_io_error_seen = 0;

	bio_put(bio);

	queue_work(fs_info->scrub_workers, &sblock->work);
}

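/*
 * Finish a block that was rebuilt via the RAID56 recovery path: verify it
 * and, if it checks out, write it to the dev-replace target.
 */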
static void scrub_missing_raid56_worker(struct work_struct *work)
{
	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 logical;
	struct btrfs_device *dev;

	logical = sblock->logical;
	dev = sblock->dev;

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(sblock);

	if (!sblock->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_err_rl_in_rcu(fs_info,
			"IO error rebuilding logical %llu for dev %s",
			logical, btrfs_dev_name(dev));
	} else if (sblock->header_error || sblock->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_err_rl_in_rcu(fs_info,
			"failed to rebuild valid logical %llu for dev %s",
			logical, btrfs_dev_name(dev));
	} else {
		scrub_write_block_to_dev_replace(sblock);
	}

	if (sctx->is_dev_replace && sctx->flush_all_writes) {
		mutex_lock(&sctx->wr_lock);
		scrub_wr_submit(sctx);
		mutex_unlock(&sctx->wr_lock);
	}

	scrub_block_put(sblock);
	scrub_pending_bio_dec(sctx);
}

static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 length = sblock->sector_count << fs_info->sectorsize_bits;
	u64 logical = sblock->logical;
	struct btrfs_io_context *bioc = NULL;
	struct bio *bio;
	struct btrfs_raid_bio *rbio;
	int ret;
	int i;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			       &length, &bioc);
	if (ret || !bioc)
		goto bioc_out;

	if (WARN_ON(!sctx->is_dev_replace ||
		    !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * We shouldn't be scrubbing a missing device. Even for dev
		 * replace, we should only get here for RAID 5/6. We either
		 * managed to mount something with no mirrors remaining or
		 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
		 */
		goto bioc_out;
	}

	bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
	bio->bi_iter.bi_sector = logical >> 9;
	bio->bi_private = sblock;
	bio->bi_end_io = scrub_missing_raid56_end_io;

	rbio = raid56_alloc_missing_rbio(bio, bioc);
	if (!rbio)
		goto rbio_out;

	for (i = 0; i < sblock->sector_count; i++) {
		struct scrub_sector *sector = sblock->sectors[i];

		raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
				       scrub_sector_get_page_offset(sector),
				       sector->offset + sector->sblock->logical);
	}

	INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
	scrub_block_get(sblock);
	scrub_pending_bio_inc(sctx);
	raid56_submit_missing_rbio(rbio);
	btrfs_put_bioc(bioc);
	return;

rbio_out:
	bio_put(bio);
bioc_out:
	btrfs_bio_counter_dec(fs_info);
	btrfs_put_bioc(bioc);
	spin_lock(&sctx->stat_lock);
	sctx->stat.malloc_errors++;
	spin_unlock(&sctx->stat_lock);
}

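/*
 * Split the range [logical, logical + len) into sectors, attach them to a new
 * scrub_block and queue them for reading (or for RAID56 rebuild when the
 * device is missing).
 */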
static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum,
		       u64 physical_for_dev_replace)
{
	struct scrub_block *sblock;
	const u32 sectorsize = sctx->fs_info->sectorsize;
	int index;

	sblock = alloc_scrub_block(sctx, dev, logical, physical,
				   physical_for_dev_replace, mirror_num);
	if (!sblock) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	for (index = 0; len > 0; index++) {
		struct scrub_sector *sector;
		/*
		 * Here we will allocate one page for one sector to scrub.
		 * This is fine if PAGE_SIZE == sectorsize, but will cost
		 * more memory for the PAGE_SIZE > sectorsize case.
		 */
		u32 l = min(sectorsize, len);

		sector = alloc_scrub_sector(sblock, logical);
		if (!sector) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.malloc_errors++;
			spin_unlock(&sctx->stat_lock);
			scrub_block_put(sblock);
			return -ENOMEM;
		}
		sector->flags = flags;
		sector->generation = gen;
		if (csum) {
			sector->have_csum = 1;
			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
		} else {
			sector->have_csum = 0;
		}
		len -= l;
		logical += l;
		physical += l;
		physical_for_dev_replace += l;
	}

	WARN_ON(sblock->sector_count == 0);
	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
		/*
		 * This case should only be hit for RAID 5/6 device replace. See
		 * the comment in scrub_missing_raid56_pages() for details.
		 */
		scrub_missing_raid56_pages(sblock);
	} else {
		for (index = 0; index < sblock->sector_count; index++) {
			struct scrub_sector *sector = sblock->sectors[index];
			int ret;

			ret = scrub_add_sector_to_rd_bio(sctx, sector);
			if (ret) {
				scrub_block_put(sblock);
				return ret;
			}
		}

		if (flags & BTRFS_EXTENT_FLAG_SUPER)
			scrub_submit(sctx);
	}

	/* last one frees, either here or in bio completion for last page */
	scrub_block_put(sblock);
	return 0;
}

static void scrub_bio_end_io(struct bio *bio)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;

	sbio->status = bio->bi_status;
	sbio->bio = bio;

	queue_work(fs_info->scrub_workers, &sbio->work);
}

static void scrub_bio_end_io_worker(struct work_struct *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_ctx *sctx = sbio->sctx;
	int i;

	ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
	if (sbio->status) {
		for (i = 0; i < sbio->sector_count; i++) {
			struct scrub_sector *sector = sbio->sectors[i];

			sector->io_error = 1;
			sector->sblock->no_io_error_seen = 0;
		}
	}

	/* Now complete the scrub_block items that have all pages completed */
	for (i = 0; i < sbio->sector_count; i++) {
		struct scrub_sector *sector = sbio->sectors[i];
		struct scrub_block *sblock = sector->sblock;

		if (atomic_dec_and_test(&sblock->outstanding_sectors))
			scrub_block_complete(sblock);
		scrub_block_put(sblock);
	}

	bio_put(sbio->bio);
	sbio->bio = NULL;
	spin_lock(&sctx->list_lock);
	sbio->next_free = sctx->first_free;
	sctx->first_free = sbio->index;
	spin_unlock(&sctx->list_lock);

	if (sctx->is_dev_replace && sctx->flush_all_writes) {
		mutex_lock(&sctx->wr_lock);
		scrub_wr_submit(sctx);
		mutex_unlock(&sctx->wr_lock);
	}

	scrub_pending_bio_dec(sctx);
}

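/*
 * Mark the sectors covered by [start, start + len) in the given parity
 * bitmap, wrapping around the end of the stripe when necessary.
 */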
static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
				       unsigned long *bitmap,
				       u64 start, u32 len)
{
	u64 offset;
	u32 nsectors;
	u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;

	if (len >= sparity->stripe_len) {
		bitmap_set(bitmap, 0, sparity->nsectors);
		return;
	}

	start -= sparity->logic_start;
	start = div64_u64_rem(start, sparity->stripe_len, &offset);
	offset = offset >> sectorsize_bits;
	nsectors = len >> sectorsize_bits;

	if (offset + nsectors <= sparity->nsectors) {
		bitmap_set(bitmap, offset, nsectors);
		return;
	}

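	/*
	 * The range wraps around the end of the stripe: the first
	 * bitmap_set() covers the tail, the second the wrapped head.
	 * Illustrative example (hypothetical numbers): with nsectors = 16,
	 * offset = 14 and 4 sectors to mark, bits 14-15 and then bits 0-1
	 * get set.
	 */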
	bitmap_set(bitmap, offset, sparity->nsectors - offset);
	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
}

static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
						   u64 start, u32 len)
{
	__scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
}

static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
						  u64 start, u32 len)
{
	__scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
}

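/*
 * Called once all sectors of a scrub_block have finished their read I/O:
 * either kick off the repair path (on I/O error), or verify the checksums
 * and, for dev-replace, write the good copy to the target device.
 */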
static void scrub_block_complete(struct scrub_block *sblock)
{
	int corrupted = 0;

	if (!sblock->no_io_error_seen) {
		corrupted = 1;
		scrub_handle_errored_block(sblock);
	} else {
		/*
		 * In the dev-replace case: if the block has a checksum
		 * error it is written out via the repair mechanism,
		 * otherwise it is written to the target device here.
		 */
		corrupted = scrub_checksum(sblock);
		if (!corrupted && sblock->sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock);
	}

	if (sblock->sparity && corrupted && !sblock->data_corrected) {
		u64 start = sblock->logical;
		u64 end = sblock->logical +
			  sblock->sectors[sblock->sector_count - 1]->offset +
			  sblock->sctx->fs_info->sectorsize;

		ASSERT(end - start <= U32_MAX);
		scrub_parity_mark_sectors_error(sblock->sparity,
						start, end - start);
	}
}

static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
{
	sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
	list_del(&sum->list);
	kfree(sum);
}

/*
 * Find the desired csum for range [logical, logical + sectorsize), and store
 * the csum into @csum.
 *
 * The search source is sctx->csum_list, which is a pre-populated list
 * storing bytenr ordered csum ranges.  We're responsible for cleaning up any
 * range that is before @logical.
 *
 * Return 0 if there is no csum for the range.
 * Return 1 if there is csum for the range and copied to @csum.
 */
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
	bool found = false;

	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum = NULL;
		unsigned long index;
		unsigned long num_sectors;

		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		/* The current csum range is beyond our range, no csum found */
		if (sum->bytenr > logical)
			break;

		/*
		 * The current sum is before our bytenr, since scrub is always
		 * done in bytenr order, the csum will never be used anymore,
		 * clean it up so that later calls won't bother with the range,
		 * and continue searching the next range.
		 */
		if (sum->bytenr + sum->len <= logical) {
			drop_csum_range(sctx, sum);
			continue;
		}

		/* Now the csum range covers our bytenr, copy the csum */
		found = true;
		index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
		num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;

		memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
		       sctx->fs_info->csum_size);

		/* Cleanup the range if we're at the end of the csum range */
		if (index == num_sectors - 1)
			drop_csum_range(sctx, sum);
		break;
	}
	if (!found)
		return 0;
	return 1;
}

static bool should_use_device(struct btrfs_fs_info *fs_info,
			      struct btrfs_device *dev,
			      bool follow_replace_read_mode)
{
	struct btrfs_device *replace_srcdev = fs_info->dev_replace.srcdev;
	struct btrfs_device *replace_tgtdev = fs_info->dev_replace.tgtdev;

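	/*
	 * Summary of the checks below (illustrative): a device without a
	 * bdev is never usable, the replace target is never used as a read
	 * source, and when the replace read mode asks to avoid the source
	 * device we skip it as long as the caller wants that mode honoured.
	 */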
	if (!dev->bdev)
		return false;

	/*
	 * We're doing scrub/replace.  For pure scrub there should be no
	 * tgtdev here at all.  For replace we're going to write data to the
	 * tgtdev, so its current content is garbage and we can not use it
	 * as a read source at all.
	 */
	if (dev == replace_tgtdev)
		return false;

	/* No need to follow replace read mode, any existing device is fine. */
	if (!follow_replace_read_mode)
		return true;

	/* Need to follow the mode. */
	if (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		return dev != replace_srcdev;
	return true;
}

static int scrub_find_good_copy(struct btrfs_fs_info *fs_info,
				u64 extent_logical, u32 extent_len,
				u64 *extent_physical,
				struct btrfs_device **extent_dev,
				int *extent_mirror_num)
{
	u64 mapped_length;
	struct btrfs_io_context *bioc = NULL;
	int ret;
	int i;

	mapped_length = extent_len;
	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
			      extent_logical, &mapped_length, &bioc, 0);
	if (ret || !bioc || mapped_length < extent_len) {
		btrfs_put_bioc(bioc);
		btrfs_err_rl(fs_info, "btrfs_map_block() failed for logical %llu: %d",
				extent_logical, ret);
		return -EIO;
	}

	/*
	 * First loop to exclude all missing devices and the source device if
	 * needed.  And we don't want to use target device as mirror either, as
	 * we're doing the replace, the target device range contains nothing.
	 */
	for (i = 0; i < bioc->num_stripes - bioc->replace_nr_stripes; i++) {
		struct btrfs_io_stripe *stripe = &bioc->stripes[i];

		if (!should_use_device(fs_info, stripe->dev, true))
			continue;
		goto found;
	}
	/*
	 * We didn't find any alternative mirrors, we have to break our replace
	 * read mode, or we can not read at all.
	 */
	for (i = 0; i < bioc->num_stripes - bioc->replace_nr_stripes; i++) {
		struct btrfs_io_stripe *stripe = &bioc->stripes[i];

		if (!should_use_device(fs_info, stripe->dev, false))
			continue;
		goto found;
	}

	btrfs_err_rl(fs_info, "failed to find any live mirror for logical %llu",
			extent_logical);
	return -EIO;

found:
	*extent_physical = bioc->stripes[i].physical;
	*extent_mirror_num = i + 1;
	*extent_dev = bioc->stripes[i].dev;
	btrfs_put_bioc(bioc);
	return 0;
}

static bool scrub_need_different_mirror(struct scrub_ctx *sctx,
					struct map_lookup *map,
					struct btrfs_device *dev)
{
	/*
	 * For RAID56, the extra mirrors are rebuilt from parity (P/Q), so we
	 * cannot utilize them directly.
	 */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		return false;

	if (!dev->bdev)
		return true;

	return sctx->fs_info->dev_replace.cont_reading_from_srcdev_mode ==
		BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID;
}

/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
			u64 logical, u32 len,
			u64 physical, struct btrfs_device *dev, u64 flags,
			u64 gen, int mirror_num)
{
	struct btrfs_device *src_dev = dev;
	u64 src_physical = physical;
	int src_mirror = mirror_num;
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 blocksize;

	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			blocksize = BTRFS_STRIPE_LEN;
		else
			blocksize = sctx->fs_info->sectorsize;
		spin_lock(&sctx->stat_lock);
		sctx->stat.data_extents_scrubbed++;
		sctx->stat.data_bytes_scrubbed += len;
		spin_unlock(&sctx->stat_lock);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			blocksize = BTRFS_STRIPE_LEN;
		else
			blocksize = sctx->fs_info->nodesize;
		spin_lock(&sctx->stat_lock);
		sctx->stat.tree_extents_scrubbed++;
		sctx->stat.tree_bytes_scrubbed += len;
		spin_unlock(&sctx->stat_lock);
	} else {
		blocksize = sctx->fs_info->sectorsize;
		WARN_ON(1);
	}

	/*
	 * For dev-replace case, we can have @dev being a missing device, or
	 * we want to avoid reading from the source device if possible.
	 */
	if (sctx->is_dev_replace && scrub_need_different_mirror(sctx, map, dev)) {
		ret = scrub_find_good_copy(sctx->fs_info, logical, len,
					   &src_physical, &src_dev, &src_mirror);
		if (ret < 0)
			return ret;
	}

	while (len) {
		u32 l = min(len, blocksize);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sctx, logical, csum);
			if (have_csum == 0)
				++sctx->stat.no_csum;
		}
		ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
				    flags, gen, src_mirror,
				    have_csum ? csum : NULL, physical);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
		src_physical += l;
	}
	return 0;
}

static int scrub_sectors_for_parity(struct scrub_parity *sparity,
				  u64 logical, u32 len,
				  u64 physical, struct btrfs_device *dev,
				  u64 flags, u64 gen, int mirror_num, u8 *csum)
{
	struct scrub_ctx *sctx = sparity->sctx;
	struct scrub_block *sblock;
	const u32 sectorsize = sctx->fs_info->sectorsize;
	int index;

	ASSERT(IS_ALIGNED(len, sectorsize));

	sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
	if (!sblock) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	sblock->sparity = sparity;
	scrub_parity_get(sparity);

	for (index = 0; len > 0; index++) {
		struct scrub_sector *sector;

		sector = alloc_scrub_sector(sblock, logical);
		if (!sector) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.malloc_errors++;
			spin_unlock(&sctx->stat_lock);
			scrub_block_put(sblock);
			return -ENOMEM;
		}
		sblock->sectors[index] = sector;
		/* For scrub parity */
		scrub_sector_get(sector);
		list_add_tail(&sector->list, &sparity->sectors_list);
		sector->flags = flags;
		sector->generation = gen;
		if (csum) {
			sector->have_csum = 1;
			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
		} else {
			sector->have_csum = 0;
		}

		/* Iterate over the stripe range in sectorsize steps */
		len -= sectorsize;
		logical += sectorsize;
		physical += sectorsize;
	}

	WARN_ON(sblock->sector_count == 0);
	for (index = 0; index < sblock->sector_count; index++) {
		struct scrub_sector *sector = sblock->sectors[index];
		int ret;

		ret = scrub_add_sector_to_rd_bio(sctx, sector);
		if (ret) {
			scrub_block_put(sblock);
			return ret;
		}
	}

	/* Last one frees, either here or in bio completion for last sector */
	scrub_block_put(sblock);
	return 0;
}

static int scrub_extent_for_parity(struct scrub_parity *sparity,
				   u64 logical, u32 len,
				   u64 physical, struct btrfs_device *dev,
				   u64 flags, u64 gen, int mirror_num)
{
	struct scrub_ctx *sctx = sparity->sctx;
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 blocksize;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
		scrub_parity_mark_sectors_error(sparity, logical, len);
		return 0;
	}

	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		blocksize = sparity->stripe_len;
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		blocksize = sparity->stripe_len;
	} else {
		blocksize = sctx->fs_info->sectorsize;
		WARN_ON(1);
	}

	while (len) {
		u32 l = min(len, blocksize);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sctx, logical, csum);
			if (have_csum == 0)
				goto skip;
		}
		ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
					     flags, gen, mirror_num,
					     have_csum ? csum : NULL);
		if (ret)
			return ret;
skip:
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}

/*
 * Given a physical address, this will calculate its
 * logical offset.  If this is a parity stripe, it will return
 * the left-most data stripe's logical offset.
 *
 * Return 0 if it is a data stripe, 1 means parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct map_lookup *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 last_offset;
	const int data_stripes = nr_data_stripes(map);

	last_offset = (physical - map->stripes[num].physical) * data_stripes;
	if (stripe_start)
		*stripe_start = last_offset;

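	/*
	 * Illustrative example (hypothetical layout): for RAID5 over three
	 * devices there are two data stripes per full stripe, so a device
	 * that is 10 stripes into its device extent maps to full stripe 10,
	 * i.e. a logical offset of 20 or 21 stripes depending on which data
	 * slot (if any) of that full stripe lands on this device.  The loop
	 * below walks those slots and accounts for the parity rotation.
	 */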
	*offset = last_offset;
	for (i = 0; i < data_stripes; i++) {
		u32 stripe_nr;
		u32 stripe_index;
		u32 rot;

		*offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);

		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

		/* Work out the disk rotation on this stripe-set */
		rot = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
		/* Calculate which stripe this data is located on */
		rot += i;
		stripe_index = rot % map->num_stripes;
		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	*offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
	return 1;
}

static void scrub_free_parity(struct scrub_parity *sparity)
{
	struct scrub_ctx *sctx = sparity->sctx;
	struct scrub_sector *curr, *next;
	int nbits;

	nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
	if (nbits) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors += nbits;
		sctx->stat.uncorrectable_errors += nbits;
		spin_unlock(&sctx->stat_lock);
	}

	list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
		list_del_init(&curr->list);
		scrub_sector_put(curr);
	}

	kfree(sparity);
}

static void scrub_parity_bio_endio_worker(struct work_struct *work)
{
	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
						    work);
	struct scrub_ctx *sctx = sparity->sctx;

	btrfs_bio_counter_dec(sctx->fs_info);
	scrub_free_parity(sparity);
	scrub_pending_bio_dec(sctx);
}

static void scrub_parity_bio_endio(struct bio *bio)
{
	struct scrub_parity *sparity = bio->bi_private;
	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;

	if (bio->bi_status)
		bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
			  &sparity->dbitmap, sparity->nsectors);

	bio_put(bio);

	INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
	queue_work(fs_info->scrub_parity_workers, &sparity->work);
}

static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
{
	struct scrub_ctx *sctx = sparity->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct bio *bio;
	struct btrfs_raid_bio *rbio;
	struct btrfs_io_context *bioc = NULL;
	u64 length;
	int ret;

	if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
			   &sparity->ebitmap, sparity->nsectors))
		goto out;

	length = sparity->logic_end - sparity->logic_start;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
			       &length, &bioc);
	if (ret || !bioc)
		goto bioc_out;

	bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
	bio->bi_private = sparity;
	bio->bi_end_io = scrub_parity_bio_endio;

	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
					      sparity->scrub_dev,
					      &sparity->dbitmap,
					      sparity->nsectors);
	btrfs_put_bioc(bioc);
	if (!rbio)
		goto rbio_out;

	scrub_pending_bio_inc(sctx);
	raid56_parity_submit_scrub_rbio(rbio);
	return;

rbio_out:
	bio_put(bio);
bioc_out:
	btrfs_bio_counter_dec(fs_info);
	bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
		  sparity->nsectors);
	spin_lock(&sctx->stat_lock);
	sctx->stat.malloc_errors++;
	spin_unlock(&sctx->stat_lock);
out:
	scrub_free_parity(sparity);
}

static void scrub_parity_get(struct scrub_parity *sparity)
{
	refcount_inc(&sparity->refs);
}

static void scrub_parity_put(struct scrub_parity *sparity)
{
	if (!refcount_dec_and_test(&sparity->refs))
		return;

	scrub_parity_check_and_repair(sparity);
}

/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @start_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
				     u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
	u64 len;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
	       key.type == BTRFS_METADATA_ITEM_KEY);
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		len = fs_info->nodesize;
	else
		len = key.offset;

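	/*
	 * Illustrative example (hypothetical numbers): with search_start at
	 * 16M and search_len of 64K, an extent item starting at 15M with a
	 * 2M length overlaps the range and returns 0, one ending at or
	 * before 16M returns -1, and one starting at or beyond 16M + 64K
	 * returns 1.
	 */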
	if (key.objectid + len <= search_start)
		return -1;
	if (key.objectid >= search_start + search_len)
		return 1;
	return 0;
}

/*
 * Locate one extent item which covers any byte in range
 * [@search_start, @search_start + @search_length)
 *
 * If the path is not initialized, we will initialize the search by doing
 * a btrfs_search_slot().
 * If the path is already initialized, we will use the path as the initial
 * slot, to avoid duplicated btrfs_search_slot() calls.
 *
 * NOTE: If an extent item starts before @search_start, we will still
 * return the extent item. This is for data extent crossing stripe boundary.
 *
 * Return 0 if we found such extent item, and @path will point to the extent item.
 * Return >0 if no such extent item can be found, and @path will be released.
 * Return <0 if hit fatal error, and @path will be released.
 */
static int find_first_extent_item(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	struct btrfs_key key;
	int ret;

	/* Continue using the existing path */
	if (path->nodes[0])
		goto search_forward;

	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.objectid = search_start;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	ASSERT(ret > 0);
	/*
	 * Here we intentionally pass 0 as @min_objectid, as there could be
	 * an extent item starting before @search_start.
	 */
	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret < 0)
		return ret;
	/*
	 * No matter whether we have found an extent item, the next loop will
	 * properly do every check on the key.
	 */
search_forward:
	while (true) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid >= search_start + search_len)
			break;
		if (key.type != BTRFS_METADATA_ITEM_KEY &&
		    key.type != BTRFS_EXTENT_ITEM_KEY)
			goto next;

		ret = compare_extent_item_range(path, search_start, search_len);
		if (ret == 0)
			return ret;
		if (ret > 0)
			break;
next:
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(extent_root, path);
			if (ret) {
				/* Either no more item or fatal error */
				btrfs_release_path(path);
				return ret;
			}
		}
	}
	btrfs_release_path(path);
	return 1;
}

static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
	       key.type == BTRFS_EXTENT_ITEM_KEY);
	*extent_start_ret = key.objectid;
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		*size_ret = path->nodes[0]->fs_info->nodesize;
	else
		*size_ret = key.offset;
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}

static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
				      u64 boundary_start, u64 boundary_len)
{
	return (extent_start < boundary_start &&
		extent_start + extent_len > boundary_start) ||
	       (extent_start < boundary_start + boundary_len &&
		extent_start + extent_len > boundary_start + boundary_len);
}

static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
					       struct scrub_parity *sparity,
					       struct map_lookup *map,
					       struct btrfs_device *sdev,
					       struct btrfs_path *path,
					       u64 logical)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
	u64 cur_logical = logical;
	int ret;

	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);

	/* Path must not be populated */
	ASSERT(!path->nodes[0]);

	while (cur_logical < logical + BTRFS_STRIPE_LEN) {
		struct btrfs_io_context *bioc = NULL;
		struct btrfs_device *extent_dev;
		u64 extent_start;
		u64 extent_size;
		u64 mapped_length;
		u64 extent_flags;
		u64 extent_gen;
		u64 extent_physical;
		u64 extent_mirror_num;

		ret = find_first_extent_item(extent_root, path, cur_logical,
					     logical + BTRFS_STRIPE_LEN - cur_logical);
		/* No more extent item in this data stripe */
		if (ret > 0) {
			ret = 0;
			break;
		}
		if (ret < 0)
			break;
		get_extent_info(path, &extent_start, &extent_size, &extent_flags,
				&extent_gen);

		/* Metadata should not cross stripe boundaries */
		if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
		    does_range_cross_boundary(extent_start, extent_size,
					      logical, BTRFS_STRIPE_LEN)) {
			btrfs_err(fs_info,
	"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
				  extent_start, logical);
			spin_lock(&sctx->stat_lock);
			sctx->stat.uncorrectable_errors++;
			spin_unlock(&sctx->stat_lock);
			cur_logical += extent_size;
			continue;
		}

		/* Skip hole range which doesn't have any extent */
		cur_logical = max(extent_start, cur_logical);

		/* Truncate the range inside this data stripe */
		extent_size = min(extent_start + extent_size,
				  logical + BTRFS_STRIPE_LEN) - cur_logical;
		extent_start = cur_logical;
		ASSERT(extent_size <= U32_MAX);

		scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);

		mapped_length = extent_size;
		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
				      &mapped_length, &bioc, 0);
		if (!ret && (!bioc || mapped_length < extent_size))
			ret = -EIO;
		if (ret) {
			btrfs_put_bioc(bioc);
			scrub_parity_mark_sectors_error(sparity, extent_start,
							extent_size);
			break;
		}
		extent_physical = bioc->stripes[0].physical;
		extent_mirror_num = bioc->mirror_num;
		extent_dev = bioc->stripes[0].dev;
		btrfs_put_bioc(bioc);

		ret = btrfs_lookup_csums_list(csum_root, extent_start,
					      extent_start + extent_size - 1,
					      &sctx->csum_list, 1, false);
		if (ret) {
			scrub_parity_mark_sectors_error(sparity, extent_start,
							extent_size);
			break;
		}

		ret = scrub_extent_for_parity(sparity, extent_start,
					      extent_size, extent_physical,
					      extent_dev, extent_flags,
					      extent_gen, extent_mirror_num);
		scrub_free_csums(sctx);

		if (ret) {
			scrub_parity_mark_sectors_error(sparity, extent_start,
							extent_size);
			break;
		}

		cond_resched();
		cur_logical += extent_size;
	}
	btrfs_release_path(path);
	return ret;
}

static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
						  struct map_lookup *map,
						  struct btrfs_device *sdev,
						  u64 logic_start,
						  u64 logic_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_path *path;
	u64 cur_logical;
	int ret;
	struct scrub_parity *sparity;
	int nsectors;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}
	path->search_commit_root = 1;
	path->skip_locking = 1;

	nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	ASSERT(nsectors <= BITS_PER_LONG);
	sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
	if (!sparity) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_free_path(path);
		return -ENOMEM;
	}

	sparity->stripe_len = BTRFS_STRIPE_LEN;
	sparity->nsectors = nsectors;
	sparity->sctx = sctx;
	sparity->scrub_dev = sdev;
	sparity->logic_start = logic_start;
	sparity->logic_end = logic_end;
	refcount_set(&sparity->refs, 1);
	INIT_LIST_HEAD(&sparity->sectors_list);

	ret = 0;
	for (cur_logical = logic_start; cur_logical < logic_end;
	     cur_logical += BTRFS_STRIPE_LEN) {
		ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
							  sdev, path, cur_logical);
		if (ret < 0)
			break;
	}

	scrub_parity_put(sparity);
	scrub_submit(sctx);
	mutex_lock(&sctx->wr_lock);
	scrub_wr_submit(sctx);
	mutex_unlock(&sctx->wr_lock);

	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

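/*
 * For zoned dev-replace: flush all queued read and write bios and wait for
 * them, so that the copy to the zoned target device is issued strictly in
 * order before the scrub moves on.
 */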
static void sync_replace_for_zoned(struct scrub_ctx *sctx)
{
	if (!btrfs_is_zoned(sctx->fs_info))
		return;

	sctx->flush_all_writes = true;
	scrub_submit(sctx);
	mutex_lock(&sctx->wr_lock);
	scrub_wr_submit(sctx);
	mutex_unlock(&sctx->wr_lock);

	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
}

static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
					u64 physical, u64 physical_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);

	mutex_lock(&sctx->wr_lock);
	if (sctx->write_pointer < physical_end) {
		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
						    physical,
						    sctx->write_pointer);
		if (ret)
			btrfs_err(fs_info,
				  "zoned: failed to recover write pointer");
	}
	mutex_unlock(&sctx->wr_lock);
	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

	return ret;
}

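/*
 * Mark every sector of @stripe covered by [@extent_start, @extent_start +
 * @extent_len) in the extent sector bitmap, and record the generation of
 * metadata sectors so they can be verified later.
 */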
static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
				 struct scrub_stripe *stripe,
				 u64 extent_start, u64 extent_len,
				 u64 extent_flags, u64 extent_gen)
{
	for (u64 cur_logical = max(stripe->logical, extent_start);
	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
			       extent_start + extent_len);
	     cur_logical += fs_info->sectorsize) {
		const int nr_sector = (cur_logical - stripe->logical) >>
				      fs_info->sectorsize_bits;
		struct scrub_sector_verification *sector =
						&stripe->sectors[nr_sector];

		set_bit(nr_sector, &stripe->extent_sector_bitmap);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			sector->is_metadata = true;
			sector->generation = extent_gen;
		}
	}
}

static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	stripe->extent_sector_bitmap = 0;
	stripe->init_error_bitmap = 0;
	stripe->error_bitmap = 0;
	stripe->io_error_bitmap = 0;
	stripe->csum_error_bitmap = 0;
	stripe->meta_error_bitmap = 0;
}

/*
 * Locate one stripe which has at least one extent in its range.
 *
 * Return 0 if found such stripe, and store its info into @stripe.
 * Return >0 if there is no such stripe in the specified range.
 * Return <0 for error.
 */
int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
				 struct btrfs_device *dev, u64 physical,
				 int mirror_num, u64 logical_start,
				 u32 logical_len, struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_len;
	struct btrfs_path path = { 0 };
	u64 cur_logical = logical_start;
	u64 stripe_end;
	u64 extent_start;
	u64 extent_len;
	u64 extent_flags;
	u64 extent_gen;
	int ret;

	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
				   stripe->nr_sectors);
	scrub_stripe_reset_bitmaps(stripe);

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

	path.search_commit_root = 1;
	path.skip_locking = 1;

	ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
	/* Either error or not found. */
	if (ret)
		goto out;
	get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		stripe->nr_meta_extents++;
	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		stripe->nr_data_extents++;
	cur_logical = max(extent_start, cur_logical);

	/*
	 * Round down to stripe boundary.
	 *
	 * The extra calculation against bg->start is to handle block groups
	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
	 */
	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
			  bg->start;
	stripe->physical = physical + stripe->logical - logical_start;
	stripe->dev = dev;
	stripe->bg = bg;
	stripe->mirror_num = mirror_num;
	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

	/* Fill the first extent info into stripe->sectors[] array. */
	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
			     extent_flags, extent_gen);
	cur_logical = extent_start + extent_len;

	/* Fill the extent info for the remaining sectors. */
	while (cur_logical <= stripe_end) {
		ret = find_first_extent_item(extent_root, &path, cur_logical,
					     stripe_end - cur_logical + 1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
		get_extent_info(&path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			stripe->nr_meta_extents++;
		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
			stripe->nr_data_extents++;
		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
				     extent_flags, extent_gen);
		cur_logical = extent_start + extent_len;
	}

	/* Now fill the data csum. */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
		int sector_nr;
		unsigned long csum_bitmap = 0;

		/* Csum space should have already been allocated. */
		ASSERT(stripe->csums);

		/*
		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
		 * should contain at most 16 sectors.
		 */
		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

		ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
						stripe_end, stripe->csums,
						&csum_bitmap, true);
		if (ret < 0)
			goto out;
		if (ret > 0)
			ret = 0;

		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
			stripe->sectors[sector_nr].csum = stripe->csums +
				sector_nr * fs_info->csum_size;
		}
	}
	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
out:
	btrfs_release_path(&path);
	return ret;
}

/*
 * Scrub one range which can only have a simple mirror based profile.
 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
 *  RAID0/RAID10).
 *
 * Since we may need to handle a subset of a block group, we need the
 * @logical_start and @logical_length parameters.
 */
static int scrub_simple_mirror(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct map_lookup *map,
			       u64 logical_start, u64 logical_length,
			       struct btrfs_device *device,
			       u64 physical, int mirror_num)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_length;
	/* An artificial limit, inherit from old scrub behavior */
	const u32 max_length = SZ_64K;
	struct btrfs_path path = { 0 };
	u64 cur_logical = logical_start;
	int ret;

	/* The range must be inside the bg */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

	path.search_commit_root = 1;
	path.skip_locking = 1;
	/* Go through each extent item inside the logical range */
	while (cur_logical < logical_end) {
		u64 extent_start;
		u64 extent_len;
		u64 extent_flags;
		u64 extent_gen;
		u64 scrub_len;

		/* Canceled? */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sctx->cancel_req)) {
			ret = -ECANCELED;
			break;
		}
		/* Paused? */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* Push queued extents */
			sctx->flush_all_writes = true;
			scrub_submit(sctx);
			mutex_lock(&sctx->wr_lock);
			scrub_wr_submit(sctx);
			mutex_unlock(&sctx->wr_lock);
			wait_event(sctx->list_wait,
				   atomic_read(&sctx->bios_in_flight) == 0);
			sctx->flush_all_writes = false;
			scrub_blocked_if_needed(fs_info);
		}
		/* Block group removed? */
		spin_lock(&bg->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
			spin_unlock(&bg->lock);
			ret = 0;
			break;
		}
		spin_unlock(&bg->lock);

		ret = find_first_extent_item(extent_root, &path, cur_logical,
					     logical_end - cur_logical);
		if (ret > 0) {
			/* No more extent, just update the accounting */
			sctx->stat.last_physical = physical + logical_length;
			ret = 0;
			break;
		}
		if (ret < 0)
			break;
		get_extent_info(&path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		/* Skip hole range which doesn't have any extent */
		cur_logical = max(extent_start, cur_logical);

		/*
		 * Scrub len has three limits:
		 * - Extent size limit
		 * - Scrub range limit
		 *   This is especially important for RAID0/RAID10 to reuse
		 *   this function
		 * - Max scrub size limit
		 */
		scrub_len = min(min(extent_start + extent_len,
				    logical_end), cur_logical + max_length) -
			    cur_logical;

		if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = btrfs_lookup_csums_list(csum_root, cur_logical,
					cur_logical + scrub_len - 1,
					&sctx->csum_list, 1, false);
			if (ret)
				break;
		}
		if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
		    does_range_cross_boundary(extent_start, extent_len,
					      logical_start, logical_length)) {
			btrfs_err(fs_info,
"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
				  extent_start, logical_start, logical_end);
			spin_lock(&sctx->stat_lock);
			sctx->stat.uncorrectable_errors++;
			spin_unlock(&sctx->stat_lock);
			cur_logical += scrub_len;
			continue;
		}
		ret = scrub_extent(sctx, map, cur_logical, scrub_len,
				   cur_logical - logical_start + physical,
				   device, extent_flags, extent_gen,
				   mirror_num);
		scrub_free_csums(sctx);
		if (ret)
			break;
		if (sctx->is_dev_replace)
			sync_replace_for_zoned(sctx);
		cur_logical += scrub_len;
		/* Don't hold CPU for too long time */
		cond_resched();
	}
	btrfs_release_path(&path);
	return ret;
}

/* Calculate the full stripe length for simple stripe based profiles */
static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));

	return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
}

/* Get the logical bytenr for the stripe */
static u64 simple_stripe_get_logical(struct map_lookup *map,
				     struct btrfs_block_group *bg,
				     int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/*
	 * (stripe_index / sub_stripes) gives how many data stripes we need to
	 * skip.
	 */
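	/*
	 * Illustrative example (hypothetical chunk): RAID10 with
	 * num_stripes = 4 and sub_stripes = 2 keeps two copies of each data
	 * stripe, so stripe_index 0/1 map to the first stripe length of the
	 * block group and stripe_index 2/3 to the second, i.e. we skip
	 * stripe_index / sub_stripes stripe lengths from bg->start.
	 */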
	return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
	       bg->start;
}

/* Get the mirror number for the stripe */
static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/* For RAID0 it's fixed to 1, for RAID10 the result alternates 1,2,1,2... */
	return stripe_index % map->sub_stripes + 1;
}

static int scrub_simple_stripe(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct map_lookup *map,
			       struct btrfs_device *device,
			       int stripe_index)
{
	const u64 logical_increment = simple_stripe_full_stripe_len(map);
	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
	const u64 orig_physical = map->stripes[stripe_index].physical;
	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
	u64 cur_logical = orig_logical;
	u64 cur_physical = orig_physical;
	int ret = 0;

	while (cur_logical < bg->start + bg->length) {
		/*
		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
		 * this stripe.
		 */
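		/*
		 * Illustrative walk (hypothetical chunk): for RAID0 over two
		 * devices, logical_increment is two stripe lengths, so this
		 * device sees every other BTRFS_STRIPE_LEN of the logical
		 * range while its physical offset advances by only one
		 * stripe length per iteration.
		 */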
		ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
					  BTRFS_STRIPE_LEN, device, cur_physical,
					  mirror_num);
		if (ret)
			return ret;
		/* Skip to next stripe which belongs to the target device */
		cur_logical += logical_increment;
		/* For physical offset, we just go to next stripe */
		cur_physical += BTRFS_STRIPE_LEN;
	}
	return ret;
}

static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct btrfs_block_group *bg,
					   struct extent_map *em,
					   struct btrfs_device *scrub_dev,
					   int stripe_index)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct blk_plug plug;
	struct map_lookup *map = em->map_lookup;
	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const u64 chunk_logical = bg->start;
	int ret;
	u64 physical = map->stripes[stripe_index].physical;
	const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
	const u64 physical_end = physical + dev_stripe_len;
	u64 logical;
	u64 logic_end;
	/* The logical increment after finishing one stripe */
	u64 increment;
	/* Offset inside the chunk */
	u64 offset;
	u64 stripe_logical;
	u64 stripe_end;
	int stop_loop = 0;

	wait_event(sctx->list_wait,
		   atomic_read(&sctx->bios_in_flight) == 0);
	scrub_blocked_if_needed(fs_info);

	/*
	 * Collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB.
	 */
	blk_start_plug(&plug);

	if (sctx->is_dev_replace &&
	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
		mutex_lock(&sctx->wr_lock);
		sctx->write_pointer = physical;
		mutex_unlock(&sctx->wr_lock);
		sctx->flush_all_writes = true;
	}

	/*
	 * There used to be a big double loop to handle all profiles using the
	 * same routine, which grows larger and more gross over time.
	 *
	 * So here we handle each profile differently, so simpler profiles
	 * have a simpler scrubbing function.
	 */
	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * The above check rules out all complex profiles, the remaining
		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
		 * mirrored duplication without striping.
		 *
		 * Only @physical and @mirror_num need to be calculated using
		 * @stripe_index.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
				scrub_dev, map->stripes[stripe_index].physical,
				stripe_index + 1);
		offset = 0;
		goto out;
	}
	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
		offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
		goto out;
	}

	/* Only RAID56 goes through the old code */
	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
	ret = 0;

	/* Calculate the logical end of the stripe */
	get_raid56_logic_offset(physical_end, stripe_index,
				map, &logic_end, NULL);
	logic_end += chunk_logical;

	/* Initialize @offset in case we need to go to out: label */
	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
	increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;

	/*
	 * Due to the rotation, for RAID56 it's better to iterate over the
	 * stripes using their physical offsets.
	 */
	while (physical < physical_end) {
		ret = get_raid56_logic_offset(physical, stripe_index, map,
					      &logical, &stripe_logical);
		logical += chunk_logical;
		if (ret) {
			/* It is a parity stripe */
			stripe_logical += chunk_logical;
			stripe_end = stripe_logical + increment;
			ret = scrub_raid56_parity(sctx, map, scrub_dev,
						  stripe_logical,
						  stripe_end);
			if (ret)
				goto out;
			goto next;
		}

		/*
		 * Now we're at a data stripe, scrub each extent in the range.
		 *
		 * At this stage, if we ignore the repair part, inside each data
		 * stripe it is no different than the SINGLE profile.
		 * We can reuse scrub_simple_mirror() here, as the repair part
		 * is still based on @mirror_num.
		 */
		ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
					  scrub_dev, physical, 1);
		if (ret < 0)
			goto out;
next:
		logical += increment;
		physical += BTRFS_STRIPE_LEN;
		spin_lock(&sctx->stat_lock);
		if (stop_loop)
			sctx->stat.last_physical =
				map->stripes[stripe_index].physical + dev_stripe_len;
		else
			sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
		if (stop_loop)
			break;
	}
out:
	/* push queued extents */
	scrub_submit(sctx);
	mutex_lock(&sctx->wr_lock);
	scrub_wr_submit(sctx);
	mutex_unlock(&sctx->wr_lock);

	blk_finish_plug(&plug);

	if (sctx->is_dev_replace && ret >= 0) {
		int ret2;

		ret2 = sync_write_pointer_for_zoned(sctx,
				chunk_logical + offset,
				map->stripes[stripe_index].physical,
				physical_end);
		if (ret2)
			ret = ret2;
	}

	return ret < 0 ? ret : 0;
}

static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
					  struct btrfs_block_group *bg,
					  struct btrfs_device *scrub_dev,
					  u64 dev_offset,
					  u64 dev_extent_len)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = 0;

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, bg->start, bg->length);
	read_unlock(&map_tree->lock);

	if (!em) {
		/*
		 * Might have been an unused block group deleted by the cleaner
		 * kthread or relocation.
		 */
		spin_lock(&bg->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
			ret = -EINVAL;
		spin_unlock(&bg->lock);

		return ret;
	}
	if (em->start != bg->start)
		goto out;
	if (em->len < dev_extent_len)
		goto out;

	map = em->map_lookup;
	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}

static int finish_extent_writes_for_zoned(struct btrfs_root *root,
					  struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_trans_handle *trans;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	return btrfs_commit_transaction(trans);
}

static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			   struct btrfs_device *scrub_dev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	u64 chunk_offset;
	int ret = 0;
	int ro_set;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group *cache;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = scrub_dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		u64 dev_extent_len;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					break;
				if (ret > 0) {
					ret = 0;
					break;
				}
			} else {
				ret = 0;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != scrub_dev->devid)
			break;

		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + dev_extent_len <= start)
			goto skip;

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);

		/*
		 * Some chunks are removed but not committed to disk yet,
		 * continue scrubbing.
		 */
		if (!cache)
			goto skip;

		ASSERT(cache->start <= chunk_offset);
		/*
		 * We are using the commit root to search for device extents, so
		 * that means we could have found a device extent item from a
		 * block group that was deleted in the current transaction. The
		 * logical start offset of the deleted block group, stored at
		 * @chunk_offset, might be part of the logical address range of
		 * a new block group (which uses different physical extents).
		 * In this case btrfs_lookup_block_group() has returned the new
		 * block group, and its start address is less than @chunk_offset.
		 *
		 * We skip such new block groups, because it's pointless to
		 * process them, as we won't find their extents because we search
		 * for them using the commit root of the extent tree. For a device
		 * replace it's also fine to skip it, we won't miss copying them
		 * to the target device because we have the write duplication
		 * setup through the regular write path (by btrfs_map_block()),
		 * and we have committed a transaction when we started the device
		 * replace, right after setting up the device replace state.
		 */
		if (cache->start < chunk_offset) {
			btrfs_put_block_group(cache);
			goto skip;
		}

		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
				btrfs_put_block_group(cache);
				goto skip;
			}
		}

		/*
		 * Make sure that while we are scrubbing the corresponding block
		 * group doesn't get its logical address and its device extents
		 * reused for another block group, which can possibly be of a
		 * different type and different profile. We do this to prevent
		 * false error detections and crashes due to bogus attempts to
		 * repair extents.
		 */
		spin_lock(&cache->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
			spin_unlock(&cache->lock);
			btrfs_put_block_group(cache);
			goto skip;
		}
		btrfs_freeze_block_group(cache);
		spin_unlock(&cache->lock);

		/*
		 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
		 * to avoid deadlock caused by:
		 * btrfs_inc_block_group_ro()
		 * -> btrfs_wait_for_commit()
		 * -> btrfs_commit_transaction()
		 * -> btrfs_scrub_pause()
		 */
		scrub_pause_on(fs_info);

		/*
		 * Don't do chunk preallocation for scrub.
		 *
		 * This is especially important for SYSTEM bgs, or we can hit
		 * -EFBIG from btrfs_finish_chunk_alloc() like:
		 * 1. The only SYSTEM bg is marked RO.
		 *    Since SYSTEM bg is small, that's pretty common.
		 * 2. New SYSTEM bg will be allocated
		 *    Due to regular version will allocate new chunk.
		 * 3. New SYSTEM bg is empty and will get cleaned up
		 *    Before cleanup really happens, it's marked RO again.
		 * 4. Empty SYSTEM bg get scrubbed
		 *    We go back to 2.
		 *
		 * This can easily boost the amount of SYSTEM chunks if cleaner
		 * thread can't be triggered fast enough, and use up all space
		 * of btrfs_super_block::sys_chunk_array
		 *
		 * While for dev replace, we need to try our best to mark block
		 * group RO, to prevent race between:
		 * - Write duplication
		 *   Contains latest data
		 * - Scrub copy
		 *   Contains data from commit tree
		 *
		 * If target block group is not marked RO, nocow writes can
		 * be overwritten by scrub copy, causing data corruption.
		 * So for dev-replace, it's not allowed to continue if a block
		 * group is not RO.
5004
		 */
		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
		if (!ret && sctx->is_dev_replace) {
			ret = finish_extent_writes_for_zoned(root, cache);
			if (ret) {
				btrfs_dec_block_group_ro(cache);
				scrub_pause_off(fs_info);
				btrfs_put_block_group(cache);
				break;
			}
		}

		if (ret == 0) {
			ro_set = 1;
		} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
			/*
			 * btrfs_inc_block_group_ro() returns -ENOSPC when it
			 * fails to create a new chunk for metadata.
			 * That is not a problem for scrub, because metadata
			 * is always COWed, and our scrub pauses transaction
			 * commits.
			 */
			ro_set = 0;
		} else if (ret == -ETXTBSY) {
			btrfs_warn(fs_info,
		   "skipping scrub of block group %llu due to active swapfile",
				   cache->start);
			scrub_pause_off(fs_info);
			ret = 0;
			goto skip_unfreeze;
		} else {
			btrfs_warn(fs_info,
				   "failed setting block group ro: %d", ret);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
			scrub_pause_off(fs_info);
			break;
		}

		/*
		 * Now the target block is marked RO, wait for nocow writes to
		 * finish before dev-replace.
		 * COW is fine, as COW never overwrites extents in commit tree.
		 */
		if (sctx->is_dev_replace) {
			btrfs_wait_nocow_writers(cache);
			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
					cache->length);
		}

		scrub_pause_off(fs_info);
		down_write(&dev_replace->rwsem);
		dev_replace->cursor_right = found_key.offset + dev_extent_len;
		dev_replace->cursor_left = found_key.offset;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
				  dev_extent_len);

		/*
		 * Flush and submit all pending read and write bios, then
		 * wait for them.
		 * Note that in the dev replace case, a read request causes
		 * write requests that are submitted in the read completion
		 * worker. Therefore in the current situation, it is required
		 * that all write requests are flushed, so that all read and
		 * write requests are really completed when bios_in_flight
		 * changes to 0.
		 */
		sctx->flush_all_writes = true;
		scrub_submit(sctx);
		mutex_lock(&sctx->wr_lock);
		scrub_wr_submit(sctx);
		mutex_unlock(&sctx->wr_lock);

		wait_event(sctx->list_wait,
			   atomic_read(&sctx->bios_in_flight) == 0);

		scrub_pause_on(fs_info);

		/*
		 * Must be called before we decrease @scrub_paused.
		 * Make sure we don't block transaction commit while
		 * we are waiting for pending workers to finish.
		 */
		wait_event(sctx->list_wait,
			   atomic_read(&sctx->workers_pending) == 0);
		sctx->flush_all_writes = false;

		scrub_pause_off(fs_info);

		if (sctx->is_dev_replace &&
		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
						      cache, found_key.offset))
			ro_set = 0;

		down_write(&dev_replace->rwsem);
		dev_replace->cursor_left = dev_replace->cursor_right;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		if (ro_set)
			btrfs_dec_block_group_ro(cache);

		/*
		 * We might have prevented the cleaner kthread from deleting
		 * this block group if it was already unused because we raced
		 * and set it to RO mode first. So add it back to the unused
		 * list, otherwise it might not ever be deleted unless a manual
		 * balance is triggered or it becomes used and unused again.
		 */
		spin_lock(&cache->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
			spin_unlock(&cache->lock);
			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_discard_queue_work(&fs_info->discard_ctl,
							 cache);
			else
				btrfs_mark_bg_unused(cache);
		} else {
			spin_unlock(&cache->lock);
		}
skip_unfreeze:
		btrfs_unfreeze_block_group(cache);
		btrfs_put_block_group(cache);
		if (ret)
			break;
		if (sctx->is_dev_replace &&
		    atomic64_read(&dev_replace->num_write_errors) > 0) {
			ret = -EIO;
			break;
		}
		if (sctx->stat.malloc_errors > 0) {
			ret = -ENOMEM;
			break;
		}
skip:
		key.offset = found_key.offset + dev_extent_len;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	return ret;
}

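/*
 * Read one super block copy of @dev at offset @physical into @page and verify
 * its checksum, generation and overall validity.
 *
 * Returns 0 if the copy is valid, a negative errno if the read failed or the
 * copy is corrupted.
 */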
static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
			   struct page *page, u64 physical, u64 generation)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct bio_vec bvec;
	struct bio bio;
	struct btrfs_super_block *sb = page_address(page);
	int ret;

	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
	__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
	ret = submit_bio_wait(&bio);
	bio_uninit(&bio);

	if (ret < 0)
		return ret;
	ret = btrfs_check_super_csum(fs_info, sb);
	if (ret != 0) {
		btrfs_err_rl(fs_info,
			"super block at physical %llu devid %llu has bad csum",
			physical, dev->devid);
		return -EIO;
	}
	if (btrfs_super_generation(sb) != generation) {
		btrfs_err_rl(fs_info,
"super block at physical %llu devid %llu has bad generation %llu expect %llu",
			     physical, dev->devid,
			     btrfs_super_generation(sb), generation);
		return -EUCLEAN;
	}

	return btrfs_validate_super(fs_info, sb, -1);
}

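/*
 * Scrub all super block copies of @scrub_dev that fit within the committed
 * device size.  Bad copies are only accounted in sctx->stat.super_errors here;
 * rewriting them is left to a later transaction commit by the caller.
 */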
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
	int	i;
	u64	bytenr;
	u64	gen;
	int ret = 0;
	struct page *page;
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* Seed devices of a new filesystem have their own generation. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		gen = scrub_dev->generation;
	else
		gen = fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >
		    scrub_dev->commit_total_bytes)
			break;
		if (!btrfs_check_super_location(scrub_dev, bytenr))
			continue;

		ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
		}
	}
	__free_page(page);
	return 0;
}

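/*
 * Drop one reference on the scrub workqueues and destroy them once the last
 * reference is gone (protected by fs_info->scrub_lock).
 */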
static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
					&fs_info->scrub_lock)) {
		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
		struct workqueue_struct *scrub_wr_comp =
						fs_info->scrub_wr_completion_workers;
		struct workqueue_struct *scrub_parity =
						fs_info->scrub_parity_workers;

		fs_info->scrub_workers = NULL;
		fs_info->scrub_wr_completion_workers = NULL;
		fs_info->scrub_parity_workers = NULL;
		mutex_unlock(&fs_info->scrub_lock);

		if (scrub_workers)
			destroy_workqueue(scrub_workers);
		if (scrub_wr_comp)
			destroy_workqueue(scrub_wr_comp);
		if (scrub_parity)
			destroy_workqueue(scrub_parity);
	}
}

/*
 * Get a reference count on fs_info->scrub_workers.  Start the workers if necessary.
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
						int is_dev_replace)
{
	struct workqueue_struct *scrub_workers = NULL;
	struct workqueue_struct *scrub_wr_comp = NULL;
	struct workqueue_struct *scrub_parity = NULL;
	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;
	int ret = -ENOMEM;

	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
		return 0;

	scrub_workers = alloc_workqueue("btrfs-scrub", flags,
					is_dev_replace ? 1 : max_active);
	if (!scrub_workers)
		goto fail_scrub_workers;

	scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
	if (!scrub_wr_comp)
		goto fail_scrub_wr_completion_workers;

	scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
	if (!scrub_parity)
		goto fail_scrub_parity_workers;

	mutex_lock(&fs_info->scrub_lock);
	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
		ASSERT(fs_info->scrub_workers == NULL &&
		       fs_info->scrub_wr_completion_workers == NULL &&
		       fs_info->scrub_parity_workers == NULL);
		fs_info->scrub_workers = scrub_workers;
		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
		fs_info->scrub_parity_workers = scrub_parity;
		refcount_set(&fs_info->scrub_workers_refcnt, 1);
		mutex_unlock(&fs_info->scrub_lock);
		return 0;
	}
	/* Other thread raced in and created the workers for us */
	refcount_inc(&fs_info->scrub_workers_refcnt);
	mutex_unlock(&fs_info->scrub_lock);

	ret = 0;
	destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
	destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
	destroy_workqueue(scrub_workers);
fail_scrub_workers:
	return ret;
}

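/*
 * Scrub (or, for dev-replace, copy) the physical range [start, end] of the
 * device identified by @devid.  Statistics are reported through @progress;
 * with @readonly set, errors are only reported and not repaired.
 */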
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    int readonly, int is_dev_replace)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;
	unsigned int nofs_flag;
	bool need_commit = false;

	if (btrfs_fs_closing(fs_info))
		return -EAGAIN;

	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);

	/*
	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
	 * value (max nodesize / min sectorsize), thus nodesize should always
	 * be fine.
	 */
	ASSERT(fs_info->nodesize <=
	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);

	ret = scrub_workers_get(fs_info, is_dev_replace);
	if (ret)
		goto out_free_ctx;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err_in_rcu(fs_info,
			"scrub on devid %llu: filesystem on %s is not writable",
				 devid, btrfs_dev_name(dev));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * By checking @scrub_pause_req here, we can avoid a race between
	 * a committing transaction and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid deadlock with reclaim when there is a transaction
	 * trying to pause scrub, make sure we use GFP_NOFS for all the
	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
	 * invoked by our callees. The pausing request is done when the
	 * transaction commit starts, and it blocks the transaction until scrub
	 * is paused (done at specific points at scrub_stripe() or right above
	 * before incrementing fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		u64 old_super_errors;

		spin_lock(&sctx->stat_lock);
		old_super_errors = sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);

		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * By holding the device list mutex, we can kick off writing
		 * the super blocks in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		spin_lock(&sctx->stat_lock);
		/*
		 * Super block errors found, but we cannot commit a transaction
		 * in the current context, since btrfs_commit_transaction()
		 * needs to pause the currently running scrub (held by ourselves).
		 */
		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
			need_commit = true;
		spin_unlock(&sctx->stat_lock);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			ret ? "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	/*
	 * We found some super block errors before, now try to force a
	 * transaction commit, as scrub has finished.
	 */
	if (need_commit) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(fs_info->tree_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_err(fs_info,
	"scrub: failed to start transaction to fix super block errors: %d", ret);
			return ret;
		}
		ret = btrfs_commit_transaction(trans);
		if (ret < 0)
			btrfs_err(fs_info,
	"scrub: failed to commit transaction to fix super block errors: %d", ret);
	}
	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);

	return ret;
}

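/*
 * Ask all running scrubs to pause and wait until they have reached their
 * pause points.  Used to keep scrub quiescent across a transaction commit.
 */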
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}

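/* Let paused scrubs resume, undoing a previous btrfs_scrub_pause(). */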
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}

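/*
 * Cancel all running scrubs and wait for them to finish.
 * Returns -ENOTCONN if no scrub was running.
 */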
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

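/*
 * Cancel the scrub running on @dev and wait for it to finish.
 * Returns -ENOTCONN if the device is not being scrubbed.
 */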
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_ctx;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
	while (dev->scrub_ctx) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_ctx == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

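/*
 * Copy the current scrub statistics of the device with @devid into @progress.
 * Returns -ENODEV if the device is not found, -ENOTCONN if it is not being
 * scrubbed.
 */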
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev)
		sctx = dev->scrub_ctx;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}