btrfs: fix dead lock while running replace and defrag concurrently

This can be reproduced by fstests: btrfs/070

The scenario is like the following:

replace worker thread             defrag thread
---------------------             -------------
copy_nocow_pages_worker           btrfs_defrag_file
  copy_nocow_pages_for_inode        ...
                                      btrfs_writepages
  |A| lock_extent_bits                  extent_write_cache_pages
                                        |B| lock_page
                                          __extent_writepage
                                          ...  writepage_delalloc
                                                 find_lock_delalloc_range
                                        |B| lock_extent_bits
  find_or_create_page
    pagecache_get_page
  |A| lock_page

This leads to an ABBA pattern deadlock. To fix it,

  o we just change it to an AABB pattern, which means we
    @unlock_extent_bits() before we @lock_page(). In this way the
    @extent_read_full_page_nolock() is no longer in a locked context,
    so change it back to @extent_read_full_page() to regain protection.

  o Since we @unlock_extent_bits() earlier, by the time we reach
    @write_page_nocow() the extent may no longer point at the physical
    block we want, so we have to check it again before the write.

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
Tested-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>

btrfs: fix dead lock while running replace and defrag concurrently
This can be reproduced by fstests: btrfs/070

The scenario is like the following:

replace worker thread             defrag thread
---------------------             -------------
copy_nocow_pages_worker           btrfs_defrag_file
  copy_nocow_pages_for_inode        ...
                                      btrfs_writepages
  |A| lock_extent_bits                  extent_write_cache_pages
                                        |B| lock_page
                                          __extent_writepage
                                          ...  writepage_delalloc
                                                 find_lock_delalloc_range
                                        |B| lock_extent_bits
  find_or_create_page
    pagecache_get_page
  |A| lock_page

This leads to an ABBA pattern deadlock. To fix it,

  o we just change it to an AABB pattern, which means we
    @unlock_extent_bits() before we @lock_page(). In this way the
    @extent_read_full_page_nolock() is no longer in a locked context,
    so change it back to @extent_read_full_page() to regain protection.

  o Since we @unlock_extent_bits() earlier, by the time we reach
    @write_page_nocow() the extent may no longer point at the physical
    block we want, so we have to check it again before the write.

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
Tested-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
32159242 · Gui Hecheng · Chris Mason · 5f5bc6b1 · 32159242
隐藏空白更改
内联并排

Showing 1 changed file with 60 additions and 30 deletions

fs/btrfs/scrub.c fs/btrfs/scrub.c +60 -30

未找到文件。
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3310,6 +3310,50 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
 	scrub_pending_trans_workers_dec(sctx);
 }

+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+				 u64 logical)
+{
+	struct extent_state *cached_state = NULL;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_io_tree *io_tree;
+	struct extent_map *em;
+	u64 lockstart = start, lockend = start + len - 1;
+	int ret = 0;
+
+	io_tree = &BTRFS_I(inode)->io_tree;
+
+	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		ret = 1;
+		goto out_unlock;
+	}
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_unlock;
+	}
+
+	/*
+	 * This extent does not actually cover the logical extent anymore,
+	 * move on to the next inode.
+	 */
+	if (em->block_start > logical ||
+	    em->block_start + em->block_len < logical + len) {
+		free_extent_map(em);
+		ret = 1;
+		goto out_unlock;
+	}
+	free_extent_map(em);
+
+out_unlock:
+	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+			     GFP_NOFS);
+	return ret;
+}
+
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 				      struct scrub_copy_nocow_ctx *nocow_ctx)
 {
@@ -3318,13 +3362,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 	struct inode *inode;
 	struct page *page;
 	struct btrfs_root *local_root;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_map *em;
-	struct extent_state *cached_state = NULL;
 	struct extent_io_tree *io_tree;
 	u64 physical_for_dev_replace;
+	u64 nocow_ctx_logical;
 	u64 len = nocow_ctx->len;
-	u64 lockstart = offset, lockend = offset + len - 1;
 	unsigned long index;
 	int srcu_index;
 	int ret = 0;
@@ -3356,30 +3397,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,

 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
 	io_tree = &BTRFS_I(inode)->io_tree;
+	nocow_ctx_logical = nocow_ctx->logical;

-	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
-	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
-	if (ordered) {
-		btrfs_put_ordered_extent(ordered);
-		goto out_unlock;
-	}
-
-	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
-	if (IS_ERR(em)) {
-		ret = PTR_ERR(em);
-		goto out_unlock;
-	}
-
-	/*
-	 * This extent does not actually cover the logical extent anymore,
-	 * move on to the next inode.
-	 */
-	if (em->block_start > nocow_ctx->logical ||
-	    em->block_start + em->block_len < nocow_ctx->logical + len) {
-		free_extent_map(em);
-		goto out_unlock;
+	ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+	if (ret) {
+		ret = ret > 0 ? 0 : ret;
+		goto out;
 	}
-	free_extent_map(em);

 	while (len >= PAGE_CACHE_SIZE) {
 		index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +3420,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 				goto next_page;
 		} else {
 			ClearPageError(page);
-			err = extent_read_full_page_nolock(io_tree, page,
+			err = extent_read_full_page(io_tree, page,
 							   btrfs_get_extent,
 							   nocow_ctx->mirror_num);
 			if (err) {
@@ -3421,6 +3445,14 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 				goto next_page;
 			}
 		}
+
+		ret = check_extent_to_block(inode, offset, len,
+					    nocow_ctx_logical);
+		if (ret) {
+			ret = ret > 0 ? 0 : ret;
+			goto next_page;
+		}
+
 		err = write_page_nocow(nocow_ctx->sctx,
 				       physical_for_dev_replace, page);
 		if (err)
@@ -3434,12 +3466,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,

 		offset += PAGE_CACHE_SIZE;
 		physical_for_dev_replace += PAGE_CACHE_SIZE;
+		nocow_ctx_logical += PAGE_CACHE_SIZE;
 		len -= PAGE_CACHE_SIZE;
 	}
 	ret = COPY_COMPLETE;
-out_unlock:
-	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
-			     GFP_NOFS);
 out:
 	mutex_unlock(&inode->i_mutex);
 	iput(inode);