diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e90800388a413fff1f1c058d28649ada282202e4..6557b1b7f89ae2291700cbd87d14659be2a89d30 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include "misc.h" #include "ctree.h" @@ -3379,6 +3380,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); + spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3433,32 +3435,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. */ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; + enum btrfs_block_group_size_class size_class; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + goto out; + } - /* - * Compression can use less space than we reserved, so wake - * tickets if that happens - */ - if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + if (btrfs_is_block_group_data_only(cache)) { + size_class = btrfs_calc_block_group_size_class(num_bytes); + ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); + if (ret) + goto out; } + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake tickets if + * that happens. + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); +out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -4218,3 +4230,64 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } + +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) +{ + if (size <= SZ_128K) + return BTRFS_BG_SZ_SMALL; + if (size <= SZ_8M) + return BTRFS_BG_SZ_MEDIUM; + return BTRFS_BG_SZ_LARGE; +} + +/* + * Handle a block group allocating an extent in a size class + * + * @bg: The block group we allocated in. + * @size_class: The size class of the allocation. + * @force_wrong_size_class: Whether we are desperate enough to allow + * mismatched size classes. + * + * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the + * case of a race that leads to the wrong size class without + * force_wrong_size_class set. + * + * find_free_extent will skip block groups with a mismatched size class until + * it really needs to avoid ENOSPC. In that case it will set + * force_wrong_size_class. However, if a block group is newly allocated and + * doesn't yet have a size class, then it is possible for two allocations of + * different sizes to race and both try to use it. The loser is caught here and + * has to retry. + */ +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class) +{ + ASSERT(size_class != BTRFS_BG_SZ_NONE); + + /* The new allocation is in the right size class, do nothing */ + if (bg->size_class == size_class) + return 0; + /* + * The new allocation is in a mismatched size class. + * This means one of two things: + * + * 1. Two tasks in find_free_extent for different size_classes raced + * and hit the same empty block_group. Make the loser try again. + * 2. A call to find_free_extent got desperate enough to set + * 'force_wrong_slab'. Don't change the size_class, but allow the + * allocation. + */ + if (bg->size_class != BTRFS_BG_SZ_NONE) { + if (force_wrong_size_class) + return 0; + return -EAGAIN; + } + /* + * The happy new block group case: the new allocation is the first + * one in the block_group so we set size_class. + */ + bg->size_class = size_class; + + return 0; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a02ea76fd6cffe6acee57f45bb9406e6b559d46c..2b9d328b589ebad76d6a2e5d562aef673d4dafcd 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -12,6 +12,17 @@ enum btrfs_disk_cache_state { BTRFS_DC_SETUP, }; +enum btrfs_block_group_size_class { + /* Unset */ + BTRFS_BG_SZ_NONE, + /* 0 < size <= 128K */ + BTRFS_BG_SZ_SMALL, + /* 128K < size <= 8M */ + BTRFS_BG_SZ_MEDIUM, + /* 8M < size < BG_LENGTH */ + BTRFS_BG_SZ_LARGE, +}; + /* * This describes the state of the block_group for async discard. This is due * to the two pass nature of it where extent discarding is prioritized over @@ -233,6 +244,7 @@ struct btrfs_block_group { struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; + enum btrfs_block_group_size_class size_class; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -302,7 +314,8 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc, + bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, @@ -346,4 +359,9 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); +enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); +int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, + enum btrfs_block_group_size_class size_class, + bool force_wrong_size_class); + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 203c8cbb9983c1922127043fae717af71531ffae..e9be9430faa089f20a4818357eb213fe23b50604 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3385,7 +3385,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) enum btrfs_loop_type { LOOP_CACHING_NOWAIT, LOOP_CACHING_WAIT, + LOOP_UNSET_SIZE_CLASS, LOOP_ALLOC_CHUNK, + LOOP_WRONG_SIZE_CLASS, LOOP_NO_EMPTY_SIZE, }; @@ -3950,24 +3952,6 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, } } -static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) -{ - switch (ffe_ctl->policy) { - case BTRFS_EXTENT_ALLOC_CLUSTERED: - /* - * If we can't allocate a new chunk we've already looped through - * at least once, move on to the NO_EMPTY_SIZE case. - */ - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - return 0; - case BTRFS_EXTENT_ALLOC_ZONED: - /* Give up here */ - return -ENOSPC; - default: - BUG(); - } -} - /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. @@ -4001,31 +3985,28 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_UNSET_SIZE_CLASS, allow unset size class * LOOP_ALLOC_CHUNK, force a chunk allocation and try again * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have + * any uncached bgs and we've already done a full search + * through. + */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; - } + ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; - /*Check if allocation policy allows to create a new chunk */ + /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; @@ -4045,8 +4026,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) - ret = chunk_allocation_failed(ffe_ctl); + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) btrfs_abort_transaction(trans, ret); else @@ -4076,6 +4059,21 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_is_block_group_data_only(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4210,6 +4208,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; @@ -4346,6 +4345,9 @@ static noinline int find_free_extent(struct btrfs_root *root, if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; + if (!find_free_extent_check_size_class(ffe_ctl, block_group)) + goto loop; + bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { @@ -4380,7 +4382,8 @@ static noinline int find_free_extent(struct btrfs_root *root, ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, - ffe_ctl->delalloc); + ffe_ctl->delalloc, + ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index daa5e3505886e3cd8a319e8753d2bfdd588ba932..0c958fc1b3b8a1a72bf2dd596b3f7d8b53a978fa 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -79,6 +79,9 @@ struct find_free_extent_ctl { /* Whether or not the allocator is currently following a hint */ bool hinted; + + /* Size class of block groups to prefer in early loops */ + enum btrfs_block_group_size_class size_class; }; enum btrfs_inline_ref_type { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index cc2eaab28d580cf5180755b573d34320607ab067..75d7d22c3a276c5a16b8a44b7a5dbe89c1e6cfb4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1349,28 +1349,33 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, TP_STRUCT__entry_btrfs( __field( u64, bg_objectid ) __field( u64, flags ) + __field( int, bg_size_class ) __field( u64, start ) __field( u64, len ) __field( u64, loop ) __field( bool, hinted ) + __field( int, size_class ) ), TP_fast_assign_btrfs(block_group->fs_info, __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; + __entry->bg_size_class = block_group->size_class; __entry->start = ffe_ctl->search_start; __entry->len = ffe_ctl->num_bytes; __entry->loop = ffe_ctl->loop; __entry->hinted = ffe_ctl->hinted; + __entry->size_class = ffe_ctl->size_class; ), TP_printk_btrfs( -"root=%llu(%s) block_group=%llu flags=%llu(%s) start=%llu len=%llu loop=%llu hinted=%d", +"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d", show_root_type(BTRFS_EXTENT_TREE_OBJECTID), __entry->bg_objectid, __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->start, __entry->len, __entry->loop, __entry->hinted) + __entry->bg_size_class, __entry->start, __entry->len, + __entry->loop, __entry->hinted, __entry->size_class) ); DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,