diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 76a843198bcb6bdbb9976b9e2b87982b6c1d99ea..82200dbca5ac4743aab68f86451579df4657cb71 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o + block-rsv.o delalloc-space.o block-group.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 122cb97c79098ec10623a59d1101a191c7038a6a..2e9e13ffbd08290926d9e54e33ddfc4216624623 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -12,9 +12,11 @@ #include "async-thread.h" #include "ctree.h" -#define WORK_DONE_BIT 0 -#define WORK_ORDER_DONE_BIT 1 -#define WORK_HIGH_PRIO_BIT 2 +enum { + WORK_DONE_BIT, + WORK_ORDER_DONE_BIT, + WORK_HIGH_PRIO_BIT, +}; #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c new file mode 100644 index 0000000000000000000000000000000000000000..bf7e3f23bba745528f863066c4d0be5224d8d2e1 --- /dev/null +++ b/fs/btrfs/block-group.c @@ -0,0 +1,3173 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "ctree.h" +#include "block-group.h" +#include "space-info.h" +#include "disk-io.h" +#include "free-space-cache.h" +#include "free-space-tree.h" +#include "disk-io.h" +#include "volumes.h" +#include "transaction.h" +#include "ref-verify.h" +#include "sysfs.h" +#include "tree-log.h" +#include "delalloc-space.h" + +/* + * Return target flags in extended format or 0 if restripe for this chunk_type + * is not in progress + * + * Should be called with balance_lock held + */ +static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + u64 target = 0; + + if (!bctl) + return 0; + + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + return target; +} + +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Return reduced profile in chunk format. If profile changing is in progress + * (either running or paused) picks the target profile (if it's already + * available), otherwise falls back to plain reducing. 
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 num_devices = fs_info->fs_devices->rw_devices;
+	u64 target;
+	u64 raid_type;
+	u64 allowed = 0;
+
+	/*
+	 * See if restripe for this chunk_type is in progress, if so try to
+	 * reduce to the target profile
+	 */
+	spin_lock(&fs_info->balance_lock);
+	target = get_restripe_target(fs_info, flags);
+	if (target) {
+		/* Pick target profile only if it's already available */
+		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+			spin_unlock(&fs_info->balance_lock);
+			return extended_to_chunk(target);
+		}
+	}
+	spin_unlock(&fs_info->balance_lock);
+
+	/* First, mask out the RAID levels which aren't possible */
+	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+			allowed |= btrfs_raid_array[raid_type].bg_flag;
+	}
+	allowed &= flags;
+
+	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+		allowed = BTRFS_BLOCK_GROUP_RAID6;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+		allowed = BTRFS_BLOCK_GROUP_RAID5;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+		allowed = BTRFS_BLOCK_GROUP_RAID10;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+		allowed = BTRFS_BLOCK_GROUP_RAID1;
+	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+		allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	return extended_to_chunk(flags | allowed);
+}
+
+static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+{
+	unsigned seq;
+	u64 flags;
+
+	do {
+		flags = orig_flags;
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			flags |= fs_info->avail_data_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			flags |= fs_info->avail_system_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			flags |= fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
+
+	return btrfs_reduce_alloc_profile(fs_info, flags);
+}
+
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+{
+	return get_alloc_profile(fs_info, orig_flags);
+}
+
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+	atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+
+		/*
+		 * If the tree is not empty, someone is still holding a
+		 * full_stripe_lock mutex, which can only be released by its
+		 * caller, and releasing it after the block group is freed
+		 * would be a use-after-free.
+		 *
+		 * There is no way to resolve this here, so just warn.
+ */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); + kfree(cache->free_space_ctl); + kfree(cache); + } +} + +/* + * This adds the block group to the fs_info rb tree for the block group cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache *block_group_cache_tree_search( + struct btrfs_fs_info *info, u64 bytenr, int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) { + btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * Return the block group that starts at or after bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 0); +} + +/* + * Return the block group that contains the given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr) +{ + return block_group_cache_tree_search(info, bytenr, 1); +} + +struct btrfs_block_group_cache *btrfs_next_block_group( + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct rb_node *node; + + spin_lock(&fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. 
*/
+	if (RB_EMPTY_NODE(&cache->cache_node)) {
+		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+		spin_unlock(&fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
+		return cache;
+	}
+	node = rb_next(&cache->cache_node);
+	btrfs_put_block_group(cache);
+	if (node) {
+		cache = rb_entry(node, struct btrfs_block_group_cache,
+				 cache_node);
+		btrfs_get_block_group(cache);
+	} else
+		cache = NULL;
+	spin_unlock(&fs_info->block_group_cache_lock);
+	return cache;
+}
+
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg;
+	bool ret = true;
+
+	bg = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg)
+		return false;
+
+	spin_lock(&bg->lock);
+	if (bg->ro)
+		ret = false;
+	else
+		atomic_inc(&bg->nocow_writers);
+	spin_unlock(&bg->lock);
+
+	/* No put on block group, done by btrfs_dec_nocow_writers */
+	if (!ret)
+		btrfs_put_block_group(bg);
+
+	return ret;
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg;
+
+	bg = btrfs_lookup_block_group(fs_info, bytenr);
+	ASSERT(bg);
+	if (atomic_dec_and_test(&bg->nocow_writers))
+		wake_up_var(&bg->nocow_writers);
+	/*
+	 * Once for our lookup and once for the lookup done by a previous call
+	 * to btrfs_inc_nocow_writers()
+	 */
+	btrfs_put_block_group(bg);
+	btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{
+	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+					const u64 start)
+{
+	struct btrfs_block_group_cache *bg;
+
+	bg = btrfs_lookup_block_group(fs_info, start);
+	ASSERT(bg);
+	if (atomic_dec_and_test(&bg->reservations))
+		wake_up_var(&bg->reservations);
+	btrfs_put_block_group(bg);
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+	struct btrfs_space_info *space_info = bg->space_info;
+
+	ASSERT(bg->ro);
+
+	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+		return;
+
+	/*
+	 * Our block group is read only but before we set it to read only,
+	 * some task might have allocated an extent from it already, but it
+	 * has not yet created a respective ordered extent (and added it to a
+	 * root's list of ordered extents).
+	 * Therefore wait for any task currently allocating extents, since the
+	 * block group's reservations counter is incremented while a read lock
+	 * on the groups' semaphore is held and decremented after releasing
+	 * the read access on that semaphore and creating the ordered extent.
+	 */
+	down_write(&space_info->groups_sem);
+	up_write(&space_info->groups_sem);
+
+	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
+}
+
+struct btrfs_caching_control *btrfs_get_caching_control(
+		struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *ctl;
+
+	spin_lock(&cache->lock);
+	if (!cache->caching_ctl) {
+		spin_unlock(&cache->lock);
+		return NULL;
+	}
+
+	ctl = cache->caching_ctl;
+	refcount_inc(&ctl->count);
+	spin_unlock(&cache->lock);
+	return ctl;
+}
+
+void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
+{
+	if (refcount_dec_and_test(&ctl->count))
+		kfree(ctl);
+}
+
+/*
+ * When we wait for progress in the block group caching, it's because our
+ * allocation attempt failed at least once. So, we must sleep and let some
+ * progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to show
+ * up, and then it will check the block group free space numbers for our min
+ * num_bytes. Another option is to have it go ahead and look in the rbtree for
+ * a free extent of a given size, but this is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
+ */
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+					   u64 num_bytes)
+{
+	struct btrfs_caching_control *caching_ctl;
+
+	caching_ctl = btrfs_get_caching_control(cache);
+	if (!caching_ctl)
+		return;
+
+	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) ||
+		   (cache->free_space_ctl->free_space >= num_bytes));
+
+	btrfs_put_caching_control(caching_ctl);
+}
+
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_caching_control *caching_ctl;
+	int ret = 0;
+
+	caching_ctl = btrfs_get_caching_control(cache);
+	if (!caching_ctl)
+		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
+
+	wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache));
+	if (cache->cached == BTRFS_CACHE_ERROR)
+		ret = -EIO;
+	btrfs_put_caching_control(caching_ctl);
+	return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	u64 start = block_group->key.objectid;
+	u64 len = block_group->key.offset;
+	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+		fs_info->nodesize : fs_info->sectorsize;
+	u64 step = chunk << 1;
+
+	while (len > chunk) {
+		btrfs_remove_free_space(block_group, start, chunk);
+		start += step;
+		if (len < step)
+			len = 0;
+		else
+			len -= step;
+	}
+}
+#endif
+
+/*
+ * This is only called by btrfs_cache_block_group. Since we could have freed
+ * extents, we need to check the pinned_extents for any extents that can't be
+ * used yet, because their free space will not be released until the
+ * transaction commits.
+ */ +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + u64 start, u64 end) +{ + struct btrfs_fs_info *info = block_group->fs_info; + u64 extent_start, extent_end, size, total_added = 0; + int ret; + + while (start < end) { + ret = find_first_extent_bit(info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL); + if (ret) + break; + + if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); /* -ENOMEM or logic error */ + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + total_added += size; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); /* -ENOMEM or logic error */ + } + + return total_added; +} + +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group_cache *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 total_found = 0; + u64 last = 0; + u32 nritems; + int ret; + bool wakeup = true; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. + */ + if (btrfs_should_fragment_free_space(block_group)) + wakeup = false; +#endif + /* + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. 
So we skip locking and search the commit
+	 * root, since it's read-only.
+	 */
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+	path->reada = READA_FORWARD;
+
+	key.objectid = last;
+	key.offset = 0;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+
+next:
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	while (1) {
+		if (btrfs_fs_closing(fs_info) > 1) {
+			last = (u64)-1;
+			break;
+		}
+
+		if (path->slots[0] < nritems) {
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		} else {
+			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
+			if (ret)
+				break;
+
+			if (need_resched() ||
+			    rwsem_is_contended(&fs_info->commit_root_sem)) {
+				if (wakeup)
+					caching_ctl->progress = last;
+				btrfs_release_path(path);
+				up_read(&fs_info->commit_root_sem);
+				mutex_unlock(&caching_ctl->mutex);
+				cond_resched();
+				mutex_lock(&caching_ctl->mutex);
+				down_read(&fs_info->commit_root_sem);
+				goto next;
+			}
+
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto out;
+			if (ret)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			continue;
+		}
+
+		if (key.objectid < last) {
+			key.objectid = last;
+			key.offset = 0;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+
+			if (wakeup)
+				caching_ctl->progress = last;
+			btrfs_release_path(path);
+			goto next;
+		}
+
+		if (key.objectid < block_group->key.objectid) {
+			path->slots[0]++;
+			continue;
+		}
+
+		if (key.objectid >= block_group->key.objectid +
+		    block_group->key.offset)
+			break;
+
+		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		    key.type == BTRFS_METADATA_ITEM_KEY) {
+			total_found += add_new_free_space(block_group, last,
+							  key.objectid);
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				last = key.objectid +
+					fs_info->nodesize;
+			else
+				last = key.objectid + key.offset;
+
+			if (total_found > CACHING_CTL_WAKE_UP) {
+				total_found = 0;
+				if (wakeup)
+					wake_up(&caching_ctl->wait);
+			}
+		}
+		path->slots[0]++;
+	}
+	ret = 0;
+
+	total_found += add_new_free_space(block_group, last,
+					  block_group->key.objectid +
+					  block_group->key.offset);
+	caching_ctl->progress = (u64)-1;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
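+/*
+ * Background work item that populates a block group's free space cache,
+ * either from the free space tree or by scanning the extent tree, then
+ * marks the group BTRFS_CACHE_FINISHED (or BTRFS_CACHE_ERROR) and wakes
+ * any waiters.
+ */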
+static noinline void caching_thread(struct btrfs_work *work)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	int ret;
+
+	caching_ctl = container_of(work, struct btrfs_caching_control, work);
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+
+	mutex_lock(&caching_ctl->mutex);
+	down_read(&fs_info->commit_root_sem);
+
+	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		ret = load_free_space_tree(caching_ctl);
+	else
+		ret = load_extent_tree_free(caching_ctl);
+
+	spin_lock(&block_group->lock);
+	block_group->caching_ctl = NULL;
+	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
+	spin_unlock(&block_group->lock);
+
+#ifdef CONFIG_BTRFS_DEBUG
+	if (btrfs_should_fragment_free_space(block_group)) {
+		u64 bytes_used;
+
+		spin_lock(&block_group->space_info->lock);
+		spin_lock(&block_group->lock);
+		bytes_used = block_group->key.offset -
+			btrfs_block_group_used(&block_group->item);
+		block_group->space_info->bytes_used += bytes_used >> 1;
+		spin_unlock(&block_group->lock);
+		spin_unlock(&block_group->space_info->lock);
+		fragment_free_space(block_group);
+	}
+#endif
+
+	caching_ctl->progress = (u64)-1;
+
+	up_read(&fs_info->commit_root_sem);
+	btrfs_free_excluded_extents(block_group);
+	mutex_unlock(&caching_ctl->mutex);
+
+	wake_up(&caching_ctl->wait);
+
+	btrfs_put_caching_control(caching_ctl);
+	btrfs_put_block_group(block_group);
+}
+
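+/*
+ * Begin caching a block group's free space, or wait for an in-flight fast
+ * (on-disk space cache) load to settle. With @load_cache_only set, no async
+ * caching work is queued; at most the fast space cache load is attempted.
+ */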
+int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
+			    int load_cache_only)
+{
+	DEFINE_WAIT(wait);
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	int ret = 0;
+
+	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+	if (!caching_ctl)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&caching_ctl->list);
+	mutex_init(&caching_ctl->mutex);
+	init_waitqueue_head(&caching_ctl->wait);
+	caching_ctl->block_group = cache;
+	caching_ctl->progress = cache->key.objectid;
+	refcount_set(&caching_ctl->count, 1);
+	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+			caching_thread, NULL, NULL);
+
+	spin_lock(&cache->lock);
+	/*
+	 * This should be a rare occasion, but it can happen when one thread
+	 * starts to load the space cache info and then some other thread
+	 * starts a transaction commit which tries to do an allocation while
+	 * the first thread is still loading the space cache info. The
+	 * previous loop should have kept us from choosing this block group,
+	 * but if we've moved to the state where we will wait on caching block
+	 * groups, we need to first check if we're doing a fast load here, so
+	 * we can wait for it to finish; otherwise we could end up allocating
+	 * from a block group whose cache gets evicted for one reason or
+	 * another.
+	 */
+	while (cache->cached == BTRFS_CACHE_FAST) {
+		struct btrfs_caching_control *ctl;
+
+		ctl = cache->caching_ctl;
+		refcount_inc(&ctl->count);
+		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&cache->lock);
+
+		schedule();
+
+		finish_wait(&ctl->wait, &wait);
+		btrfs_put_caching_control(ctl);
+		spin_lock(&cache->lock);
+	}
+
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		kfree(caching_ctl);
+		return 0;
+	}
+	WARN_ON(cache->caching_ctl);
+	cache->caching_ctl = caching_ctl;
+	cache->cached = BTRFS_CACHE_FAST;
+	spin_unlock(&cache->lock);
+
+	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+		mutex_lock(&caching_ctl->mutex);
+		ret = load_free_space_cache(cache);
+
+		spin_lock(&cache->lock);
+		if (ret == 1) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_FINISHED;
+			cache->last_byte_to_unpin = (u64)-1;
+			caching_ctl->progress = (u64)-1;
+		} else {
+			if (load_cache_only) {
+				cache->caching_ctl = NULL;
+				cache->cached = BTRFS_CACHE_NO;
+			} else {
+				cache->cached = BTRFS_CACHE_STARTED;
+				cache->has_caching_ctl = 1;
+			}
+		}
+		spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+		if (ret == 1 &&
+		    btrfs_should_fragment_free_space(cache)) {
+			u64 bytes_used;
+
+			spin_lock(&cache->space_info->lock);
+			spin_lock(&cache->lock);
+			bytes_used = cache->key.offset -
+				btrfs_block_group_used(&cache->item);
+			cache->space_info->bytes_used += bytes_used >> 1;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
+			fragment_free_space(cache);
+		}
+#endif
+		mutex_unlock(&caching_ctl->mutex);
+
+		wake_up(&caching_ctl->wait);
+		if (ret == 1) {
+			btrfs_put_caching_control(caching_ctl);
+			btrfs_free_excluded_extents(cache);
+			return 0;
+		}
+	} else {
+		/*
+		 * We're either using the free space tree or no caching at all.
+		 * Set cached to the appropriate value and wake up any waiters.
+		 */
+		spin_lock(&cache->lock);
+		if (load_cache_only) {
+			cache->caching_ctl = NULL;
+			cache->cached = BTRFS_CACHE_NO;
+		} else {
+			cache->cached = BTRFS_CACHE_STARTED;
+			cache->has_caching_ctl = 1;
+		}
+		spin_unlock(&cache->lock);
+		wake_up(&caching_ctl->wait);
+	}
+
+	if (load_cache_only) {
+		btrfs_put_caching_control(caching_ctl);
+		return 0;
+	}
+
+	down_write(&fs_info->commit_root_sem);
+	refcount_inc(&caching_ctl->count);
+	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+	up_write(&fs_info->commit_root_sem);
+
+	btrfs_get_block_group(cache);
+
+	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+
+	return ret;
+}
+
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = chunk_to_extended(flags) &
+			  BTRFS_EXTENDED_PROFILE_MASK;
+
+	write_seqlock(&fs_info->profiles_lock);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits &= ~extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * Clear incompat bits for the following feature(s):
+ *
+ * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
+ *            in the whole filesystem
+ */
+static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		struct list_head *head = &fs_info->space_info;
+		struct btrfs_space_info *sinfo;
+
+		list_for_each_entry_rcu(sinfo, head, list) {
+			bool found = false;
+
+			down_read(&sinfo->groups_sem);
+			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
+				found = true;
+			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
+				found = true;
+			up_read(&sinfo->groups_sem);
+
+			if (found)
+				return;
+		}
+		btrfs_clear_fs_incompat(fs_info, RAID56);
+	}
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     u64 group_start, struct extent_map *em)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_path *path;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_free_cluster *cluster;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_key key;
+	struct inode *inode;
+	struct kobject *kobj = NULL;
+	int ret;
+	int index;
+	int factor;
+	struct btrfs_caching_control *caching_ctl = NULL;
+	bool remove_em;
+	bool remove_rsv = false;
+
+	block_group = btrfs_lookup_block_group(fs_info, group_start);
+	BUG_ON(!block_group);
+	BUG_ON(!block_group->ro);
+
+	trace_btrfs_remove_block_group(block_group);
+	/*
+	 * Free the reserved super bytes from this block group before
+	 * removing it.
+ */ + btrfs_free_excluded_extents(block_group); + btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, + block_group->key.offset); + + memcpy(&key, &block_group->key, sizeof(key)); + index = btrfs_bg_flags_to_raid_index(block_group->flags); + factor = btrfs_bg_type_to_factor(block_group->flags); + + /* make sure this block group isn't part of an allocation cluster */ + cluster = &fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ + inode = lookup_free_space_inode(block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * Make sure our free space cache IO is done before removing the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(trans, block_group, path); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + remove_rsv = true; + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + + if (!IS_ERR(inode)) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + btrfs_add_delayed_iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + + spin_lock(&fs_info->block_group_cache_lock); + rb_erase(&block_group->cache_node, + &fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + + if (fs_info->first_logical_byte == block_group->key.objectid) + fs_info->first_logical_byte = (u64)-1; + spin_unlock(&fs_info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) { + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; + clear_avail_alloc_bits(fs_info, block_group->flags); + } + 
up_write(&block_group->space_info->groups_sem);
+	clear_incompat_bg_bits(fs_info, block_group->flags);
+	if (kobj) {
+		kobject_del(kobj);
+		kobject_put(kobj);
+	}
+
+	if (block_group->has_caching_ctl)
+		caching_ctl = btrfs_get_caching_control(block_group);
+	if (block_group->cached == BTRFS_CACHE_STARTED)
+		btrfs_wait_block_group_cache_done(block_group);
+	if (block_group->has_caching_ctl) {
+		down_write(&fs_info->commit_root_sem);
+		if (!caching_ctl) {
+			struct btrfs_caching_control *ctl;
+
+			list_for_each_entry(ctl,
+				    &fs_info->caching_block_groups, list)
+				if (ctl->block_group == block_group) {
+					caching_ctl = ctl;
+					refcount_inc(&caching_ctl->count);
+					break;
+				}
+		}
+		if (caching_ctl)
+			list_del_init(&caching_ctl->list);
+		up_write(&fs_info->commit_root_sem);
+		if (caching_ctl) {
+			/* Once for the caching bgs list and once for us. */
+			btrfs_put_caching_control(caching_ctl);
+			btrfs_put_caching_control(caching_ctl);
+		}
+	}
+
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	WARN_ON(!list_empty(&block_group->dirty_list));
+	WARN_ON(!list_empty(&block_group->io_list));
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+	btrfs_remove_free_space_cache(block_group);
+
+	spin_lock(&block_group->space_info->lock);
+	list_del_init(&block_group->ro_list);
+
+	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		WARN_ON(block_group->space_info->total_bytes
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->bytes_readonly
+			< block_group->key.offset);
+		WARN_ON(block_group->space_info->disk_total
+			< block_group->key.offset * factor);
+	}
+	block_group->space_info->total_bytes -= block_group->key.offset;
+	block_group->space_info->bytes_readonly -= block_group->key.offset;
+	block_group->space_info->disk_total -= block_group->key.offset * factor;
+
+	spin_unlock(&block_group->space_info->lock);
+
+	memcpy(&key, &block_group->key, sizeof(key));
+
+	mutex_lock(&fs_info->chunk_mutex);
+	spin_lock(&block_group->lock);
+	block_group->removed = 1;
+	/*
+	 * At this point trimming can't start on this block group, because we
+	 * removed the block group from the tree fs_info->block_group_cache_tree
+	 * so no one can find it anymore and even if someone already got this
+	 * block group before we removed it from the rbtree, they have already
+	 * incremented block_group->trimming - if they didn't, they won't find
+	 * any free space entries because we already removed them all when we
+	 * called btrfs_remove_free_space_cache().
+	 *
+	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * to prevent the same logical address range and physical device space
+	 * ranges from being reused for a new block group. This is because our
+	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+	 * completely transactionless, so while it is trimming a range the
+	 * currently running transaction might finish and a new one start,
+	 * allowing for new block groups to be created that can reuse the same
+	 * physical device locations unless we take this special care.
+	 *
+	 * There may also be an implicit trim operation if the file system
+	 * is mounted with -odiscard. The same protections must remain
+	 * in place until the extents have been discarded completely when
+	 * the transaction commit has completed.
+	 */
+	remove_em = (atomic_read(&block_group->trimming) == 0);
+	spin_unlock(&block_group->lock);
+
+	mutex_unlock(&fs_info->chunk_mutex);
+
+	ret = remove_block_group_free_space(trans, block_group);
+	if (ret)
+		goto out;
+
+	btrfs_put_block_group(block_group);
+	btrfs_put_block_group(block_group);
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	if (remove_em) {
+		struct extent_map_tree *em_tree;
+
+		em_tree = &fs_info->mapping_tree;
+		write_lock(&em_tree->lock);
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+out:
+	if (remove_rsv)
+		btrfs_delayed_refs_rsv_release(fs_info, 1);
+	btrfs_free_path(path);
+	return ret;
+}
+
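+/*
+ * Start a transaction with enough metadata units reserved to delete the
+ * block group backed by the chunk at @chunk_offset; see the reservation
+ * breakdown in the comment below.
+ */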
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
+{
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	unsigned int num_items;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	read_unlock(&em_tree->lock);
+	ASSERT(em && em->start == chunk_offset);
+
+	/*
+	 * We need to reserve 3 + N units from the metadata space info in order
+	 * to remove a block group (done at btrfs_remove_chunk() and at
+	 * btrfs_remove_block_group()), which are used for:
+	 *
+	 * 1 unit for adding the free space inode's orphan (located in the tree
+	 * of tree roots).
+	 * 1 unit for deleting the block group item (located in the extent
+	 * tree).
+	 * 1 unit for deleting the free space item (located in tree of tree
+	 * roots).
+	 * N units for deleting N device extent items corresponding to each
+	 * stripe (located in the device tree).
+	 *
+	 * In order to remove a block group we also need to reserve units in the
+	 * system space info in order to update the chunk tree (update one or
+	 * more device items and remove one chunk item), but this is done at
+	 * btrfs_remove_chunk() through a call to check_system_chunk().
+	 */
+	map = em->map_lookup;
+	num_items = 3 + map->num_stripes;
+	free_extent_map(em);
+
+	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+							   num_items, 1);
+}
+
+/*
+ * Mark block group @cache read-only, so later writes won't happen to block
+ * group @cache.
+ *
+ * If @force is not set, this function will only mark the block group readonly
+ * if we have enough free space (1M) in other metadata/system block groups.
+ * If @force is set, this function will mark the block group readonly
+ * without checking free space.
+ *
+ * NOTE: This function doesn't care if other block groups can contain all the
+ * data in this block group. That check should be done by the relocation
+ * routine, not this function.
+ */
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+{
+	struct btrfs_space_info *sinfo = cache->space_info;
+	u64 num_bytes;
+	u64 sinfo_used;
+	u64 min_allocable_bytes;
+	int ret = -ENOSPC;
+
+	/*
+	 * We need some metadata space and system metadata space for
+	 * allocating chunks in some corner cases, until we force the block
+	 * group to be read-only.
+	 */
+	if ((sinfo->flags &
+	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
+	    !force)
+		min_allocable_bytes = SZ_1M;
+	else
+		min_allocable_bytes = 0;
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&cache->lock);
+
+	if (cache->ro) {
+		cache->ro++;
+		ret = 0;
+		goto out;
+	}
+
+	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+	sinfo_used = btrfs_space_info_used(sinfo, true);
+
+	/*
+	 * sinfo_used + num_bytes should always be <= sinfo->total_bytes.
+	 *
+	 * Here we make sure that if we mark this bg RO, we still have enough
+	 * free space as a buffer (if min_allocable_bytes is not 0).
+	 */
+	if (sinfo_used + num_bytes + min_allocable_bytes <=
+	    sinfo->total_bytes) {
+		sinfo->bytes_readonly += num_bytes;
+		cache->ro++;
+		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
+		ret = 0;
+	}
+out:
+	spin_unlock(&cache->lock);
+	spin_unlock(&sinfo->lock);
+	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+		btrfs_info(cache->fs_info,
+			"unable to make block group %llu ro",
+			cache->key.objectid);
+		btrfs_info(cache->fs_info,
+			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+			sinfo_used, num_bytes, min_allocable_bytes);
+		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+	}
+	return ret;
+}
+
+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+		return;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 start, end;
+		int trimming;
+
+		block_group = list_first_entry(&fs_info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+
+		space_info = block_group->space_info;
+
+		if (ret || btrfs_mixed_space_info(space_info)) {
+			btrfs_put_block_group(block_group);
+			continue;
+		}
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
+		/* Don't want to race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+		spin_lock(&block_group->lock);
+		if (block_group->reserved || block_group->pinned ||
+		    btrfs_block_group_used(&block_group->item) ||
+		    block_group->ro ||
+		    list_is_singular(&block_group->list)) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group. We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			trace_btrfs_skip_unused_block_group(block_group);
+			spin_unlock(&block_group->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&block_group->lock);
+
+		/* We don't want to force the issue, only flip if it's ok. */
+		ret = inc_block_group_ro(block_group, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0) {
+			ret = 0;
+			goto next;
+		}
+
+		/*
+		 * Want to do this before we do anything else so we can recover
+		 * properly if we fail to join the transaction.
+		 */
+		trans = btrfs_start_trans_remove_block_group(fs_info,
+						     block_group->key.objectid);
+		if (IS_ERR(trans)) {
+			btrfs_dec_block_group_ro(block_group);
+			ret = PTR_ERR(trans);
+			goto next;
+		}
+
+		/*
+		 * We could have pending pinned extents for this block group;
+		 * just delete them, we don't care about them anymore.
+ */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_dec_block_group_ro(block_group); + goto end_trans; + } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + btrfs_space_info_update_bytes_pinned(fs_info, space_info, + -block_group->pinned); + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -block_group->pinned, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + block_group->pinned = 0; + + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(fs_info, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, block_group->key.objectid); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + spin_lock(&fs_info->unused_bgs_lock); + /* + * A concurrent scrub might have added us to the list + * fs_info->unused_bgs, so use a list_move operation + * to add the block group to the deleted_bgs list. 
+ */ + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_get_block_group(block_group); + } +end_trans: + btrfs_end_transaction(trans); +next: + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +static int find_first_block_group(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_root *root = fs_info->extent_root; + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + struct btrfs_block_group_item bg; + u64 flags; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else if (em->start != found_key.objectid || + em->len != found_key.offset) { + btrfs_err(fs_info, + "block group %llu len %llu mismatch with chunk %llu len %llu", + found_key.objectid, found_key.offset, + em->start, em->len); + ret = -EUCLEAN; + } else { + read_extent_buffer(leaf, &bg, + btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & + BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + found_key.objectid, + found_key.offset, flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & + em->map_lookup->type)); + ret = -EUCLEAN; + } else { + ret = 0; + } + } + free_extent_map(em); + goto out; + } + path->slots[0]++; + } +out: + return ret; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = chunk_to_extended(flags) & + BTRFS_EXTENDED_PROFILE_MASK; + + write_seqlock(&fs_info->profiles_lock); + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); +} + +static int exclude_super_stripes(struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - 
cache->key.objectid; + cache->bytes_super += stripe_len; + ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid, + stripe_len); + if (ret) + return ret; + } + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(fs_info, cache->key.objectid, + bytenr, &logical, &nr, &stripe_len); + if (ret) + return ret; + + while (nr--) { + u64 start, len; + + if (logical[nr] > cache->key.objectid + + cache->key.offset) + continue; + + if (logical[nr] + stripe_len <= cache->key.objectid) + continue; + + start = logical[nr]; + if (start < cache->key.objectid) { + start = cache->key.objectid; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->key.objectid + + cache->key.offset - start); + } + + cache->bytes_super += len; + ret = btrfs_add_excluded_extent(fs_info, start, len); + if (ret) { + kfree(logical); + return ret; + } + } + + kfree(logical); + } + return 0; +} + +static void link_block_group(struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *space_info = cache->space_info; + int index = btrfs_bg_flags_to_raid_index(cache->flags); + bool first = false; + + down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) + btrfs_sysfs_add_block_group_type(cache); +} + +static struct btrfs_block_group_cache *btrfs_create_block_group_cache( + struct btrfs_fs_info *fs_info, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->fs_info = fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); + set_free_space_tree_thresholds(cache); + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); + btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); + + return cache; +} + +/* + * Iterate all chunks and verify that each of them has the corresponding block + * group + */ +static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct btrfs_block_group_cache *bg; + u64 start = 0; + int ret = 0; + + while (1) { + read_lock(&map_tree->lock); + /* + * lookup_extent_mapping will return the first extent map + * intersecting the range, so setting @len to 1 is enough to + * get the first chunk. 
+ */ + em = lookup_extent_mapping(map_tree, start, 1); + read_unlock(&map_tree->lock); + if (!em) + break; + + bg = btrfs_lookup_block_group(fs_info, em->start); + if (!bg) { + btrfs_err(fs_info, + "chunk start=%llu len=%llu doesn't have corresponding block group", + em->start, em->len); + ret = -EUCLEAN; + free_extent_map(em); + break; + } + if (bg->key.objectid != em->start || + bg->key.offset != em->len || + (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != + (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", + em->start, em->len, + em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + bg->key.objectid, bg->key.offset, + bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ret = -EUCLEAN; + free_extent_map(em); + btrfs_put_block_group(bg); + break; + } + start = em->start + em->len; + free_extent_map(em); + btrfs_put_block_group(bg); + } + return ret; +} + +int btrfs_read_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; + u64 feature; + int mixed; + + feature = btrfs_super_incompat_flags(info->super_copy); + mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); + + key.objectid = 0; + key.offset = 0; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + cache_gen = btrfs_super_cache_generation(info->super_copy); + if (btrfs_test_opt(info, SPACE_CACHE) && + btrfs_super_generation(info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(info, CLEAR_CACHE)) + need_clear = 1; + + while (1) { + ret = find_first_block_group(info, path, &key); + if (ret > 0) + break; + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + cache = btrfs_create_block_group_cache(info, found_key.objectid, + found_key.offset); + if (!cache) { + ret = -ENOMEM; + goto error; + } + + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ + if (btrfs_test_opt(info, SPACE_CACHE)) + cache->disk_cache_state = BTRFS_DC_CLEAR; + } + + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + cache->flags = btrfs_block_group_flags(&cache->item); + if (!mixed && + ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(info, +"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", + cache->key.objectid); + ret = -EINVAL; + goto error; + } + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(path); + + /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + ret = exclude_super_stripes(cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. 
+ */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); + goto error; + } + + /* + * Check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _a_lot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + btrfs_free_excluded_extents(cache); + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, found_key.objectid, + found_key.objectid + + found_key.offset); + btrfs_free_excluded_extents(cache); + } + + ret = btrfs_add_block_group_cache(info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + goto error; + } + + trace_btrfs_add_block_group(info, cache, 0); + btrfs_update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); + + cache->space_info = space_info; + + link_block_group(cache); + + set_avail_alloc_bits(info, cache->flags); + if (btrfs_chunk_readonly(info, cache->key.objectid)) { + inc_block_group_ro(cache, 1); + } else if (btrfs_block_group_used(&cache->item) == 0) { + ASSERT(list_empty(&cache->bg_list)); + btrfs_mark_bg_unused(cache); + } + } + + list_for_each_entry_rcu(space_info, &info->space_info, list) { + if (!(btrfs_get_alloc_profile(info, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * Avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) + inc_block_group_ro(cache, 1); + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) + inc_block_group_ro(cache, 1); + } + + btrfs_init_global_block_rsv(info); + ret = check_chunk_block_group_mappings(info); +error: + btrfs_free_path(path); + return ret; +} + +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *block_group; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; + + if (!trans->can_flush_pending_bgs) + return; + + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group_cache, + bg_list); + if (ret) + goto next; + + spin_lock(&block_group->lock); + memcpy(&item, &block_group->item, sizeof(item)); + memcpy(&key, &block_group->key, sizeof(key)); + spin_unlock(&block_group->lock); + + ret = btrfs_insert_item(trans, extent_root, &key, &item, + sizeof(item)); + if (ret) + btrfs_abort_transaction(trans, ret); + ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, ret); + add_block_group_free_space(trans, block_group); + /* Already aborted the transaction if it failed. 
*/ +next: + btrfs_delayed_refs_rsv_release(fs_info, 1); + list_del_init(&block_group->bg_list); + } + btrfs_trans_release_chunk_metadata(trans); +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, + u64 type, u64 chunk_offset, u64 size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache; + int ret; + + btrfs_set_log_full_commit(trans); + + cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); + if (!cache) + return -ENOMEM; + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_block_group_flags(&cache->item, type); + + cache->flags = type; + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; + ret = exclude_super_stripes(cache); + if (ret) { + /* We may have excluded something, so call this just in case */ + btrfs_free_excluded_extents(cache); + btrfs_put_block_group(cache); + return ret; + } + + add_new_free_space(cache, chunk_offset, chunk_offset + size); + + btrfs_free_excluded_extents(cache); + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(cache); + } +#endif + /* + * Ensure the corresponding space_info object is created and + * assigned to our block group. We want our bg to be added to the rbtree + * with its ->space_info set. + */ + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + ASSERT(cache->space_info); + + ret = btrfs_add_block_group_cache(fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. 
+ */ + trace_btrfs_add_block_group(fs_info, cache, 1); + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, + cache->bytes_super, &cache->space_info); + btrfs_update_global_block_rsv(fs_info); + + link_block_group(cache); + + list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); + + set_avail_alloc_bits(fs_info, type); + return 0; +} + +static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 num_devices; + u64 stripped; + + /* + * if restripe for this chunk_type is on pick target profile and + * return, otherwise do the usual balance + */ + stripped = get_restripe_target(fs_info, flags); + if (stripped) + return extended_to_chunk(stripped); + + num_devices = fs_info->fs_devices->rw_devices; + + stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; + + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* this is drive concat, leave it alone */ + } + + return flags; +} + +int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) + +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; + +again: + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; + + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); + + ret = btrfs_wait_for_commit(fs_info, transid); + if (ret) + return ret; + goto again; + } + + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. 
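As a worked example: on a filesystem with a single writable device a RAID1 block group cannot be re-created as RAID1, so update_block_group_flags() (above) turns mirroring into duplication. A sketch, assuming fs_devices->rw_devices == 1 and no restripe target:

	u64 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_RAID1;
	u64 alloc_flags = update_block_group_flags(fs_info, flags);
	/* alloc_flags == (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DUP) */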
+ */ + alloc_flags = update_block_group_flags(fs_info, cache->flags); + if (alloc_flags != cache->flags) { + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } + + ret = inc_block_group_ro(cache, 0); + if (!ret) + goto out; + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; + ret = inc_block_group_ro(cache, 0); +out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(fs_info, cache->flags); + mutex_lock(&fs_info->chunk_mutex); + check_system_chunk(trans, alloc_flags); + mutex_unlock(&fs_info->chunk_mutex); + } + mutex_unlock(&fs_info->ro_block_group_mutex); + + btrfs_end_transaction(trans); + return ret; +} + +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) +{ + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + if (!--cache->ro) { + num_bytes = cache->key.offset - cache->reserved - + cache->pinned - cache->bytes_super - + btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_root *extent_root = fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto fail; + } + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); +fail: + btrfs_release_path(path); + return ret; + +} + +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; + u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; + u64 num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * SZ_1M)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + + if (trans->aborted) + return 0; +again: + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. 
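The zeroed generation effectively poisons the cache for the next mount. A hypothetical sketch of the idea behind the load-time check (the real validation lives in the free space cache loading code in free-space-cache.c):

	static bool example_cache_generation_valid(struct inode *inode,
						   u64 cache_gen)
	{
		/* generation 0 never matches a live cache generation */
		return cache_gen != 0 &&
		       BTRFS_I(inode)->generation == cache_gen;
	}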
+ */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, ret); + goto out_put; + } + WARN_ON(ret); + + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + + if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(fs_info, + &fs_info->global_block_rsv); + if (ret) + goto out_put; + + ret = btrfs_truncate_free_space_cache(trans, NULL, inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED || + !btrfs_test_opt(fs_info, SPACE_CACHE)) { + /* + * don't bother trying to write stuff out _if_ + * a) we're not cached, + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). + */ + dcs = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* + * Try to preallocate enough space based on how big the block group is. + * Keep in mind this has to include any pinned space which could end up + * taking up quite a bit since it's not folded into the other space + * cache. + */ + num_pages = div_u64(block_group->key.offset, SZ_256M); + if (!num_pages) + num_pages = 1; + + num_pages *= 16; + num_pages *= PAGE_SIZE; + + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. 
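The sizing above works out to 16 pages of cache space per 256MiB of block group. A worked example, assuming 4KiB pages and a 1GiB block group:

	u64 num_pages = div_u64(SZ_1G, SZ_256M);	/* 4 */
	num_pages *= 16;				/* 64 pages */
	num_pages *= PAGE_SIZE;				/* 64 * 4096 = 256KiB to preallocate */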
+ */ + if (!ret) + dcs = BTRFS_DC_SETUP; + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + +out_put: + iput(inode); +out_free: + btrfs_release_path(path); +out: + spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); + + extent_changeset_free(data_reserved); + return ret; +} + +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(fs_info, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + +/* + * Transaction commit does final block group cache writeback during a critical + * section where nothing is allowed to change the FS. This is required in + * order for the cache to actually match the block group, but can introduce a + * lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. + * There's a chance we'll have to redo some of it if the block group changes + * again during the commit, but it greatly reduces the commit latency by + * getting rid of the easy block groups while we're still allowing others to + * join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* Make sure all the block groups on our dirty list actually exist */ + btrfs_create_pending_block_groups(trans); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + bool drop_reserve = true; + + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + /* + * This can happen if something re-dirties a block group that + * is already under IO. Just wait for it to finish and then do + * it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide if + * it should update the cache_state. Don't delete until after + * we wait. 
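Distilled, the ordering rule is: wait for in-flight cache IO while the entry is still on its lists, and only unlink it from dirty_list afterwards, under dirty_bgs_lock. A sketch mirroring the code around this comment, not new logic:

	if (!list_empty(&cache->io_list)) {
		list_del_init(&cache->io_list);
		btrfs_wait_cache_io(trans, cache, path);	/* may look at dirty_list */
		btrfs_put_block_group(cache);
	}
	spin_lock(&cur_trans->dirty_bgs_lock);
	list_del_init(&cache->dirty_list);
	spin_unlock(&cur_trans->dirty_bgs_lock);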
+ * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * The cache_write_mutex is protecting the + * io_list, also refer to the definition of + * btrfs_transaction::io_bgs for more details + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + drop_reserve = false; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, ret); + } + } + + /* If it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); + + if (ret) + break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * Go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret < 0) { + btrfs_cleanup_dirty_bgs(cur_trans, fs_info); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. 
These tasks correspond to + endio free space workers started when writeback finishes for a + space cache, which run inode.c:btrfs_finish_ordered_io(), and can + allocate new block groups as a result of COWing nodes of the root + tree when updating the free space inode. The writeback for the space + caches is triggered by an earlier call to + btrfs_start_dirty_block_groups() and iterations of the following + loop. + Also we want to do the cache_save_setup first and then run the + delayed refs to make sure we have the best chance at doing this all + in one shot. + */ + spin_lock(&cur_trans->dirty_bgs_lock); + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + + /* + * This can happen if cache_save_setup re-dirties a block group + * that is already under IO. Just wait for it to finish and + * then do it all again + */ + if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); + } + + /* + * Don't remove from the dirty list until after we've waited on + * any pending IO + */ + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (!ret) + ret = btrfs_run_delayed_refs(trans, + (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * If we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * very rare case so no need for a more efficient and + * complex approach. 
+ */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, path, cache); + } + if (ret) + btrfs_abort_transaction(trans, ret); + } + + /* If it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); + spin_lock(&cur_trans->dirty_bgs_lock); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + + /* + * Refer to the definition of the io_bgs member for details why it's safe + * to use it without any locking + */ + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(trans, cache, path); + btrfs_put_block_group(cache); + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, int alloc) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + int factor; + int ret = 0; + + /* Block accounting for super block */ + spin_lock(&info->delalloc_root_lock); + old_val = btrfs_super_bytes_used(info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(info->super_copy, old_val); + spin_unlock(&info->delalloc_root_lock); + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) { + ret = -ENOENT; + break; + } + factor = btrfs_bg_type_to_factor(cache->flags); + + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. 
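For orientation, a condensed view of the two accounting directions handled in the loop below; locking, the dirty-list handling and the pinned-extent tracking are omitted, so this is illustrative only:

	if (alloc) {
		cache->reserved -= num_bytes;	/* reservation becomes real usage */
		cache->space_info->bytes_reserved -= num_bytes;
		cache->space_info->bytes_used += num_bytes;
	} else {
		cache->pinned += num_bytes;	/* freed space stays pinned ... */
		cache->space_info->bytes_used -= num_bytes;
		/* ... until the transaction commit unpins it */
	}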
+ */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + btrfs_cache_block_group(cache, 1); + + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, + cache->space_info, num_bytes); + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + percpu_counter_add_batch( + &cache->space_info->total_bytes_pinned, + num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->delayed_ref_updates++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* + * No longer have used bytes in this block group, queue it for + * deletion. We do this after adding the block group to the + * dirty list to avoid races between cleaner kthread and space + * cache writeout. + */ + if (!alloc && old_val == 0) + btrfs_mark_bg_unused(cache); + + btrfs_put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + btrfs_update_delayed_refs_rsv(trans); + return ret; +} + +/** + * btrfs_add_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by the allocator when it reserves space. If this is a + * reservation and the block group has become read only we cannot make the + * reservation and return -EAGAIN, otherwise this function always succeeds. 
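A minimal caller sketch; the loop context is hypothetical, but it shows the intended reaction to -EAGAIN, which only means the group became read-only under us:

	ret = btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes, delalloc);
	if (ret == -EAGAIN)
		continue;	/* group went read-only, try the next block group */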
+ */ +int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); + if (delalloc) + cache->delalloc_bytes += num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} + +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ +void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); +} + +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = CHUNK_ALLOC_FORCE; + } + rcu_read_unlock(); +} + +static int should_alloc_chunk(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *sinfo, int force) +{ + u64 bytes_used = btrfs_space_info_used(sinfo, false); + u64 thresh; + + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(fs_info->super_copy); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + + if (sinfo->total_bytes - bytes_used < thresh) + return 1; + } + + if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) + return 0; + return 1; +} + +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) +{ + u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); +} + +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. 
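Given these conventions, a typical caller looks like the following sketch (hypothetical; compare btrfs_inc_block_group_ro() above, where -ENOSPC is tolerated because enough space may already exist at the target profile):

	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
	if (ret == -ENOSPC)
		ret = 0;	/* often acceptable: the profile may already have room */
	if (ret < 0)
		return ret;	/* real error; ret == 1 means a chunk was allocated */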
+ */ +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *space_info; + bool wait_for_alloc = false; + bool should_alloc = false; + int ret = 0; + + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + + do { + spin_lock(&space_info->lock); + if (force < space_info->force_alloc) + force = space_info->force_alloc; + should_alloc = should_alloc_chunk(fs_info, space_info, force); + if (space_info->full) { + /* No more free physical space */ + if (should_alloc) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&space_info->lock); + return ret; + } else if (!should_alloc) { + spin_unlock(&space_info->lock); + return 0; + } else if (space_info->chunk_alloc) { + /* + * Someone is already allocating, so we need to block + * until this someone is finished and then loop to + * recheck if we should continue with our allocation + * attempt. + */ + wait_for_alloc = true; + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); + } else { + /* Proceed with allocation */ + space_info->chunk_alloc = 1; + wait_for_alloc = false; + spin_unlock(&space_info->lock); + } + + cond_resched(); + } while (wait_for_alloc); + + mutex_lock(&fs_info->chunk_mutex); + trans->allocating_chunk = true; + + /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + + /* + * Check if we have enough space in SYSTEM chunk because we may need + * to update devices. + */ + check_system_chunk(trans, flags); + + ret = btrfs_alloc_chunk(trans, flags); + trans->allocating_chunk = false; + + spin_lock(&space_info->lock); + if (ret < 0) { + if (ret == -ENOSPC) + space_info->full = 1; + else + goto out; + } else { + ret = 1; + space_info->max_extent_size = 0; + } + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: + space_info->chunk_alloc = 0; + spin_unlock(&space_info->lock); + mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. 
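One detail of this function worth an aside: earlier in the loop, chunk_mutex is taken and immediately dropped purely as a barrier to wait out a concurrent allocator, because the current holder only releases it once its allocation has finished. In isolation:

	} else if (space_info->chunk_alloc) {	/* someone else is allocating */
		spin_unlock(&space_info->lock);
		mutex_lock(&fs_info->chunk_mutex);	/* sleep until they are done */
		mutex_unlock(&fs_info->chunk_mutex);
		/* then loop and re-check whether we still need a chunk */
	}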
+ */ + if (trans->chunk_bytes_reserved >= (u64)SZ_2M) + btrfs_create_pending_block_groups(trans); + + return ret; +} + +static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +{ + u64 num_dev; + + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; + if (!num_dev) + num_dev = fs_info->fs_devices->rw_devices; + + return num_dev; +} + +/* + * If @is_allocation is true, reserve space in the system space info necessary + * for allocating a chunk, otherwise if it's false, reserve space necessary for + * removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_space_info *info; + u64 left; + u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + spin_lock(&info->lock); + left = info->total_bytes - btrfs_space_info_used(info, true); + spin_unlock(&info->lock); + + num_devs = get_profile_num_devs(fs_info, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); + btrfs_dump_space_info(fs_info, info, 0, 0); + } + + if (left < thresh) { + u64 flags = btrfs_system_alloc_profile(fs_info); + + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(fs_info->chunk_root, + &fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; + } +} + +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + btrfs_wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = btrfs_next_block_group(block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + +/* + * Must be called only after stopping all workers, since we could have block + * group caching kthreads running, and therefore they could race with us if we + * freed the block groups before stopping them. 
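A sketch of the teardown ordering this requires; the excerpt is hypothetical, though the real sequence of this shape lives in close_ctree() in disk-io.c:

	btrfs_stop_all_workers(fs_info);	/* no caching threads past this point */
	btrfs_free_block_groups(fs_info);	/* now safe to tear down the caches */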
+ */ +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; + struct rb_node *n; + + down_write(&info->commit_root_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + btrfs_put_caching_control(caching_ctl); + } + up_write(&info->commit_root_sem); + + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + spin_unlock(&info->block_group_cache_lock); + + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) + btrfs_free_excluded_extents(block_group); + + btrfs_remove_free_space_cache(block_group); + ASSERT(block_group->cached != BTRFS_CACHE_STARTED); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); + btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + + /* + * Now that all the block groups are freed, go through and free all the + * space_info structs. This is only called during the final stages of + * unmount, and so we know nobody is using them. We call + * synchronize_rcu() once before we start, just to be on the safe side. + */ + synchronize_rcu(); + + btrfs_release_global_block_rsv(info); + + while (!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + list_del(&space_info->list); + btrfs_sysfs_remove_space_info(space_info); + } + return 0; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h new file mode 100644 index 0000000000000000000000000000000000000000..c391800388ddaaae8875d7f80ac58406c1b3767a --- /dev/null +++ b/fs/btrfs/block-group.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_GROUP_H +#define BTRFS_BLOCK_GROUP_H + +#include "free-space-cache.h" + +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, +}; + +/* + * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to + * only allocate a chunk if we really need one. 
+ * + * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few + * chunks already allocated. This is used as part of the clustering code to + * help make sure we have a good pool of storage to cluster in, without filling + * the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + */ +enum btrfs_chunk_alloc_enum { + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, +}; + +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_work work; + struct btrfs_block_group_cache *block_group; + u64 progress; + refcount_t count; +}; + +/* Once caching_thread() finds this much free space, it will wake up waiters. */ +#define CACHING_CTL_WAKE_UP SZ_2M + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; + struct inode *inode; + spinlock_t lock; + u64 pinned; + u64 reserved; + u64 delalloc_bytes; + u64 bytes_super; + u64 flags; + u64 cache_generation; + + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; + + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + + /* For raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + + unsigned int ro; + unsigned int iref:1; + unsigned int has_caching_ctl:1; + unsigned int removed:1; + + int disk_cache_state; + + /* Cache tracking stuff */ + int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; + + struct btrfs_space_info *space_info; + + /* Free space cache stuff */ + struct btrfs_free_space_ctl *free_space_ctl; + + /* Block group cache stuff */ + struct rb_node cache_node; + + /* For block groups in the same raid type */ + struct list_head list; + + /* Usage count */ + atomic_t count; + + /* + * List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; + + /* For delayed block group creation or deletion of empty block groups */ + struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; + + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. 
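The protocol for this counter, sketched with the helpers declared later in this header; the surrounding nocow write path is assumed context:

	/* nocow write path */
	if (!btrfs_inc_nocow_writers(fs_info, bytenr))
		goto fall_back_to_cow;	/* group is read-only (or gone) */
	/* ... create the ordered extent for the nocow write ... */
	btrfs_dec_nocow_writers(fs_info, bytenr);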
+ */ + atomic_t nocow_writers; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Does the block group need to be added to the free space tree? + * Protected by free_space_lock. + */ + int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; +}; + +#ifdef CONFIG_BTRFS_DEBUG +static inline int btrfs_should_fragment_free_space( + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + +struct btrfs_block_group_cache *btrfs_lookup_first_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, u64 bytenr); +struct btrfs_block_group_cache *btrfs_next_block_group( + struct btrfs_block_group_cache *cache); +void btrfs_get_block_group(struct btrfs_block_group_cache *cache); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes); +int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache); +int btrfs_cache_block_group(struct btrfs_block_group_cache *cache, + int load_cache_only); +void btrfs_put_caching_control(struct btrfs_caching_control *ctl); +struct btrfs_caching_control *btrfs_get_caching_control( + struct btrfs_block_group_cache *cache); +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + u64 start, u64 end); +struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( + struct btrfs_fs_info *fs_info, + const u64 chunk_offset); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + u64 group_start, struct extent_map *em); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); +int btrfs_read_block_groups(struct btrfs_fs_info *info); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, + u64 type, u64 chunk_offset, u64 size); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); +int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); +int btrfs_update_block_group(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, int alloc); +int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc); +void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force); +int 
btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); +void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +int btrfs_free_block_groups(struct btrfs_fs_info *info); + +static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); +} + +static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); +} + +static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) +{ + return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +} + +static inline int btrfs_block_group_cache_done( + struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; +} + +#endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 698470b9f32d358612d1d27233f6f5ad7517afd3..d07bd41a7c1e01b2b12254802702198606796108 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +#include "misc.h" #include "ctree.h" #include "block-rsv.h" #include "space-info.h" -#include "math.h" #include "transaction.h" static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, @@ -54,8 +54,9 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - num_bytes); + btrfs_space_info_free_bytes_may_use(fs_info, + space_info, + num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -258,6 +259,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; u64 num_bytes; + unsigned min_items; /* * The global block rsv is based on the size of the extent tree, the @@ -267,7 +269,26 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + btrfs_root_used(&fs_info->csum_root->root_item) + btrfs_root_used(&fs_info->tree_root->root_item); - num_bytes = max_t(u64, num_bytes, SZ_16M); + + /* + * We at a minimum are going to modify the csum root, the tree root, and + * the extent root. + */ + min_items = 3; + + /* + * But we also want to reserve enough space so we can do the fallback + * global reserve for an unlink, which is an additional 5 items (see the + * comment in __unlink_start_trans for what we're modifying.) + * + * But we also need space for the delayed ref updates from the unlink, + * so its 10, 5 for the actual operation, and 5 for the delayed ref + * updates. 
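Putting the pieces together: 3 items for the three roots plus the 10 added below gives min_items = 13. Assuming btrfs_calc_insert_metadata_size() reserves nodesize * BTRFS_MAX_LEVEL * 2 bytes per item (its definition in this series), the floor with the default 16KiB nodesize is roughly:

	/* 16384 * 8 (BTRFS_MAX_LEVEL) * 2 * 13 items ~= 3.25MiB */
	u64 floor = 16384ULL * 8 * 2 * 13;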
+ */ + min_items += 10; + + num_bytes = max_t(u64, num_bytes, + btrfs_calc_insert_metadata_size(fs_info, min_items)); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); @@ -275,25 +296,16 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = min_t(u64, num_bytes, SZ_512M); if (block_rsv->reserved < block_rsv->size) { - num_bytes = btrfs_space_info_used(sinfo, true); - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - num_bytes = min(num_bytes, - block_rsv->size - block_rsv->reserved); - block_rsv->reserved += num_bytes; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, - 1); - } + num_bytes = block_rsv->size - block_rsv->reserved; + block_rsv->reserved += num_bytes; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + num_bytes); } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; btrfs_space_info_update_bytes_may_use(fs_info, sinfo, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; + btrfs_try_granting_tickets(fs_info, sinfo); } if (block_rsv->reserved == block_rsv->size) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 81a9731959a92e481c0087c5e1403bb6c9af5692..0b52ab4cb964900b17c2bcb542044902cee87740 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -940,7 +940,7 @@ static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) kfree(sf); } -static int btrfsic_process_metablock( +static noinline_for_stack int btrfsic_process_metablock( struct btrfsic_state *state, struct btrfsic_block *const first_block, struct btrfsic_block_data_ctx *const first_block_ctx, @@ -1706,8 +1706,9 @@ static void btrfsic_dump_database(struct btrfsic_state *state) * Test whether the disk block contains a tree block (leaf or node) * (note that this test fails for the super block) */ -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - char **datav, unsigned int num_pages) +static noinline_for_stack int btrfsic_test_for_metadata( + struct btrfsic_state *state, + char **datav, unsigned int num_pages) { struct btrfs_fs_info *fs_info = state->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 60c47b417a4b6c197b3603654d403eeba86c9824..b05b361e2062337eef9837cb71753a7f0a26ba2d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -18,6 +18,7 @@ #include #include #include +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1039,7 +1040,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, struct list_head *workspace; int ret; - level = btrfs_compress_op[type]->set_level(level); + level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); ret = btrfs_compress_op[type]->compress_pages(workspace, mapping, start, pages, @@ -1611,7 +1612,23 @@ unsigned int btrfs_compress_str2level(unsigned int type, const char *str) level = 0; } - level = btrfs_compress_op[type]->set_level(level); + level = btrfs_compress_set_level(type, level); + + return level; +} + +/* + * Adjust @level according to the limits of the compression algorithm or + * fallback to default + */ +unsigned int btrfs_compress_set_level(int type, unsigned level) +{ + const struct btrfs_compress_op *ops 
= btrfs_compress_op[type]; + + if (level == 0) + level = ops->default_level; + else + level = min(level, ops->max_level); return level; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2035b8eb12904c95cd35792ae54b668069de0a48..4cb8be9ff88b0e19ee77e0f21954e8badbc88d87 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -156,12 +156,9 @@ struct btrfs_compress_op { unsigned long start_byte, size_t srclen, size_t destlen); - /* - * This bounds the level set by the user to be within range of a - * particular compression type. It returns the level that will be used - * if the level is out of bounds or the default if 0 is passed in. - */ - unsigned int (*set_level)(unsigned int level); + /* Maximum level supported by the compression algorithm */ + unsigned int max_level; + unsigned int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -175,6 +172,8 @@ extern const struct btrfs_compress_op btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); +unsigned int btrfs_compress_set_level(int type, unsigned level); + int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5df76c17775ad4650c6deb71617f9e2e5b75f2b1..e59cde204b2fe8322bee2ee07599083e6f210147 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -29,6 +29,28 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); +static const struct btrfs_csums { + u16 size; + const char *name; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, +}; + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ + return btrfs_csums[t].size; +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].name; +} + struct btrfs_path *btrfs_alloc_path(void) { return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); @@ -376,8 +398,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * The 'start address' is the logical address of the *new* root node * for root replace operations, or the logical address of the affected * block for all other operations. - * - * Note: must be called with write lock for fs_info::tree_mod_log_lock. 
*/ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -387,6 +407,8 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node *parent = NULL; struct tree_mod_elem *cur; + lockdep_assert_held_write(&fs_info->tree_mod_log_lock); + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; @@ -1343,6 +1365,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_elem *tm; struct extent_buffer *eb = NULL; struct extent_buffer *eb_root; + u64 eb_root_owner = 0; struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; @@ -1380,6 +1403,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(old); } } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); @@ -1396,7 +1420,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); + btrfs_set_header_owner(eb, eb_root_owner); btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } @@ -1790,8 +1814,8 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). */ -static noinline struct extent_buffer *read_node_slot( - struct extent_buffer *parent, int slot) +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot) { int level = btrfs_header_level(parent); struct extent_buffer *eb; @@ -1860,7 +1884,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; /* promote the child to a root */ - child = read_node_slot(mid, 0); + child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); btrfs_handle_fs_error(fs_info, ret, NULL); @@ -1900,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = read_node_slot(parent, pslot - 1); + left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -1915,7 +1939,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = read_node_slot(parent, pslot + 1); + right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -2075,7 +2099,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = read_node_slot(parent, pslot - 1); + left = btrfs_read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -2127,7 +2151,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = read_node_slot(parent, pslot + 1); + right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -2889,15 +2913,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (!p->skip_locking) { level = btrfs_header_level(b); if (level <= write_lock_level) { - err = btrfs_try_tree_write_lock(b); - if (!err) { + if (!btrfs_try_tree_write_lock(b)) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); } p->locks[level] = BTRFS_WRITE_LOCK; } else { - err = btrfs_tree_read_lock_atomic(b); - if 
(!err) { + if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); } @@ -3031,8 +3053,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, } level = btrfs_header_level(b); - err = btrfs_tree_read_lock_atomic(b); - if (!err) { + if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); } @@ -3572,7 +3593,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) if (!nr) return 0; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, l); start_item = btrfs_item_nr(start); end_item = btrfs_item_nr(end); data_len = btrfs_token_item_offset(l, start_item, &token) + @@ -3630,8 +3651,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path, u32 data_end; u32 this_item_size; - btrfs_init_map_token(&token); - if (empty) nr = 0; else @@ -3704,6 +3723,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path, push_items * sizeof(struct btrfs_item)); /* update the item pointers */ + btrfs_init_map_token(&token, right); right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); @@ -3781,7 +3801,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - right = read_node_slot(upper, slot + 1); + right = btrfs_read_node_slot(upper, slot + 1); /* * slot + 1 is not valid or we fail to read the right node, * no big deal, just return. @@ -3858,8 +3878,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, u32 old_left_item_size; struct btrfs_map_token token; - btrfs_init_map_token(&token); - if (empty) nr = min(right_nritems, max_slot); else @@ -3913,6 +3931,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); + btrfs_init_map_token(&token, left); old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; @@ -3944,6 +3963,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, (btrfs_header_nritems(right) - push_items) * sizeof(struct btrfs_item)); } + + btrfs_init_map_token(&token, right); right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); @@ -4015,7 +4036,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - left = read_node_slot(path->nodes[1], slot - 1); + left = btrfs_read_node_slot(path->nodes[1], slot - 1); /* * slot - 1 is not valid or we fail to read the left node, * no big deal, just return. 
@@ -4074,8 +4095,6 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct btrfs_map_token token; - btrfs_init_map_token(&token); - nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); @@ -4091,6 +4110,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); + btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; @@ -4574,8 +4594,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) int i; struct btrfs_map_token token; - btrfs_init_map_token(&token); - leaf = path->nodes[0]; slot = path->slots[0]; @@ -4597,6 +4615,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(i); @@ -4671,8 +4690,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) int i; struct btrfs_map_token token; - btrfs_init_map_token(&token); - leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -4697,6 +4714,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ + btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(i); @@ -4748,8 +4766,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } btrfs_unlock_up_safe(path, 1); - btrfs_init_map_token(&token); - leaf = path->nodes[0]; slot = path->slots[0]; @@ -4763,6 +4779,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, BUG(); } + btrfs_init_map_token(&token, leaf); if (slot != nritems) { unsigned int old_data = btrfs_item_end_nr(leaf, slot); @@ -4969,9 +4986,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wret; int i; u32 nritems; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); leaf = path->nodes[0]; last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); @@ -4983,12 +4997,14 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { int data_end = leaf_data_end(leaf); + struct btrfs_map_token token; memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, BTRFS_LEAF_DATA_OFFSET + data_end, last_off - data_end); + btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { u32 ioff; @@ -5222,7 +5238,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, goto out; } btrfs_set_path_blocking(path); - cur = read_node_slot(cur, slot); + cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); goto out; @@ -5244,368 +5260,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, return ret; } -static int tree_move_down(struct btrfs_path *path, int *level) -{ - struct extent_buffer *eb; - - BUG_ON(*level == 0); - eb = read_node_slot(path->nodes[*level], path->slots[*level]); - if (IS_ERR(eb)) - return PTR_ERR(eb); - - path->nodes[*level - 1] = eb; - path->slots[*level - 1] = 0; - (*level)--; - return 0; -} - -static int tree_move_next_or_upnext(struct btrfs_path *path, - int *level, int 
root_level) -{ - int ret = 0; - int nritems; - nritems = btrfs_header_nritems(path->nodes[*level]); - - path->slots[*level]++; - - while (path->slots[*level] >= nritems) { - if (*level == root_level) - return -1; - - /* move upnext */ - path->slots[*level] = 0; - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - (*level)++; - path->slots[*level]++; - - nritems = btrfs_header_nritems(path->nodes[*level]); - ret = 1; - } - return ret; -} - -/* - * Returns 1 if it had to move up and next. 0 is returned if it moved only next - * or down. - */ -static int tree_advance(struct btrfs_path *path, - int *level, int root_level, - int allow_down, - struct btrfs_key *key) -{ - int ret; - - if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(path, level, root_level); - } else { - ret = tree_move_down(path, level); - } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } - return ret; -} - -static int tree_compare_item(struct btrfs_path *left_path, - struct btrfs_path *right_path, - char *tmp_buf) -{ - int cmp; - int len1, len2; - unsigned long off1, off2; - - len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); - len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); - if (len1 != len2) - return 1; - - off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); - off2 = btrfs_item_ptr_offset(right_path->nodes[0], - right_path->slots[0]); - - read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); - - cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); - if (cmp) - return 1; - return 0; -} - -#define ADVANCE 1 -#define ADVANCE_ONLY_NEXT -1 - -/* - * This function compares two trees and calls the provided callback for - * every changed/new/deleted item it finds. - * If shared tree blocks are encountered, whole subtrees are skipped, making - * the compare pretty fast on snapshotted subvolumes. - * - * This currently works on commit roots only. As commit roots are read only, - * we don't do any locking. The commit roots are protected with transactions. - * Transactions are ended and rejoined when a commit is tried in between. - * - * This function checks for modifications done to the trees while comparing. - * If it detects a change, it aborts immediately. - */ -int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) -{ - struct btrfs_fs_info *fs_info = left_root->fs_info; - int ret; - int cmp; - struct btrfs_path *left_path = NULL; - struct btrfs_path *right_path = NULL; - struct btrfs_key left_key; - struct btrfs_key right_key; - char *tmp_buf = NULL; - int left_root_level; - int right_root_level; - int left_level; - int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; - u64 left_blockptr; - u64 right_blockptr; - u64 left_gen; - u64 right_gen; - - left_path = btrfs_alloc_path(); - if (!left_path) { - ret = -ENOMEM; - goto out; - } - right_path = btrfs_alloc_path(); - if (!right_path) { - ret = -ENOMEM; - goto out; - } - - tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } - - left_path->search_commit_root = 1; - left_path->skip_locking = 1; - right_path->search_commit_root = 1; - right_path->skip_locking = 1; - - /* - * Strategy: Go to the first items of both trees. 
Then do - * - * If both trees are at level 0 - * Compare keys of current items - * If left < right treat left item as new, advance left tree - * and repeat - * If left > right treat right item as deleted, advance right tree - * and repeat - * If left == right do deep compare of items, treat as changed if - * needed, advance both trees and repeat - * If both trees are at the same level but not at level 0 - * Compare keys of current nodes/leafs - * If left < right advance left tree and repeat - * If left > right advance right tree and repeat - * If left == right compare blockptrs of the next nodes/leafs - * If they match advance both trees but stay at the same level - * and repeat - * If they don't match advance both trees while allowing to go - * deeper and repeat - * If tree levels are different - * Advance the tree that needs it and repeat - * - * Advancing a tree means: - * If we are at level 0, try to go to the next slot. If that's not - * possible, go one level up and repeat. Stop when we found a level - * where we could go to the next slot. We may at this point be on a - * node or a leaf. - * - * If we are not at level 0 and not on shared tree blocks, go one - * level deeper. - * - * If we are not at level 0 and on shared tree blocks, go one slot to - * the right if possible or go up and right. - */ - - down_read(&fs_info->commit_root_sem); - left_level = btrfs_header_level(left_root->commit_root); - left_root_level = left_level; - left_path->nodes[left_level] = - btrfs_clone_extent_buffer(left_root->commit_root); - if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); - ret = -ENOMEM; - goto out; - } - - right_level = btrfs_header_level(right_root->commit_root); - right_root_level = right_level; - right_path->nodes[right_level] = - btrfs_clone_extent_buffer(right_root->commit_root); - if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); - ret = -ENOMEM; - goto out; - } - up_read(&fs_info->commit_root_sem); - - if (left_level == 0) - btrfs_item_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - else - btrfs_node_key_to_cpu(left_path->nodes[left_level], - &left_key, left_path->slots[left_level]); - if (right_level == 0) - btrfs_item_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - else - btrfs_node_key_to_cpu(right_path->nodes[right_level], - &right_key, right_path->slots[right_level]); - - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; - - while (1) { - if (advance_left && !left_end_reached) { - ret = tree_advance(left_path, &left_level, - left_root_level, - advance_left != ADVANCE_ONLY_NEXT, - &left_key); - if (ret == -1) - left_end_reached = ADVANCE; - else if (ret < 0) - goto out; - advance_left = 0; - } - if (advance_right && !right_end_reached) { - ret = tree_advance(right_path, &right_level, - right_root_level, - advance_right != ADVANCE_ONLY_NEXT, - &right_key); - if (ret == -1) - right_end_reached = ADVANCE; - else if (ret < 0) - goto out; - advance_right = 0; - } - - if (left_end_reached && right_end_reached) { - ret = 0; - goto out; - } else if (left_end_reached) { - if (right_level == 0) { - ret = changed_cb(left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - } - advance_right = ADVANCE; - continue; - } else if (right_end_reached) { - if (left_level == 0) { - ret = changed_cb(left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; 
- } - advance_left = ADVANCE; - continue; - } - - if (left_level == 0 && right_level == 0) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - ret = changed_cb(left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_NEW, - ctx); - if (ret < 0) - goto out; - advance_left = ADVANCE; - } else if (cmp > 0) { - ret = changed_cb(left_path, right_path, - &right_key, - BTRFS_COMPARE_TREE_DELETED, - ctx); - if (ret < 0) - goto out; - advance_right = ADVANCE; - } else { - enum btrfs_compare_tree_result result; - - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = tree_compare_item(left_path, right_path, - tmp_buf); - if (ret) - result = BTRFS_COMPARE_TREE_CHANGED; - else - result = BTRFS_COMPARE_TREE_SAME; - ret = changed_cb(left_path, right_path, - &left_key, result, ctx); - if (ret < 0) - goto out; - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } else if (left_level == right_level) { - cmp = btrfs_comp_cpu_keys(&left_key, &right_key); - if (cmp < 0) { - advance_left = ADVANCE; - } else if (cmp > 0) { - advance_right = ADVANCE; - } else { - left_blockptr = btrfs_node_blockptr( - left_path->nodes[left_level], - left_path->slots[left_level]); - right_blockptr = btrfs_node_blockptr( - right_path->nodes[right_level], - right_path->slots[right_level]); - left_gen = btrfs_node_ptr_generation( - left_path->nodes[left_level], - left_path->slots[left_level]); - right_gen = btrfs_node_ptr_generation( - right_path->nodes[right_level], - right_path->slots[right_level]); - if (left_blockptr == right_blockptr && - left_gen == right_gen) { - /* - * As we're on a shared block, don't - * allow to go deeper. - */ - advance_left = ADVANCE_ONLY_NEXT; - advance_right = ADVANCE_ONLY_NEXT; - } else { - advance_left = ADVANCE; - advance_right = ADVANCE; - } - } - } else if (left_level < right_level) { - advance_right = ADVANCE; - } else { - advance_left = ADVANCE; - } - } - -out: - btrfs_free_path(left_path); - btrfs_free_path(right_path); - kvfree(tmp_buf); - return ret; -} - /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. 
It looks for and returns the next key in the @@ -5623,7 +5277,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, int slot; struct extent_buffer *c; - WARN_ON(!path->keep_locks); + WARN_ON(!path->keep_locks && !path->skip_locking); while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) return 1; @@ -5639,7 +5293,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, !path->nodes[level + 1]) return 1; - if (path->locks[level + 1]) { + if (path->locks[level + 1] || path->skip_locking) { level++; continue; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 94660063a1628989108b5e15c5b1682e616b2dd3..19d669d12ca15226237e698630d7514b65004a88 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -39,10 +38,12 @@ struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; +struct btrfs_block_group_cache; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; +extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; @@ -82,10 +83,6 @@ struct btrfs_ref; */ #define BTRFS_LINK_MAX 65535U -/* four bytes for CRC32 */ -static const int btrfs_csum_sizes[] = { 4 }; -static const char *btrfs_csum_names[] = { "crc32c" }; - #define BTRFS_EMPTY_DIR_SIZE 0 /* ioprio of readahead is set to idle */ @@ -397,12 +394,6 @@ struct btrfs_dev_replace { wait_queue_head_t replace_wait; }; -/* For raid type sysfs entries */ -struct raid_kobject { - u64 flags; - struct kobject kobj; -}; - /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata @@ -439,40 +430,6 @@ enum btrfs_caching_type { BTRFS_CACHE_ERROR, }; -enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN, - BTRFS_DC_ERROR, - BTRFS_DC_CLEAR, - BTRFS_DC_SETUP, -}; - -struct btrfs_caching_control { - struct list_head list; - struct mutex mutex; - wait_queue_head_t wait; - struct btrfs_work work; - struct btrfs_block_group_cache *block_group; - u64 progress; - refcount_t count; -}; - -/* Once caching_thread() finds this much free space, it will wake up waiters. */ -#define CACHING_CTL_WAKE_UP SZ_2M - -struct btrfs_io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_fs_info *fs_info; - struct inode *inode; - unsigned long size; - int index; - int num_pages; - int entries; - int bitmaps; - unsigned check_crcs:1; -}; - /* * Tree to record all locked full stripes of a RAID5/6 block group */ @@ -481,120 +438,6 @@ struct btrfs_full_stripe_locks_tree { struct mutex lock; }; -struct btrfs_block_group_cache { - struct btrfs_key key; - struct btrfs_block_group_item item; - struct btrfs_fs_info *fs_info; - struct inode *inode; - spinlock_t lock; - u64 pinned; - u64 reserved; - u64 delalloc_bytes; - u64 bytes_super; - u64 flags; - u64 cache_generation; - - /* - * If the free space extent count exceeds this number, convert the block - * group to bitmaps. - */ - u32 bitmap_high_thresh; - - /* - * If the free space extent count drops below this number, convert the - * block group back to extents. 
- */ - u32 bitmap_low_thresh; - - /* - * It is just used for the delayed data space allocation because - * only the data space allocation and the relative metadata update - * can be done cross the transaction. - */ - struct rw_semaphore data_rwsem; - - /* for raid56, this is a full stripe, without parity */ - unsigned long full_stripe_len; - - unsigned int ro; - unsigned int iref:1; - unsigned int has_caching_ctl:1; - unsigned int removed:1; - - int disk_cache_state; - - /* cache tracking stuff */ - int cached; - struct btrfs_caching_control *caching_ctl; - u64 last_byte_to_unpin; - - struct btrfs_space_info *space_info; - - /* free space cache stuff */ - struct btrfs_free_space_ctl *free_space_ctl; - - /* block group cache stuff */ - struct rb_node cache_node; - - /* for block groups in the same raid type */ - struct list_head list; - - /* usage count */ - atomic_t count; - - /* List of struct btrfs_free_clusters for this block group. - * Today it will only have one thing on it, but that may change - */ - struct list_head cluster_list; - - /* For delayed block group creation or deletion of empty block groups */ - struct list_head bg_list; - - /* For read-only block groups */ - struct list_head ro_list; - - atomic_t trimming; - - /* For dirty block groups */ - struct list_head dirty_list; - struct list_head io_list; - - struct btrfs_io_ctl io_ctl; - - /* - * Incremented when doing extent allocations and holding a read lock - * on the space_info's groups_sem semaphore. - * Decremented when an ordered extent that represents an IO against this - * block group's range is created (after it's added to its inode's - * root's list of ordered extents) or immediately after the allocation - * if it's a metadata extent or fallocate extent (for these cases we - * don't create ordered extents). - */ - atomic_t reservations; - - /* - * Incremented while holding the spinlock *lock* by a task checking if - * it can perform a nocow write (incremented if the value for the *ro* - * field is 0). Decremented by such tasks once they create an ordered - * extent or before that if some error happens before reaching that step. - * This is to prevent races between block group relocation and nocow - * writes through direct IO. - */ - atomic_t nocow_writers; - - /* Lock for free space tree operations. */ - struct mutex free_space_lock; - - /* - * Does the block group need to be added to the free space tree? - * Protected by free_space_lock. 
- */ - int needs_free_space; - - /* Record locked full stripes for RAID5/6 block group */ - struct btrfs_full_stripe_locks_tree full_stripe_locks_root; -}; - /* delayed seq elem */ struct seq_list { struct list_head list; @@ -610,22 +453,6 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -/* used by the raid56 code to lock stripes for read/modify/write */ -struct btrfs_stripe_hash { - struct list_head hash_list; - spinlock_t lock; -}; - -/* used by the raid56 code to lock stripes for read/modify/write */ -struct btrfs_stripe_hash_table { - struct list_head stripe_cache; - spinlock_t cache_lock; - int cache_size; - struct btrfs_stripe_hash table[]; -}; - -#define BTRFS_STRIPE_HASH_TABLE_BITS 11 - void btrfs_init_async_reclaim_work(struct work_struct *work); /* fs_info */ @@ -1279,6 +1106,16 @@ struct btrfs_root { #endif }; +struct btrfs_clone_extent_info { + u64 disk_offset; + u64 disk_len; + u64 data_offset; + u64 data_len; + u64 file_offset; + char *extent_buf; + u32 item_size; +}; + struct btrfs_file_private { void *filldir_buf; }; @@ -1377,19 +1214,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) btrfs_clear_opt(fs_info->mount_opt, opt); \ } -#ifdef CONFIG_BTRFS_DEBUG -static inline int -btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} -#endif - /* * Requests for changes that need to be done during transaction commit. * @@ -1475,8 +1299,10 @@ struct btrfs_map_token { #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sb->s_blocksize_bits) -static inline void btrfs_init_map_token (struct btrfs_map_token *token) +static inline void btrfs_init_map_token(struct btrfs_map_token *token, + struct extent_buffer *eb) { + token->eb = eb; token->kaddr = NULL; } @@ -1507,17 +1333,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \ unsigned long off, u##bits val, \ struct btrfs_map_token *token); \ -static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, \ - unsigned long off) \ -{ \ - return btrfs_get_token_##bits(eb, ptr, off, NULL); \ -} \ -static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\ - unsigned long off, u##bits val) \ -{ \ - btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ -} +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); DECLARE_BTRFS_SETGET_BITS(8) DECLARE_BTRFS_SETGET_BITS(16) @@ -2059,16 +1878,6 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, btrfs_disk_key_to_cpu(key, &disk_key); } -static inline u8 btrfs_key_type(const struct btrfs_key *key) -{ - return key->type; -} - -static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) -{ - key->type = val; -} - /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, @@ -2354,20 +2163,8 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct 
btrfs_super_block, uuid_tree_generation, 64); -static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_sizes[t]; -} - -static inline const char *btrfs_super_csum_name(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csum_names[csum_type]; -} +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); /* * The leaf data grows from end-to-front in the node. @@ -2440,30 +2237,6 @@ static inline u32 btrfs_file_extent_inline_item_len( return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } -/* btrfs_dev_stats_item */ -static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb, - const struct btrfs_dev_stats_item *ptr, - int index) -{ - u64 val; - - read_extent_buffer(eb, &val, - offsetof(struct btrfs_dev_stats_item, values) + - ((unsigned long)ptr) + (index * sizeof(u64)), - sizeof(val)); - return val; -} - -static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, - struct btrfs_dev_stats_item *ptr, - int index, u64 val) -{ - write_extent_buffer(eb, &val, - offsetof(struct btrfs_dev_stats_item, values) + - ((unsigned long)ptr) + (index * sizeof(u64)), - sizeof(val)); -} - /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); @@ -2600,32 +2373,33 @@ enum btrfs_inline_ref_type { int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* - * Doing a truncate won't result in new nodes or leaves, just what we need for - * COW. + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. 
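+ * As a rough worked example (assuming the default 16KiB nodesize and + * BTRFS_MAX_LEVEL of 8), this variant reserves 16KiB * 8 = 128KiB per item, + * while the insert variant above doubles that to 256KiB per item to cover + * possible node splits while cowing down the tree.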
*/ -static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, - const u64 start); -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, @@ -2642,11 +2416,6 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr); -void btrfs_get_block_group(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2685,28 +2454,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_free_block_groups(struct btrfs_fs_info *info); -int btrfs_read_block_groups(struct btrfs_fs_info *info); -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 bytes_used, u64 type, u64 chunk_offset, - u64 size); -struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( - struct btrfs_fs_info *fs_info, - const u64 chunk_offset); -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em); -void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); -u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); -u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); -u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2717,6 +2467,7 @@ enum btrfs_reserve_flush_enum { * case, use FLUSH LIMIT */ BTRFS_RESERVE_FLUSH_LIMIT, + BTRFS_RESERVE_FLUSH_EVICT, BTRFS_RESERVE_FLUSH_ALL, }; @@ -2729,31 +2480,10 @@ enum btrfs_flush_state { FLUSH_DELALLOC_WAIT = 6, ALLOC_CHUNK = 7, ALLOC_CHUNK_FORCE = 8, - COMMIT_TRANS = 9, -}; - -/* - * control flags for do_chunk_alloc's force 
field - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk - * if we really need one. - * - * CHUNK_ALLOC_LIMITED means to only try and allocate one - * if we have very few chunks already allocated. This is - * used as part of the clustering code to help make sure - * we have a good pool of storage to cluster in, without - * filling the FS with empty chunks - * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * - */ -enum btrfs_chunk_alloc_enum { - CHUNK_ALLOC_NO_FORCE, - CHUNK_ALLOC_LIMITED, - CHUNK_ALLOC_FORCE, + RUN_DELAYED_IPUTS = 9, + COMMIT_TRANS = 10, }; -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - enum btrfs_chunk_alloc_enum force); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -2763,15 +2493,11 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); -void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); @@ -2780,10 +2506,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); -void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - u64 start, u64 end); -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, @@ -2806,20 +2528,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); -enum btrfs_compare_tree_result { - BTRFS_COMPARE_TREE_NEW, - BTRFS_COMPARE_TREE_DELETED, - BTRFS_COMPARE_TREE_CHANGED, - BTRFS_COMPARE_TREE_SAME, -}; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); -int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t cb, void *ctx); +struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + int slot); + int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -3068,14 +2779,12 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const char *name, - int name_len, struct btrfs_inode_ref **ref_ret); -int btrfs_find_name_in_ext_backref(struct 
extent_buffer *leaf, int slot, - u64 ref_objectid, const char *name, - int name_len, - struct btrfs_inode_extref **extref_ret); - +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len); +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -3137,7 +2846,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int dedupe); + struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, @@ -3233,6 +2942,10 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); +int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, + const u64 start, const u64 end, + struct btrfs_clone_extent_info *clone_info, + struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -3248,12 +2961,6 @@ loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); -/* sysfs.c */ -int __init btrfs_init_sysfs(void); -void __cold btrfs_exit_sysfs(void); -int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); @@ -3722,26 +3429,4 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif -static inline void cond_wake_up(struct wait_queue_head *wq) -{ - /* - * This implies a full smp_mb barrier, see comments for - * waitqueue_active why. - */ - if (wq_has_sleeper(wq)) - wake_up(wq); -} - -static inline void cond_wake_up_nomb(struct wait_queue_head *wq) -{ - /* - * Special case for conditional wakeup where the barrier required for - * waitqueue_active is implied by some of the preceding code. Eg. one - * of such atomic operations (atomic_dec_and_return, ...), or a - * unlock/lock sequence, etc. - */ - if (waitqueue_active(wq)) - wake_up(wq); -} - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h deleted file mode 100644 index 90281a7a35a81849656979a03071e3533a1a6362..0000000000000000000000000000000000000000 --- a/fs/btrfs/dedupe.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2016 Fujitsu. All rights reserved. 
- */ - -#ifndef BTRFS_DEDUPE_H -#define BTRFS_DEDUPE_H - -/* later in-band dedupe will expand this struct */ -struct btrfs_dedupe_hash; - -#endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 17f7c0d3876850087c27f43fbec84aa02a559ab1..d949d7d2abed8489baa5318f5e56a0d1462fb5bd 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -7,6 +7,7 @@ #include "space-info.h" #include "transaction.h" #include "qgroup.h" +#include "block-group.h" int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { @@ -129,8 +130,6 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) return -ENOSPC; } btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -182,8 +181,6 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } @@ -254,13 +251,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, lockdep_assert_held(&inode->lock); outstanding_extents = inode->outstanding_extents; - if (outstanding_extents) - reserve_size = btrfs_calc_trans_metadata_size(fs_info, - outstanding_extents + 1); + + /* + * Insert size for the number of outstanding extents, 1 normal size for + * updating the inode. + */ + if (outstanding_extents) { + reserve_size = btrfs_calc_insert_metadata_size(fs_info, + outstanding_extents); + reserve_size += btrfs_calc_metadata_size(fs_info, 1); + } csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - reserve_size += btrfs_calc_trans_metadata_size(fs_info, - csum_leaves); + reserve_size += btrfs_calc_insert_metadata_size(fs_info, + csum_leaves); /* * For qgroup rsv, the calculation is very simple: * account one nodesize for each outstanding extent @@ -281,10 +285,16 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, { u64 nr_extents = count_max_extents(num_bytes); u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); - /* We add one for the inode update at finish ordered time */ - *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, - nr_extents + csum_leaves + 1); + *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, + nr_extents + csum_leaves); + + /* + * finish_ordered_io has to update the inode, so add the space required + * for an inode update. 
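+ * The total is therefore insert space for nr_extents + csum_leaves items + * plus one plain metadata unit for the inode item itself, e.g. a small + * buffered write ends up reserving insert space for one extent item and + * one csum leaf plus this single inode update.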
+ */ + *meta_reserve += inode_update; *qgroup_reserve = nr_extents * fs_info->nodesize; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 43fdb2992956a0954734db87b9e0a99ba4cbb1a8..1f7f39b10bd00ac45fc74c726852329122a31571 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include #include +#include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" @@ -474,6 +475,9 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; + /* Not associated with any delayed_node */ + if (!delayed_item->delayed_node) + return; delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); @@ -555,7 +559,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); /* * Here we migrate space rsv from transaction rsv, since have already @@ -609,7 +613,7 @@ static int btrfs_delayed_inode_reserve_metadata( src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_metadata_size(fs_info, 1); /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction @@ -1525,7 +1529,12 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. */ - BUG_ON(ret); + if (ret < 0) { + btrfs_err(trans->fs_info, +"metadata reservation failed for delayed dir item deletion, should have been reserved"); + btrfs_release_delayed_item(item); + goto end; + } mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); @@ -1534,7 +1543,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", index, node->root->root_key.objectid, node->inode_id, ret); - BUG(); + btrfs_delayed_item_release_metadata(dir->root, item); + btrfs_release_delayed_item(item); } mutex_unlock(&node->mutex); end: diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9a91d1eb0af4418c2d5e0ca85eb8b17d15cdd46e..df3bd880061d4ed19a215ca2b77df9365a2bdea0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -79,7 +79,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); u64 released = 0; released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, @@ -105,8 +105,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) if (!trans->delayed_ref_updates) return; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, - trans->delayed_ref_updates); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, + trans->delayed_ref_updates); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->full = 0; @@ -158,7 +158,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, 
"delayed_refs_rsv", 0, num_bytes, 1); if (to_free) - btrfs_space_info_add_old_bytes(fs_info, + btrfs_space_info_free_bytes_may_use(fs_info, delayed_refs_rsv->space_info, to_free); } @@ -174,7 +174,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); u64 num_bytes = 0; int ret = -ENOSPC; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 6b2e9aa83ffaa5643d5e9d451b63792f79851593..48890826b5e666d0d358495ff84bf72c9e1a55c2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -9,6 +9,7 @@ #include #include #include +#include "misc.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -56,7 +57,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) no_valid_dev_replace_entry_found: ret = 0; dev_replace->replace_state = - BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; + BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; dev_replace->time_started = 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 97beb351a10cdc0117bc41b5dade4d01eb916093..044981cf6df9149cd28d0f3c89cbe05b3be39e12 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -40,6 +40,7 @@ #include "compression.h" #include "tree-checker.h" #include "ref-verify.h" +#include "block-group.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -416,6 +417,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, */ if (btrfs_header_generation(eb) > fs_info->last_trans_committed) return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else @@ -1037,35 +1048,6 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) free_extent_buffer(buf); } -int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(buf)) - return 0; - - set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - - ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); - if (ret) { - free_extent_buffer_stale(buf); - return ret; - } - - if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer_stale(buf); - return -EIO; - } else if (extent_buffer_uptodate(buf)) { - *eb = buf; - } else { - free_extent_buffer(buf); - } - return 0; -} - struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e80f7c45a3072bacd3f2480d1e15c3599565bb85..a6958103d87e74fc190770056e181940121e5185 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -45,8 +45,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key); void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); -int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct 
extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8b7eb22d508a597b3bcc97f018c096f6d251e715..49cb26fa7c6393321989b6f0346ae598cbe466df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include @@ -17,6 +16,7 @@ #include #include #include +#include "misc.h" #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -25,13 +25,13 @@ #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "math.h" #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" #include "delalloc-space.h" +#include "block-group.h" #undef SCRAMBLE_DELAYED_REFS @@ -54,132 +54,13 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); -static noinline int -block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED || - cache->cached == BTRFS_CACHE_ERROR; -} - static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; } -void btrfs_get_block_group(struct btrfs_block_group_cache *cache) -{ - atomic_inc(&cache->count); -} - -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) { - WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); - - /* - * If not empty, someone is still holding mutex of - * full_stripe_lock, which can only be released by caller. - * And it will definitely cause use-after-free when caller - * tries to release full stripe lock. - * - * No better way to resolve, but only to warn. 
- */ - WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); - kfree(cache->free_space_ctl); - kfree(cache); - } -} - -/* - * this adds the block group to the fs_info rb tree for the block group - * cache - */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group_cache *block_group) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group_cache *cache; - - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group_cache, - cache_node); - if (block_group->key.objectid < cache->key.objectid) { - p = &(*p)->rb_left; - } else if (block_group->key.objectid > cache->key.objectid) { - p = &(*p)->rb_right; - } else { - spin_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); - - if (info->first_logical_byte > block_group->key.objectid) - info->first_logical_byte = block_group->key.objectid; - - spin_unlock(&info->block_group_cache_lock); - - return 0; -} - -/* - * This will return the block group at or after bytenr if contains is 0, else - * it will return the block group that contains the bytenr - */ -static struct btrfs_block_group_cache * -block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, - int contains) -{ - struct btrfs_block_group_cache *cache, *ret = NULL; - struct rb_node *n; - u64 end, start; - - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; - - while (n) { - cache = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - end = cache->key.objectid + cache->key.offset - 1; - start = cache->key.objectid; - - if (bytenr < start) { - if (!contains && (!ret || start < ret->key.objectid)) - ret = cache; - n = n->rb_left; - } else if (bytenr > start) { - if (contains && bytenr <= end) { - ret = cache; - break; - } - n = n->rb_right; - } else { - ret = cache; - break; - } - } - if (ret) { - btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) - info->first_logical_byte = ret->key.objectid; - } - spin_unlock(&info->block_group_cache_lock); - - return ret; -} - -static int add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; set_extent_bits(&fs_info->freed_extents[0], @@ -189,7 +70,7 @@ static int add_excluded_extent(struct btrfs_fs_info *fs_info, return 0; } -static void free_excluded_extents(struct btrfs_block_group_cache *cache) +void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; u64 start, end; @@ -203,8649 +84,5429 @@ static void free_excluded_extents(struct btrfs_block_group_cache *cache) start, end, EXTENT_UPTODATE); } -static int exclude_super_stripes(struct btrfs_block_group_cache *cache) +static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) { - struct btrfs_fs_info *fs_info = cache->fs_info; - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { - stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; - cache->bytes_super += stripe_len; - ret = add_excluded_extent(fs_info, cache->key.objectid, - stripe_len); - if (ret) - return ret; - } - - for (i = 0; i < 
BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->key.objectid, - bytenr, &logical, &nr, &stripe_len); - if (ret) - return ret; - - while (nr--) { - u64 start, len; - - if (logical[nr] > cache->key.objectid + - cache->key.offset) - continue; - - if (logical[nr] + stripe_len <= cache->key.objectid) - continue; - - start = logical[nr]; - if (start < cache->key.objectid) { - start = cache->key.objectid; - len = (logical[nr] + stripe_len) - start; - } else { - len = min_t(u64, stripe_len, - cache->key.objectid + - cache->key.offset - start); - } - - cache->bytes_super += len; - ret = add_excluded_extent(fs_info, start, len); - if (ret) { - kfree(logical); - return ret; - } - } - - kfree(logical); + if (ref->type == BTRFS_REF_METADATA) { + if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) + return BTRFS_BLOCK_GROUP_SYSTEM; + else + return BTRFS_BLOCK_GROUP_METADATA; } - return 0; + return BTRFS_BLOCK_GROUP_DATA; } -static struct btrfs_caching_control * -get_caching_control(struct btrfs_block_group_cache *cache) +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { - struct btrfs_caching_control *ctl; - - spin_lock(&cache->lock); - if (!cache->caching_ctl) { - spin_unlock(&cache->lock); - return NULL; - } + struct btrfs_space_info *space_info; + u64 flags = generic_ref_to_space_flags(ref); - ctl = cache->caching_ctl; - refcount_inc(&ctl->count); - spin_unlock(&cache->lock); - return ctl; + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, + BTRFS_TOTAL_BYTES_PINNED_BATCH); } -static void put_caching_control(struct btrfs_caching_control *ctl) +static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { - if (refcount_dec_and_test(&ctl->count)) - kfree(ctl); -} + struct btrfs_space_info *space_info; + u64 flags = generic_ref_to_space_flags(ref); -#ifdef CONFIG_BTRFS_DEBUG -static void fragment_free_space(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 start = block_group->key.objectid; - u64 len = block_group->key.offset; - u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? - fs_info->nodesize : fs_info->sectorsize; - u64 step = chunk << 1; - - while (len > chunk) { - btrfs_remove_free_space(block_group, start, chunk); - start += step; - if (len < step) - len = 0; - else - len -= step; - } + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, + BTRFS_TOTAL_BYTES_PINNED_BATCH); } -#endif -/* - * this is only called by cache_block_group, since we could have freed extents - * we need to check the pinned_extents for any extents that can't be used yet - * since their free space will be released as soon as the transaction commits. 
- */ -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - u64 start, u64 end) +/* simple helper to search for an existing data extent at a given offset */ +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { - struct btrfs_fs_info *info = block_group->fs_info; - u64 extent_start, extent_end, size, total_added = 0; int ret; + struct btrfs_key key; + struct btrfs_path *path; - while (start < end) { - ret = find_first_extent_bit(info->pinned_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL); - if (ret) - break; - - if (extent_start <= start) { - start = extent_end + 1; - } else if (extent_start > start && extent_start < end) { - size = extent_start - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, - size); - BUG_ON(ret); /* -ENOMEM or logic error */ - start = extent_end + 1; - } else { - break; - } - } - - if (start < end) { - size = end - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ - } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; - return total_added; + key.objectid = start; + key.offset = len; + key.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + btrfs_free_path(path); + return ret; } -static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) +/* + * helper function to lookup reference count and flags of a tree block. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 *flags) { - struct btrfs_block_group_cache *block_group = caching_ctl->block_group; - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; + struct btrfs_extent_item *ei; struct extent_buffer *leaf; struct btrfs_key key; - u64 total_found = 0; - u64 last = 0; - u32 nritems; + u32 item_size; + u64 num_refs; + u64 extent_flags; int ret; - bool wakeup = true; + + /* + * If we don't have skinny metadata, don't bother doing anything + * different + */ + if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { + offset = fs_info->nodesize; + metadata = 0; + } path = btrfs_alloc_path(); if (!path) return -ENOMEM; - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); - -#ifdef CONFIG_BTRFS_DEBUG - /* - * If we're fragmenting we don't want to make anybody think we can - * allocate from this block group until we've had a chance to fragment - * the free space. - */ - if (btrfs_should_fragment_free_space(block_group)) - wakeup = false; -#endif - /* - * We don't want to deadlock with somebody trying to allocate a new - * extent for the extent root while also trying to search the extent - * root to add free space. 
So we skip locking and search the commit - * root, since its read-only - */ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = READA_FORWARD; + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } - key.objectid = last; - key.offset = 0; - key.type = BTRFS_EXTENT_ITEM_KEY; +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; -next: - ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); + goto out_free; - while (1) { - if (btrfs_fs_closing(fs_info) > 1) { - last = (u64)-1; - break; + if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == fs_info->nodesize) + ret = 0; } + } - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); } else { - ret = find_next_key(path, 0, &key); - if (ret) - break; - - if (need_resched() || - rwsem_is_contended(&fs_info->commit_root_sem)) { - if (wakeup) - caching_ctl->progress = last; - btrfs_release_path(path); - up_read(&fs_info->commit_root_sem); - mutex_unlock(&caching_ctl->mutex); - cond_resched(); - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - goto next; - } - - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - if (key.objectid < last) { - key.objectid = last; - key.offset = 0; - key.type = BTRFS_EXTENT_ITEM_KEY; + ret = -EINVAL; + btrfs_print_v0_err(fs_info); + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(fs_info, ret, NULL); - if (wakeup) - caching_ctl->progress = last; - btrfs_release_path(path); - goto next; + goto out_free; } - if (key.objectid < block_group->key.objectid) { - path->slots[0]++; - continue; - } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; + if (!trans) + goto out; - if (key.type == BTRFS_EXTENT_ITEM_KEY || - key.type == BTRFS_METADATA_ITEM_KEY) { - total_found += add_new_free_space(block_group, last, - key.objectid); - if (key.type == BTRFS_METADATA_ITEM_KEY) - last = key.objectid + - fs_info->nodesize; - else - last = key.objectid + key.offset; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); - if (total_found > CACHING_CTL_WAKE_UP) { - total_found = 0; - if (wakeup) - wake_up(&caching_ctl->wait); - } - } - path->slots[0]++; - } - ret = 0; + btrfs_release_path(path); - total_found += add_new_free_space(block_group, last, - block_group->key.objectid + - 
block_group->key.offset); - caching_ctl->progress = (u64)-1; + /* + * Mutex was contended, block until it's released and try + * again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto search_again; + } + spin_lock(&head->lock); + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + num_refs += head->ref_mod; + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: btrfs_free_path(path); return ret; } -static noinline void caching_thread(struct btrfs_work *work) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; - int ret; - - caching_ctl = container_of(work, struct btrfs_caching_control, work); - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; - - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); - - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) - ret = load_free_space_tree(caching_ctl); - else - ret = load_extent_tree_free(caching_ctl); - - spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; - spin_unlock(&block_group->lock); +/* + * Back reference rules. Back refs have three main goals: + * + * 1) Differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * There are two kinds of back refs. Implicit back refs are optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. This information allows us to find the block by + * b-tree searching. Full back refs are for pointers in tree blocks not + * referenced by their owner trees; the location of the tree block is + * recorded in the back ref. Full back refs are actually generic and can be + * used in all the cases where implicit back refs are used. Their major + * shortcoming is overhead: every time a tree block gets + * COWed, we have to update the back ref entries for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in an old transaction, the + * only way to drop a reference to it is to COW it. So we can detect the + * event that a tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COWed through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs are used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointer in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs are used for + * pointers in the block. Add full back refs for every pointer in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are carried over to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * the key type is used to differentiate between types of back refs, and + * there are different meanings of the key offset for different types + * of back refs. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure for the implicit back refs has fields for: + * + * - objectid of the subvolume root + * - objectid of the file holding the reference + * - original offset in the file + * - how many bookend extents + * + * The key offset for the implicit back refs is the hash of the first + * three fields. + * + * The extent ref structure for the full back refs has a field for: + * + * - number of pointers in the tree leaf + * + * The key offset for the full back refs is the first byte of + * the tree leaf. (See the userspace sketch below for the two data ref + * item layouts.) + * + * When a file extent is allocated, the implicit back refs are used and + * the fields are filled in: + * + * (root_key.objectid, inode objectid, offset in file, 1) + * + * When a file extent is removed during file truncation, we find the + * corresponding implicit back refs and check the following fields: + * + * (btrfs_header_owner(leaf), inode objectid, offset in file) + * + * Btree extents can be referenced by: + * + * - different subvolumes + * + * Both the implicit back refs and the full back refs for tree blocks + * only consist of a key. The key offset for the implicit back refs is the + * objectid of the block's owner tree. The key offset for the full back refs + * is the first byte of the parent block. + * + * When implicit back refs are used, information about the lowest key and + * level of the tree block is required. This information is stored in the + * tree block info structure. + */ -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(block_group)) { - u64 bytes_used; +/* + * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, + * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
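The two data back ref layouts just described correspond to two small on-disk items. As a rough illustration, the standalone C sketch below mirrors them; the sketch_ names are local stand-ins, the fields are host-endian rather than the on-disk __le types, and sizeof() here includes host padding that the packed on-disk format avoids.

#include <stdint.h>
#include <stdio.h>

/* Implicit back ref: identifies who holds the reference, found via the
 * key (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash(root, ino, offset)). */
struct sketch_extent_data_ref {
	uint64_t root;     /* objectid of the subvolume root */
	uint64_t objectid; /* objectid (inode number) of the file */
	uint64_t offset;   /* original offset in the file */
	uint32_t count;    /* how many bookend extents */
};

/* Full/shared back ref: just a count, found via the key
 * (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, parent leaf bytenr). */
struct sketch_shared_data_ref {
	uint32_t count;    /* number of pointers in the tree leaf */
};

int main(void)
{
	printf("implicit body ~%zu bytes, shared body ~%zu bytes (host padded)\n",
	       sizeof(struct sketch_extent_data_ref),
	       sizeof(struct sketch_shared_data_ref));
	return 0;
}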
+ */ +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data) +{ + int type = btrfs_extent_inline_ref_type(eb, iref); + u64 offset = btrfs_extent_inline_ref_offset(eb, iref); - spin_lock(&block_group->space_info->lock); - spin_lock(&block_group->lock); - bytes_used = block_group->key.offset - - btrfs_block_group_used(&block_group->item); - block_group->space_info->bytes_used += bytes_used >> 1; - spin_unlock(&block_group->lock); - spin_unlock(&block_group->space_info->lock); - fragment_free_space(block_group); + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY || + type == BTRFS_SHARED_DATA_REF_KEY || + type == BTRFS_EXTENT_DATA_REF_KEY) { + if (is_data == BTRFS_REF_TYPE_BLOCK) { + if (type == BTRFS_TREE_BLOCK_REF_KEY) + return type; + if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + ASSERT(eb->fs_info); + /* + * Every shared one has parent tree + * block, which must be aligned to + * nodesize. + */ + if (offset && + IS_ALIGNED(offset, eb->fs_info->nodesize)) + return type; + } + } else if (is_data == BTRFS_REF_TYPE_DATA) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return type; + if (type == BTRFS_SHARED_DATA_REF_KEY) { + ASSERT(eb->fs_info); + /* + * Every shared one has parent tree + * block, which must be aligned to + * nodesize. + */ + if (offset && + IS_ALIGNED(offset, eb->fs_info->nodesize)) + return type; + } + } else { + ASSERT(is_data == BTRFS_REF_TYPE_ANY); + return type; + } } -#endif - - caching_ctl->progress = (u64)-1; - up_read(&fs_info->commit_root_sem); - free_excluded_extents(block_group); - mutex_unlock(&caching_ctl->mutex); - - wake_up(&caching_ctl->wait); + btrfs_print_leaf((struct extent_buffer *)eb); + btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", + eb->start, type); + WARN_ON(1); - put_caching_control(caching_ctl); - btrfs_put_block_group(block_group); + return BTRFS_REF_TYPE_INVALID; } -static int cache_block_group(struct btrfs_block_group_cache *cache, - int load_cache_only) +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) { - DEFINE_WAIT(wait); - struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_caching_control *caching_ctl; - int ret = 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - if (!caching_ctl) - return -ENOMEM; - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - refcount_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, - caching_thread, NULL, NULL); - - spin_lock(&cache->lock); - /* - * This should be a rare occasion, but this could happen I think in the - * case where one thread starts to load the space cache info, and then - * some other thread starts a transaction commit which tries to do an - * allocation while the other thread is still loading the space cache - * info. The previous loop should have kept us from choosing this block - * group, but if we've moved to the state where we will wait on caching - * block groups we need to first check if we're doing a fast load here, - * so we can wait for it to finish, otherwise we could end up allocating - * from a block group who's cache gets evicted for one reason or - * another. 
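One detail of btrfs_get_extent_inline_ref_type() above deserves a standalone illustration: for the two shared ref types the inline ref offset is the logical address of the parent tree block, so a cheap sanity check is possible from alignment alone. A minimal userspace model of that check, with the 16K nodesize assumed for the example:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A shared back ref stores the parent tree block's bytenr in its offset
 * field, and tree blocks are always nodesize aligned, so a zero or
 * misaligned offset can only mean corruption.
 */
static bool sketch_shared_ref_offset_valid(uint64_t offset, uint64_t nodesize)
{
	return offset != 0 && (offset % nodesize) == 0;
}

int main(void)
{
	const uint64_t nodesize = 16384; /* assumed: the mkfs default */

	printf("0x40000 -> %d\n", sketch_shared_ref_offset_valid(0x40000, nodesize));
	printf("0x40001 -> %d\n", sketch_shared_ref_offset_valid(0x40001, nodesize));
	return 0;
}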
- */ - while (cache->cached == BTRFS_CACHE_FAST) { - struct btrfs_caching_control *ctl; + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; - ctl = cache->caching_ctl; - refcount_inc(&ctl->count); - prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&cache->lock); + lenum = cpu_to_le64(root_objectid); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); - schedule(); + return ((u64)high_crc << 31) ^ (u64)low_crc; +} - finish_wait(&ctl->wait, &wait); - put_caching_control(ctl); - spin_lock(&cache->lock); - } +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) return 0; - } - WARN_ON(cache->caching_ctl); - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_FAST; - spin_unlock(&cache->lock); - - if (btrfs_test_opt(fs_info, SPACE_CACHE)) { - mutex_lock(&caching_ctl->mutex); - ret = load_free_space_cache(cache); - - spin_lock(&cache->lock); - if (ret == 1) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_FINISHED; - cache->last_byte_to_unpin = (u64)-1; - caching_ctl->progress = (u64)-1; - } else { - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - } - spin_unlock(&cache->lock); -#ifdef CONFIG_BTRFS_DEBUG - if (ret == 1 && - btrfs_should_fragment_free_space(cache)) { - u64 bytes_used; - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - bytes_used = cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->space_info->bytes_used += bytes_used >> 1; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - fragment_free_space(cache); - } -#endif - mutex_unlock(&caching_ctl->mutex); + return 1; +} - wake_up(&caching_ctl->wait); - if (ret == 1) { - put_caching_control(caching_ctl); - free_excluded_extents(cache); - return 0; - } +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; + struct extent_buffer *leaf; + u32 nritems; + int ret; + int recow; + int err = -ENOENT; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; } else { - /* - * We're either using the free space tree or no caching at all. - * Set cached to the appropriate value and wakeup any waiters. 
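hash_extent_data_ref() above is what produces the key offset for implicit data back refs: one CRC32C accumulator over the root objectid, another over owner and offset, folded into 64 bits. The self-contained sketch below reproduces the arithmetic with a plain bitwise CRC32C; it assumes a little-endian host, where the kernel's cpu_to_le64() conversions are the identity.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC32C (Castagnoli). Like the kernel's crc32c(), the caller
 * passes the seed and there is no final inversion. */
static uint32_t sketch_crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* Mirror of hash_extent_data_ref(): root feeds the high half, owner and
 * offset feed the low half, and the halves overlap by one bit. */
static uint64_t sketch_hash_extent_data_ref(uint64_t root_objectid,
					    uint64_t owner, uint64_t offset)
{
	uint32_t high_crc = ~(uint32_t)0;
	uint32_t low_crc = ~(uint32_t)0;
	uint64_t lenum;

	lenum = root_objectid;
	high_crc = sketch_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = owner;
	low_crc = sketch_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = offset;
	low_crc = sketch_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
}

int main(void)
{
	/* hypothetical: subvolume 5, inode 257, file offset 0 */
	printf("key offset = 0x%016llx\n", (unsigned long long)
	       sketch_hash_extent_data_ref(5, 257, 0));
	return 0;
}

Because this folds into well under 64 independent bits, different (root, owner, offset) triples can collide; insert_extent_data_ref() below copes by bumping key.offset on -EEXIST until it finds either the matching ref or a free slot.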
- */ - spin_lock(&cache->lock); - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; - } - spin_unlock(&cache->lock); - wake_up(&caching_ctl->wait); + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; } - if (load_cache_only) { - put_caching_control(caching_ctl); - return 0; + if (parent) { + if (!ret) + return 0; + goto fail; } - down_write(&fs_info->commit_root_sem); - refcount_inc(&caching_ctl->count); - list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->commit_root_sem); + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; - btrfs_get_block_group(cache); + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } - btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; - return ret; -} + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); -/* - * return the block group that starts at or after bytenr - */ -static struct btrfs_block_group_cache * -btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) -{ - return block_group_cache_tree_search(info, bytenr, 0); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; + } +fail: + return err; } -/* - * return the block group that contains the given bytenr - */ -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr) +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) { - return block_group_cache_tree_search(info, bytenr, 1); -} + struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_key key; + struct extent_buffer *leaf; + u32 size; + u32 num_refs; + int ret; -static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) -{ - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - return BTRFS_BLOCK_GROUP_SYSTEM; - else - return BTRFS_BLOCK_GROUP_METADATA; + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); } - return BTRFS_BLOCK_GROUP_DATA; -} - -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; -static void sub_pinned_bytes(struct 
btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); + } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); + } + } + btrfs_mark_buffer_dirty(leaf); + ret = 0; +fail: + btrfs_release_path(path); + return ret; } -/* simple helper to search for an existing data extent at a given offset */ -int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + int refs_to_drop, int *last_ref) { - int ret; struct btrfs_key key; - struct btrfs_path *path; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; + struct extent_buffer *leaf; + u32 num_refs = 0; + int ret = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - key.objectid = start; - key.offset = len; - key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); - btrfs_free_path(path); + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { + btrfs_print_v0_err(trans->fs_info); + btrfs_abort_transaction(trans, -EINVAL); + return -EINVAL; + } else { + BUG(); + } + + BUG_ON(num_refs < refs_to_drop); + num_refs -= refs_to_drop; + + if (num_refs == 0) { + ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); + *last_ref = 1; + } else { + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); + btrfs_mark_buffer_dirty(leaf); + } return ret; } -/* - * helper function to 
lookup reference count and flags of a tree block. - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. the head - * node may also store the extent flags to set. This way you can check - * to see what the reference count and extent flags would be if all of - * the delayed refs are not processed. - */ -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) +static noinline u32 extent_data_ref_count(struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) { - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; struct btrfs_key key; - u32 item_size; - u64 num_refs; - u64 extent_flags; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + int type; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (iref) { + /* + * If type is invalid, we should have bailed out earlier than + * this call. + */ + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } else { + WARN_ON(1); + } + return num_refs; +} + +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_key key; int ret; - /* - * If we don't have skinny metadata, don't bother doing anything - * different - */ - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { - offset = fs_info->nodesize; - metadata = 0; + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; } - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + return ret; +} - if (!trans) { - path->skip_locking = 1; - path->search_commit_root = 1; +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) +{ + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; } -search_again: + ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, + path, &key, 0); + btrfs_release_path(path); + return ret; +} + +static inline int extent_ref_type(u64 parent, u64 owner) +{ 
+ int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} + +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. + */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + int needed; + key.objectid = bytenr; - key.offset = offset; - if (metadata) + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + path->keep_locks = 1; + } else + extra_size = -1; + + /* + * Owner is our level, so we can just add one to get the level for the + * block we are interested in. + */ + if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = owner; + } - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); - if (ret < 0) - goto out_free; +again: + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); + if (ret < 0) { + err = ret; + goto out; + } - if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + /* + * We may be a newly converted file system which still has the old fat + * extent entries for metadata, so try and see if we have one of those. 
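The retry described in the comment above amounts to probing two different key shapes for the same metadata extent. A compact model of how the two candidate keys are built (struct and constants reduced to local stand-ins; 168 and 169 are the on-disk values of BTRFS_EXTENT_ITEM_KEY and BTRFS_METADATA_ITEM_KEY):

#include <stdint.h>
#include <stdio.h>

struct sketch_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

#define SKETCH_EXTENT_ITEM_KEY   168
#define SKETCH_METADATA_ITEM_KEY 169

/*
 * Skinny metadata indexes a tree block as (bytenr, METADATA_ITEM, level);
 * filesystems converted from the old format may still carry fat
 * (bytenr, EXTENT_ITEM, nodesize) entries, hence the fallback search.
 */
static struct sketch_key metadata_search_key(int skinny, uint64_t bytenr,
					     uint64_t level, uint64_t nodesize)
{
	struct sketch_key key = { .objectid = bytenr };

	if (skinny) {
		key.type = SKETCH_METADATA_ITEM_KEY;
		key.offset = level;
	} else {
		key.type = SKETCH_EXTENT_ITEM_KEY;
		key.offset = nodesize;
	}
	return key;
}

int main(void)
{
	struct sketch_key skinny = metadata_search_key(1, 30408704, 1, 16384);
	struct sketch_key fat = metadata_search_key(0, 30408704, 1, 16384);

	printf("skinny (%llu %u %llu), fat (%llu %u %llu)\n",
	       (unsigned long long)skinny.objectid, (unsigned)skinny.type,
	       (unsigned long long)skinny.offset,
	       (unsigned long long)fat.objectid, (unsigned)fat.type,
	       (unsigned long long)fat.offset);
	return 0;
}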
+ */ + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == fs_info->nodesize) + key.offset == num_bytes) ret = 0; } - } - - if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(fs_info, ret, NULL); - - goto out_free; + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + goto again; } + } - BUG_ON(num_refs == 0); - } else { - num_refs = 0; - extent_flags = 0; - ret = 0; + if (ret && !insert) { + err = -ENOENT; + goto out; + } else if (WARN_ON(ret)) { + err = -EIO; + goto out; } - if (!trans) + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (unlikely(item_size < sizeof(*ei))) { + err = -EINVAL; + btrfs_print_v0_err(fs_info); + btrfs_abort_transaction(trans, err); goto out; + } - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); - if (head) { - if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's released and try - * again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref_head(head); - goto search_again; - } - spin_lock(&head->lock); - if (head->extent_op && head->extent_op->update_flags) - extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); - - num_refs += head->ref_mod; - spin_unlock(&head->lock); - mutex_unlock(&head->mutex); - } - spin_unlock(&delayed_refs->lock); -out: - WARN_ON(num_refs == 0); - if (refs) - *refs = num_refs; - if (flags) - *flags = extent_flags; -out_free: - btrfs_free_path(path); - return ret; -} - -/* - * Back reference rules. Back refs have three main goals: - * - * 1) differentiate between all holders of references to an extent so that - * when a reference is dropped we can make sure it was a valid reference - * before freeing the extent. - * - * 2) Provide enough information to quickly find the holders of an extent - * if we notice a given block is corrupted or bad. - * - * 3) Make it easy to migrate blocks for FS shrinking or storage pool - * maintenance. This is actually the same as #2, but with a slightly - * different use case. - * - * There are two kinds of back refs. The implicit back refs is optimized - * for pointers in non-shared tree blocks. For a given pointer in a block, - * back refs of this kind provide information about the block's owner tree - * and the pointer's key. These information allow us to find the block by - * b-tree searching. The full back refs is for pointers in tree blocks not - * referenced by their owner trees. The location of tree block is recorded - * in the back refs. Actually the full back refs is generic, and can be - * used in all cases the implicit back refs is used. The major shortcoming - * of the full back refs is its overhead. 
Every time a tree block gets - * COWed, we have to update back refs entry for all pointers in it. - * - * For a newly allocated tree block, we use implicit back refs for - * pointers in it. This means most tree related operations only involve - * implicit back refs. For a tree block created in old transaction, the - * only way to drop a reference to it is COW it. So we can detect the - * event that tree block loses its owner tree's reference and do the - * back refs conversion. - * - * When a tree block is COWed through a tree, there are four cases: - * - * The reference count of the block is one and the tree is the block's - * owner tree. Nothing to do in this case. - * - * The reference count of the block is one and the tree is not the - * block's owner tree. In this case, full back refs is used for pointers - * in the block. Remove these full back refs, add implicit back refs for - * every pointers in the new block. - * - * The reference count of the block is greater than one and the tree is - * the block's owner tree. In this case, implicit back refs is used for - * pointers in the block. Add full back refs for every pointers in the - * block, increase lower level extents' reference counts. The original - * implicit back refs are entailed to the new block. - * - * The reference count of the block is greater than one and the tree is - * not the block's owner tree. Add implicit back refs for every pointer in - * the new block, increase lower level extents' reference count. - * - * Back Reference Key composing: - * - * The key objectid corresponds to the first byte in the extent, - * The key type is used to differentiate between types of back refs. - * There are different meanings of the key offset for different types - * of back refs. - * - * File extents can be referenced by: - * - * - multiple snapshots, subvolumes, or different generations in one subvol - * - different files inside a single subvolume - * - different offsets inside a file (bookend extents in file.c) - * - * The extent ref structure for the implicit back refs has fields for: - * - * - Objectid of the subvolume root - * - objectid of the file holding the reference - * - original offset in the file - * - how many bookend extents - * - * The key offset for the implicit back refs is hash of the first - * three fields. - * - * The extent ref structure for the full back refs has field for: - * - * - number of pointers in the tree leaf - * - * The key offset for the implicit back refs is the first byte of - * the tree leaf - * - * When a file extent is allocated, The implicit back refs is used. - * the fields are filled in: - * - * (root_key.objectid, inode objectid, offset in file, 1) - * - * When a file extent is removed file truncation, we find the - * corresponding implicit back refs and check the following fields: - * - * (btrfs_header_owner(leaf), inode objectid, offset in file) - * - * Btree extents can be referenced by: - * - * - Different subvolumes - * - * Both the implicit back refs and the full back refs for tree blocks - * only consist of key. The key offset for the implicit back refs is - * objectid of block's owner tree. The key offset for the full back refs - * is the first byte of parent block. - * - * When implicit back refs is used, information about the lowest key and - * level of the tree block are required. These information are stored in - * tree block info structure. 
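The tree block info structure named at the end of this comment is what pre-skinny-metadata filesystems store between the extent item and its inline refs: the lowest key in the block plus the block's level. A packed userspace mirror of those items, showing the byte cost that skinny metadata later removed (the sketch_ names are stand-ins for the ctree.h definitions):

#include <stdint.h>
#include <stdio.h>

struct sketch_disk_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
} __attribute__((packed));

struct sketch_extent_item {
	uint64_t refs;
	uint64_t generation;
	uint64_t flags;
} __attribute__((packed));

struct sketch_tree_block_info {
	struct sketch_disk_key key; /* lowest key in the tree block */
	uint8_t level;              /* level of the tree block */
} __attribute__((packed));

int main(void)
{
	/* Fat metadata: extent_item + tree_block_info + inline refs.
	 * Skinny metadata drops tree_block_info and keeps the level in the
	 * key offset instead. */
	printf("extent_item %zu + tree_block_info %zu bytes before inline refs\n",
	       sizeof(struct sketch_extent_item),
	       sizeof(struct sketch_tree_block_info));
	return 0;
}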
- */ - -/* - * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, - * is_data == BTRFS_REF_TYPE_ANY, either type is OK. - */ -int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, - enum btrfs_inline_ref_type is_data) -{ - int type = btrfs_extent_inline_ref_type(eb, iref); - u64 offset = btrfs_extent_inline_ref_offset(eb, iref); - - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY || - type == BTRFS_SHARED_DATA_REF_KEY || - type == BTRFS_EXTENT_DATA_REF_KEY) { - if (is_data == BTRFS_REF_TYPE_BLOCK) { - if (type == BTRFS_TREE_BLOCK_REF_KEY) - return type; - if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - ASSERT(eb->fs_info); - /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. - */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) - return type; - } - } else if (is_data == BTRFS_REF_TYPE_DATA) { - if (type == BTRFS_EXTENT_DATA_REF_KEY) - return type; - if (type == BTRFS_SHARED_DATA_REF_KEY) { - ASSERT(eb->fs_info); - /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. - */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) - return type; - } - } else { - ASSERT(is_data == BTRFS_REF_TYPE_ANY); - return type; - } - } - - btrfs_print_leaf((struct extent_buffer *)eb); - btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", - eb->start, type); - WARN_ON(1); - - return BTRFS_REF_TYPE_INVALID; -} - -static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) -{ - u32 high_crc = ~(u32)0; - u32 low_crc = ~(u32)0; - __le64 lenum; - - lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); - - return ((u64)high_crc << 31) ^ (u64)low_crc; -} - -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) -{ - return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), - btrfs_extent_data_ref_objectid(leaf, ref), - btrfs_extent_data_ref_offset(leaf, ref)); -} - -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) -{ - if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != owner || - btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; -} - -static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, - u64 owner, u64 offset) -{ - struct btrfs_root *root = trans->fs_info->extent_root; - struct btrfs_key key; - struct btrfs_extent_data_ref *ref; - struct extent_buffer *leaf; - u32 nritems; - int ret; - int recow; - int err = -ENOENT; - - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - } -again: - recow = 0; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } - - if (parent) { - if (!ret) - return 0; - goto fail; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - while (1) { - 
if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - recow = 1; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != bytenr || - key.type != BTRFS_EXTENT_DATA_REF_KEY) - goto fail; - - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) { - if (recow) { - btrfs_release_path(path); - goto again; - } - err = 0; - break; - } - path->slots[0]++; - } -fail: - return err; -} - -static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) -{ - struct btrfs_root *root = trans->fs_info->extent_root; - struct btrfs_key key; - struct extent_buffer *leaf; - u32 size; - u32 num_refs; - int ret; - - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - size = sizeof(struct btrfs_shared_data_ref); - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - size = sizeof(struct btrfs_extent_data_ref); - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, size); - if (ret && ret != -EEXIST) - goto fail; - - leaf = path->nodes[0]; - if (parent) { - struct btrfs_shared_data_ref *ref; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); - } else { - num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_shared_data_ref_count(leaf, ref, num_refs); - } - } else { - struct btrfs_extent_data_ref *ref; - while (ret == -EEXIST) { - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) - break; - btrfs_release_path(path); - key.offset++; - ret = btrfs_insert_empty_item(trans, root, path, &key, - size); - if (ret && ret != -EEXIST) - goto fail; - - leaf = path->nodes[0]; - } - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); - } else { - num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_extent_data_ref_count(leaf, ref, num_refs); - } - } - btrfs_mark_buffer_dirty(leaf); - ret = 0; -fail: - btrfs_release_path(path); - return ret; -} - -static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - int refs_to_drop, int *last_ref) -{ - struct btrfs_key key; - struct btrfs_extent_data_ref *ref1 = NULL; - struct btrfs_shared_data_ref *ref2 = NULL; - struct extent_buffer *leaf; - u32 num_refs = 0; - int ret = 0; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = 
btrfs_shared_data_ref_count(leaf, ref2); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, -EINVAL); - return -EINVAL; - } else { - BUG(); - } - - BUG_ON(num_refs < refs_to_drop); - num_refs -= refs_to_drop; - - if (num_refs == 0) { - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); - *last_ref = 1; - } else { - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); - else if (key.type == BTRFS_SHARED_DATA_REF_KEY) - btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(leaf); - } - return ret; -} - -static noinline u32 extent_data_ref_count(struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; - u32 num_refs = 0; - int type; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); - if (iref) { - /* - * If type is invalid, we should have bailed out earlier than - * this call. - */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); - ASSERT(type != BTRFS_REF_TYPE_INVALID); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } else { - WARN_ON(1); - } - return num_refs; -} - -static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) -{ - struct btrfs_root *root = trans->fs_info->extent_root; - struct btrfs_key key; - int ret; - - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; - return ret; -} - -static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) -{ - struct btrfs_key key; - int ret; - - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } - - ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, - path, &key, 0); - btrfs_release_path(path); - return ret; -} - -static inline int extent_ref_type(u64 parent, u64 owner) -{ - int type; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (parent > 0) - type = BTRFS_SHARED_BLOCK_REF_KEY; - else - type = BTRFS_TREE_BLOCK_REF_KEY; - } else { - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; - } - return type; -} - -static int find_next_key(struct btrfs_path *path, int level, - struct btrfs_key *key) - -{ - for (; level < BTRFS_MAX_LEVEL; level++) { - if 
(!path->nodes[level]) - break; - if (path->slots[level] + 1 >= - btrfs_header_nritems(path->nodes[level])) - continue; - if (level == 0) - btrfs_item_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - else - btrfs_node_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - return 0; - } - return 1; -} - -/* - * look for inline back ref. if back ref is found, *ref_ret is set - * to the address of inline back ref, and 0 is returned. - * - * if back ref isn't found, *ref_ret is set to the address where it - * should be inserted, and -ENOENT is returned. - * - * if insert is true and there are too many inline back refs, the path - * points to the extent item, and -EAGAIN is returned. - * - * NOTE: inline back refs are ordered in the same way that back ref - * items in the tree are ordered. - */ -static noinline_for_stack -int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int insert) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - u64 flags; - u64 item_size; - unsigned long ptr; - unsigned long end; - int extra_size; - int type; - int want; - int ret; - int err = 0; - bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - int needed; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - want = extent_ref_type(parent, owner); - if (insert) { - extra_size = btrfs_extent_inline_ref_size(want); - path->keep_locks = 1; - } else - extra_size = -1; - - /* - * Owner is our level, so we can just add one to get the level for the - * block we are interested in. - */ - if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = owner; - } - -again: - ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); - if (ret < 0) { - err = ret; - goto out; - } - - /* - * We may be a newly converted file system which still has the old fat - * extent entries for metadata, so try and see if we have one of those. 
- */ - if (ret > 0 && skinny_metadata) { - skinny_metadata = false; - if (path->slots[0]) { - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == num_bytes) - ret = 0; - } - if (ret) { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - btrfs_release_path(path); - goto again; - } - } - - if (ret && !insert) { - err = -ENOENT; - goto out; - } else if (WARN_ON(ret)) { - err = -EIO; - goto out; - } - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_abort_transaction(trans, err); - goto out; - } - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - flags = btrfs_extent_flags(leaf, ei); - - ptr = (unsigned long)(ei + 1); - end = (unsigned long)ei + item_size; - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { - ptr += sizeof(struct btrfs_tree_block_info); - BUG_ON(ptr > end); - } - - if (owner >= BTRFS_FIRST_FREE_OBJECTID) - needed = BTRFS_REF_TYPE_DATA; - else - needed = BTRFS_REF_TYPE_BLOCK; - - err = -ENOENT; - while (1) { - if (ptr >= end) { - WARN_ON(ptr > end); - break; - } - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); - if (type == BTRFS_REF_TYPE_INVALID) { - err = -EUCLEAN; - goto out; - } - - if (want < type) - break; - if (want > type) { - ptr += btrfs_extent_inline_ref_size(type); - continue; - } - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - if (match_extent_data_ref(leaf, dref, root_objectid, - owner, offset)) { - err = 0; - break; - } - if (hash_extent_data_ref_item(leaf, dref) < - hash_extent_data_ref(root_objectid, owner, offset)) - break; - } else { - u64 ref_offset; - ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); - if (parent > 0) { - if (parent == ref_offset) { - err = 0; - break; - } - if (ref_offset < parent) - break; - } else { - if (root_objectid == ref_offset) { - err = 0; - break; - } - if (ref_offset < root_objectid) - break; - } - } - ptr += btrfs_extent_inline_ref_size(type); - } - if (err == -ENOENT && insert) { - if (item_size + extra_size >= - BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; - goto out; - } - /* - * To add new inline back ref, we have to make sure - * there is no corresponding back ref item. 
- * For simplicity, we just do not add new inline back - * ref if there is any kind of item for this block - */ - if (find_next_key(path, 0, &key) == 0 && - key.objectid == bytenr && - key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; - goto out; - } - } - *ref_ret = (struct btrfs_extent_inline_ref *)ptr; -out: - if (insert) { - path->keep_locks = 0; - btrfs_unlock_up_safe(path, 1); - } - return err; -} - -/* - * helper to add new inline back ref - */ -static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - unsigned long ptr; - unsigned long end; - unsigned long item_offset; - u64 refs; - int size; - int type; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - item_offset = (unsigned long)iref - (unsigned long)ei; - - type = extent_ref_type(parent, owner); - size = btrfs_extent_inline_ref_size(type); - - btrfs_extend_item(path, size); - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - refs += refs_to_add; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); - if (ptr < end - size) - memmove_extent_buffer(leaf, ptr + size, ptr, - end - size - ptr); - - iref = (struct btrfs_extent_inline_ref *)ptr; - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, dref, owner); - btrfs_set_extent_data_ref_offset(leaf, dref, offset); - btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_shared_data_ref *sref; - sref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else { - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); - } - btrfs_mark_buffer_dirty(leaf); -} - -static int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) -{ - int ret; - - ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, - num_bytes, parent, root_objectid, - owner, offset, 0); - if (ret != -ENOENT) - return ret; - - btrfs_release_path(path); - *ref_ret = NULL; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = lookup_tree_block_ref(trans, path, bytenr, parent, - root_objectid); - } else { - ret = lookup_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset); - } - return ret; -} - -/* - * helper to update/remove inline back ref - */ -static noinline_for_stack -void update_inline_extent_backref(struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op, - int 
*last_ref) -{ - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_extent_item *ei; - struct btrfs_extent_data_ref *dref = NULL; - struct btrfs_shared_data_ref *sref = NULL; - unsigned long ptr; - unsigned long end; - u32 item_size; - int size; - int type; - u64 refs; - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); - refs += refs_to_mod; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - /* - * If type is invalid, we should have bailed out after - * lookup_inline_extent_backref(). - */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - ASSERT(type != BTRFS_REF_TYPE_INVALID); - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - refs = btrfs_extent_data_ref_count(leaf, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - sref = (struct btrfs_shared_data_ref *)(iref + 1); - refs = btrfs_shared_data_ref_count(leaf, sref); - } else { - refs = 1; - BUG_ON(refs_to_mod != -1); - } - - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); - refs += refs_to_mod; - - if (refs > 0) { - if (type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, dref, refs); - else - btrfs_set_shared_data_ref_count(leaf, sref, refs); - } else { - *last_ref = 1; - size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = (unsigned long)iref; - end = (unsigned long)ei + item_size; - if (ptr + size < end) - memmove_extent_buffer(leaf, ptr, ptr + size, - end - ptr - size); - item_size -= size; - btrfs_truncate_item(path, item_size, 1); - } - btrfs_mark_buffer_dirty(leaf); -} - -static noinline_for_stack -int insert_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_extent_inline_ref *iref; - int ret; - - ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, - num_bytes, parent, root_objectid, - owner, offset, 1); - if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - update_inline_extent_backref(path, iref, refs_to_add, - extent_op, NULL); - } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans->fs_info, path, iref, parent, - root_objectid, owner, offset, - refs_to_add, extent_op); - ret = 0; - } - return ret; -} - -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); - } else { - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); - } - return ret; -} - -static int remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data, int *last_ref) -{ - int ret = 0; - - BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, - last_ref); - } else if (is_data) { - ret = remove_extent_data_ref(trans, path, refs_to_drop, - last_ref); - } else { - *last_ref = 1; - ret = btrfs_del_item(trans, 
trans->fs_info->extent_root, path); - } - return ret; -} - -static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, - u64 *discarded_bytes) -{ - int j, ret = 0; - u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << 9); - - if (WARN_ON(start != aligned_start)) { - len -= aligned_start - start; - len = round_down(len, 1 << 9); - start = aligned_start; - } - - *discarded_bytes = 0; - - if (!len) - return 0; - - end = start + len; - bytes_left = len; - - /* Skip any superblocks on this device. */ - for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { - u64 sb_start = btrfs_sb_offset(j); - u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; - u64 size = sb_start - start; - - if (!in_range(sb_start, start, bytes_left) && - !in_range(sb_end, start, bytes_left) && - !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) - continue; - - /* - * Superblock spans beginning of range. Adjust start and - * try again. - */ - if (sb_start <= start) { - start += sb_end - start; - if (start > end) { - bytes_left = 0; - break; - } - bytes_left = end - start; - continue; - } - - if (size) { - ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); - if (!ret) - *discarded_bytes += size; - else if (ret != -EOPNOTSUPP) - return ret; - } - - start = sb_end; - if (start > end) { - bytes_left = 0; - break; - } - bytes_left = end - start; - } - - if (bytes_left) { - ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); - if (!ret) - *discarded_bytes += bytes_left; - } - return ret; -} - -int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) -{ - int ret; - u64 discarded_bytes = 0; - struct btrfs_bio *bbio = NULL; - - - /* - * Avoid races with device replace and make sure our bbio has devices - * associated to its stripes that don't go away while we are discarding. - */ - btrfs_bio_counter_inc_blocked(fs_info); - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, - &bbio, 0); - /* Error condition is -ENOMEM */ - if (!ret) { - struct btrfs_bio_stripe *stripe = bbio->stripes; - int i; - - - for (i = 0; i < bbio->num_stripes; i++, stripe++) { - u64 bytes; - struct request_queue *req_q; - - if (!stripe->dev->bdev) { - ASSERT(btrfs_test_opt(fs_info, DEGRADED)); - continue; - } - req_q = bdev_get_queue(stripe->dev->bdev); - if (!blk_queue_discard(req_q)) - continue; - - ret = btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - stripe->length, - &bytes); - if (!ret) - discarded_bytes += bytes; - else if (ret != -EOPNOTSUPP) - break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. 
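A detail of btrfs_issue_discard() above worth isolating: it must never trim a superblock mirror, so it splits the requested range into the pieces between the mirrors. The sketch below reproduces that splitting with the mirror offsets hardcoded to 64KiB, 64MiB and 256GiB (what btrfs_sb_offset() yields) and printf standing in for blkdev_issue_discard(); the real function's 512-byte alignment handling is omitted.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_SB_SIZE 4096ULL /* BTRFS_SUPER_INFO_SIZE */

/* superblock mirror offsets, as produced by btrfs_sb_offset() */
static const uint64_t sb_offsets[] = { 65536ULL, 67108864ULL, 274877906944ULL };

static int sketch_in_range(uint64_t val, uint64_t start, uint64_t len)
{
	return val >= start && val < start + len;
}

/* Emit the sub-ranges of [start, start + len) that avoid every mirror. */
static void sketch_discard(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;

	for (int j = 0; j < 3 && start < end; j++) {
		uint64_t sb_start = sb_offsets[j];
		uint64_t sb_end = sb_start + SKETCH_SB_SIZE;

		/* same three overlap tests as the kernel code */
		if (!sketch_in_range(sb_start, start, end - start) &&
		    !sketch_in_range(sb_end, start, end - start) &&
		    !sketch_in_range(start, sb_start, SKETCH_SB_SIZE))
			continue;

		if (sb_start > start)
			printf("discard [%llu, %llu)\n",
			       (unsigned long long)start,
			       (unsigned long long)sb_start);
		if (sb_end > start)
			start = sb_end;
	}
	if (start < end)
		printf("discard [%llu, %llu)\n",
		       (unsigned long long)start, (unsigned long long)end);
}

int main(void)
{
	sketch_discard(0, 200000); /* splits around the 64KiB mirror */
	return 0;
}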
-int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 num_bytes, u64 *actual_bytes)
-{
-	int ret;
-	u64 discarded_bytes = 0;
-	struct btrfs_bio *bbio = NULL;
-
-
-	/*
-	 * Avoid races with device replace and make sure our bbio has devices
-	 * associated to its stripes that don't go away while we are discarding.
-	 */
-	btrfs_bio_counter_inc_blocked(fs_info);
-	/* Tell the block device(s) that the sectors can be discarded */
-	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
-			      &bbio, 0);
-	/* Error condition is -ENOMEM */
-	if (!ret) {
-		struct btrfs_bio_stripe *stripe = bbio->stripes;
-		int i;
-
-
-		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
-			u64 bytes;
-			struct request_queue *req_q;
-
-			if (!stripe->dev->bdev) {
-				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
-				continue;
-			}
-			req_q = bdev_get_queue(stripe->dev->bdev);
-			if (!blk_queue_discard(req_q))
-				continue;
-
-			ret = btrfs_issue_discard(stripe->dev->bdev,
-						  stripe->physical,
-						  stripe->length,
-						  &bytes);
-			if (!ret)
-				discarded_bytes += bytes;
-			else if (ret != -EOPNOTSUPP)
-				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
-
-			/*
-			 * Just in case we get back EOPNOTSUPP for some reason,
-			 * just ignore the return value so we don't screw up
-			 * people calling discard_extent.
-			 */
-			ret = 0;
-		}
-		btrfs_put_bbio(bbio);
-	}
-	btrfs_bio_counter_dec(fs_info);
-
-	if (actual_bytes)
-		*actual_bytes = discarded_bytes;
-
-
-	if (ret == -EOPNOTSUPP)
-		ret = 0;
-	return ret;
-}
-
-/* Can return -ENOMEM */
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_ref *generic_ref)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	int old_ref_mod, new_ref_mod;
-	int ret;
-
-	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
-	       generic_ref->action);
-	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
-	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
-
-	if (generic_ref->type == BTRFS_REF_METADATA)
-		ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
-				NULL, &old_ref_mod, &new_ref_mod);
-	else
-		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
-						 &old_ref_mod, &new_ref_mod);
-
-	btrfs_ref_tree_mod(fs_info, generic_ref);
-
-	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
-		sub_pinned_bytes(fs_info, generic_ref);
-
-	return ret;
-}
-
-/*
- * __btrfs_inc_extent_ref - insert backreference for a given extent
- *
- * @trans:	    Handle of transaction
- *
- * @node:	    The delayed ref node used to get the bytenr/length for
- *		    extent whose references are incremented.
- *
- * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
- *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
- *		    bytenr of the parent block. Since new extents are always
- *		    created with indirect references, this will only be the case
- *		    when relocating a shared extent. In that case, root_objectid
- *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
- *		    be 0
- *
- * @root_objectid:  The id of the root where this modification has originated,
- *		    this can be either one of the well-known metadata trees or
- *		    the subvolume id which references this extent.
- *
- * @owner:	    For data extents it is the inode number of the owning file.
- *		    For metadata extents this parameter holds the level in the
- *		    tree of the extent.
- *
- * @offset:	    For metadata extents the offset is ignored and is currently
- *		    always passed as 0. For data extents it is the file offset
- *		    this extent belongs to.
- *
- * @refs_to_add     Number of references to add
- *
- * @extent_op       Pointer to a structure, holding information necessary when
- *                  updating a tree block's flags
- *
- */
-static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				  struct btrfs_delayed_ref_node *node,
-				  u64 parent, u64 root_objectid,
-				  u64 owner, u64 offset, int refs_to_add,
-				  struct btrfs_delayed_extent_op *extent_op)
-{
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	struct btrfs_extent_item *item;
-	struct btrfs_key key;
-	u64 bytenr = node->bytenr;
-	u64 num_bytes = node->num_bytes;
-	u64 refs;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	path->reada = READA_FORWARD;
-	path->leave_spinning = 1;
-	/* this will setup the path even if it fails to insert the back ref */
-	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
-					   parent, root_objectid, owner,
-					   offset, refs_to_add, extent_op);
-	if ((ret < 0 && ret != -EAGAIN) || !ret)
-		goto out;
-
-	/*
-	 * Ok we had -EAGAIN which means we didn't have space to insert an
-	 * inline extent ref, so just update the reference count and add a
-	 * normal backref.
-	 */
-	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
-	refs = btrfs_extent_refs(leaf, item);
-	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
-	if (extent_op)
-		__run_delayed_extent_op(extent_op, leaf, item);
-
-	btrfs_mark_buffer_dirty(leaf);
-	btrfs_release_path(path);
-
-	path->reada = READA_FORWARD;
-	path->leave_spinning = 1;
-	/* now insert the actual backref */
-	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
-				    owner, offset, refs_to_add);
-	if (ret)
-		btrfs_abort_transaction(trans, ret);
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
-static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_delayed_ref_node *node,
-				struct btrfs_delayed_extent_op *extent_op,
-				int insert_reserved)
-{
-	int ret = 0;
-	struct btrfs_delayed_data_ref *ref;
-	struct btrfs_key ins;
-	u64 parent = 0;
-	u64 ref_root = 0;
-	u64 flags = 0;
-
-	ins.objectid = node->bytenr;
-	ins.offset = node->num_bytes;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-	ref = btrfs_delayed_node_to_data_ref(node);
-	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
-
-	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
-		parent = ref->parent;
-	ref_root = ref->root;
-
-	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
-		if (extent_op)
-			flags |= extent_op->flags_to_set;
-		ret = alloc_reserved_file_extent(trans, parent, ref_root,
-						 flags, ref->objectid,
-						 ref->offset, &ins,
-						 node->ref_mod);
-	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
-					     ref->objectid, ref->offset,
-					     node->ref_mod, extent_op);
-	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, node, parent,
-					  ref_root, ref->objectid,
-					  ref->offset, node->ref_mod,
-					  extent_op);
-	} else {
-		BUG();
-	}
-	return ret;
-}
-
-static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
-				    struct extent_buffer *leaf,
-				    struct btrfs_extent_item *ei)
-{
-	u64 flags = btrfs_extent_flags(leaf, ei);
-	if (extent_op->update_flags) {
-		flags |= extent_op->flags_to_set;
-		btrfs_set_extent_flags(leaf, ei, flags);
-	}
-
-	if (extent_op->update_key) {
-		struct btrfs_tree_block_info *bi;
-		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
-		bi = (struct btrfs_tree_block_info *)(ei + 1);
-		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
-	}
-}
-
-static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
-				 struct btrfs_delayed_ref_head *head,
-				 struct btrfs_delayed_extent_op *extent_op)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_key key;
-	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
-	struct extent_buffer *leaf;
-	u32 item_size;
-	int ret;
-	int err = 0;
-	int metadata = !extent_op->is_data;
-
-	if (trans->aborted)
-		return 0;
-
-	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
-		metadata = 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	key.objectid = head->bytenr;
-
-	if (metadata) {
-		key.type = BTRFS_METADATA_ITEM_KEY;
-		key.offset = extent_op->level;
-	} else {
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = head->num_bytes;
-	}
-
-again:
-	path->reada = READA_FORWARD;
-	path->leave_spinning = 1;
-	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret > 0) {
-		if (metadata) {
-			if (path->slots[0] > 0) {
-				path->slots[0]--;
-				btrfs_item_key_to_cpu(path->nodes[0], &key,
-						      path->slots[0]);
-				if (key.objectid == head->bytenr &&
-				    key.type == BTRFS_EXTENT_ITEM_KEY &&
-				    key.offset == head->num_bytes)
-					ret = 0;
-			}
-			if (ret > 0) {
-				btrfs_release_path(path);
-				metadata = 0;
-
-				key.objectid = head->bytenr;
-				key.offset = head->num_bytes;
-				key.type = BTRFS_EXTENT_ITEM_KEY;
-				goto again;
-			}
-		} else {
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-
-	if (unlikely(item_size < sizeof(*ei))) {
-		err = -EINVAL;
-		btrfs_print_v0_err(fs_info);
-		btrfs_abort_transaction(trans, err);
-		goto out;
-	}
-
-	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
-	__run_delayed_extent_op(extent_op, leaf, ei);
-
-	btrfs_mark_buffer_dirty(leaf);
-out:
-	btrfs_free_path(path);
-	return err;
-}
-
-static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_delayed_ref_node *node,
-				struct btrfs_delayed_extent_op *extent_op,
-				int insert_reserved)
-{
-	int ret = 0;
-	struct btrfs_delayed_tree_ref *ref;
-	u64 parent = 0;
-	u64 ref_root = 0;
-
-	ref = btrfs_delayed_node_to_tree_ref(node);
-	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
-
-	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
-		parent = ref->parent;
-	ref_root = ref->root;
-
-	if (node->ref_mod != 1) {
-		btrfs_err(trans->fs_info,
-	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
-			  node->bytenr, node->ref_mod, node->action, ref_root,
-			  parent);
-		return -EIO;
-	}
-	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
-		BUG_ON(!extent_op || !extent_op->update_flags);
-		ret = alloc_reserved_tree_block(trans, node, extent_op);
-	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
-					     ref->level, 0, 1, extent_op);
-	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, node, parent, ref_root,
-					  ref->level, 0, 1, extent_op);
-	} else {
-		BUG();
-	}
-	return ret;
-}
-
-/* helper function to actually process a single delayed ref entry */
-static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
-			       struct btrfs_delayed_ref_node *node,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int insert_reserved)
-{
-	int ret = 0;
-
-	if (trans->aborted) {
-		if (insert_reserved)
-			btrfs_pin_extent(trans->fs_info, node->bytenr,
-					 node->num_bytes, 1);
-		return 0;
-	}
-
-	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
-		ret = run_delayed_tree_ref(trans, node, extent_op,
-					   insert_reserved);
-	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
-		 node->type == BTRFS_SHARED_DATA_REF_KEY)
-		ret = run_delayed_data_ref(trans, node, extent_op,
-					   insert_reserved);
-	else
-		BUG();
-	if (ret && insert_reserved)
-		btrfs_pin_extent(trans->fs_info, node->bytenr,
-				 node->num_bytes, 1);
-	return ret;
-}
-
-static inline struct btrfs_delayed_ref_node *
-select_delayed_ref(struct btrfs_delayed_ref_head *head)
-{
-	struct btrfs_delayed_ref_node *ref;
-
-	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
-		return NULL;
-
-	/*
-	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
-	 * This is to prevent a ref count from going down to zero, which deletes
-	 * the extent item from the extent tree, when there still are references
-	 * to add, which would fail because they would not find the extent item.
-	 */
-	if (!list_empty(&head->ref_add_list))
-		return list_first_entry(&head->ref_add_list,
-					struct btrfs_delayed_ref_node,
-					add_list);
-
-	ref = rb_entry(rb_first_cached(&head->ref_tree),
-		       struct btrfs_delayed_ref_node, ref_node);
-	ASSERT(list_empty(&ref->add_list));
-	return ref;
-}
-
-static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
-				      struct btrfs_delayed_ref_head *head)
-{
-	spin_lock(&delayed_refs->lock);
-	head->processing = 0;
-	delayed_refs->num_heads_ready++;
-	spin_unlock(&delayed_refs->lock);
-	btrfs_delayed_ref_unlock(head);
-}
-
-static struct btrfs_delayed_extent_op *cleanup_extent_op(
-				struct btrfs_delayed_ref_head *head)
-{
-	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
-
-	if (!extent_op)
-		return NULL;
-
-	if (head->must_insert_reserved) {
-		head->extent_op = NULL;
-		btrfs_free_delayed_extent_op(extent_op);
-		return NULL;
-	}
-	return extent_op;
-}
-
-static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
-				     struct btrfs_delayed_ref_head *head)
-{
-	struct btrfs_delayed_extent_op *extent_op;
-	int ret;
-
-	extent_op = cleanup_extent_op(head);
-	if (!extent_op)
-		return 0;
-	head->extent_op = NULL;
-	spin_unlock(&head->lock);
-	ret = run_delayed_extent_op(trans, head, extent_op);
-	btrfs_free_delayed_extent_op(extent_op);
-	return ret ? ret : 1;
-}
-
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
-				  struct btrfs_delayed_ref_root *delayed_refs,
-				  struct btrfs_delayed_ref_head *head)
-{
-	int nr_items = 1;	/* Dropping this ref head update. */
-
-	if (head->total_ref_mod < 0) {
-		struct btrfs_space_info *space_info;
-		u64 flags;
-
-		if (head->is_data)
-			flags = BTRFS_BLOCK_GROUP_DATA;
-		else if (head->is_system)
-			flags = BTRFS_BLOCK_GROUP_SYSTEM;
-		else
-			flags = BTRFS_BLOCK_GROUP_METADATA;
-		space_info = btrfs_find_space_info(fs_info, flags);
-		ASSERT(space_info);
-		percpu_counter_add_batch(&space_info->total_bytes_pinned,
-				   -head->num_bytes,
-				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
-
-		/*
-		 * We had csum deletions accounted for in our delayed refs rsv,
-		 * we need to drop the csum leaves for this update from our
-		 * delayed_refs_rsv.
-		 */
-		if (head->is_data) {
-			spin_lock(&delayed_refs->lock);
-			delayed_refs->pending_csums -= head->num_bytes;
-			spin_unlock(&delayed_refs->lock);
-			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
-				head->num_bytes);
-		}
-	}
-
-	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
-}
-
-static int cleanup_ref_head(struct btrfs_trans_handle *trans,
-			    struct btrfs_delayed_ref_head *head)
-{
-
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	ret = run_and_cleanup_extent_op(trans, head);
-	if (ret < 0) {
-		unselect_delayed_ref_head(delayed_refs, head);
-		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
-		return ret;
-	} else if (ret) {
-		return ret;
-	}
-
-	/*
-	 * Need to drop our head ref lock and re-acquire the delayed ref lock
-	 * and then re-check to make sure nobody got added.
-	 */
-	spin_unlock(&head->lock);
-	spin_lock(&delayed_refs->lock);
-	spin_lock(&head->lock);
-	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
-		spin_unlock(&head->lock);
-		spin_unlock(&delayed_refs->lock);
-		return 1;
-	}
-	btrfs_delete_ref_head(delayed_refs, head);
-	spin_unlock(&head->lock);
-	spin_unlock(&delayed_refs->lock);
-
-	if (head->must_insert_reserved) {
-		btrfs_pin_extent(fs_info, head->bytenr,
-				 head->num_bytes, 1);
-		if (head->is_data) {
-			ret = btrfs_del_csums(trans, fs_info, head->bytenr,
-					      head->num_bytes);
-		}
-	}
-
-	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
-
-	trace_run_delayed_ref_head(fs_info, head, 0);
-	btrfs_delayed_ref_unlock(head);
-	btrfs_put_delayed_ref_head(head);
-	return 0;
-}
-
-static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
-					struct btrfs_trans_handle *trans)
-{
-	struct btrfs_delayed_ref_root *delayed_refs =
-		&trans->transaction->delayed_refs;
-	struct btrfs_delayed_ref_head *head = NULL;
-	int ret;
-
-	spin_lock(&delayed_refs->lock);
-	head = btrfs_select_ref_head(delayed_refs);
-	if (!head) {
-		spin_unlock(&delayed_refs->lock);
-		return head;
-	}
-
-	/*
-	 * Grab the lock that says we are going to process all the refs for
-	 * this head
-	 */
-	ret = btrfs_delayed_ref_lock(delayed_refs, head);
-	spin_unlock(&delayed_refs->lock);
-
-	/*
-	 * We may have dropped the spin lock to get the head mutex lock, and
-	 * that might have given someone else time to free the head.  If that's
-	 * true, it has been removed from our list and we can move on.
-	 */
-	if (ret == -EAGAIN)
-		head = ERR_PTR(-EAGAIN);
-
-	return head;
-}
-
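cleanup_ref_head() above relies on a common kernel locking pattern: the head lock nests inside the delayed-refs lock, so it must be dropped, both locks retaken in the documented order, and the emptiness check repeated because new refs may have arrived in the unlocked window. A compressed sketch of that pattern in pseudo-kernel C (parent_struct, obj_struct and the predicates are hypothetical stand-ins, not btrfs types):

static int try_retire_object(struct parent_struct *parent,
			     struct obj_struct *obj)
{
	spin_unlock(&obj->lock);	/* parent->lock must be taken first */
	spin_lock(&parent->lock);
	spin_lock(&obj->lock);
	if (!obj_still_idle(obj)) {	/* state may have changed meanwhile */
		spin_unlock(&obj->lock);
		spin_unlock(&parent->lock);
		return 1;		/* caller retries later */
	}
	remove_from_parent(parent, obj);
	spin_unlock(&obj->lock);
	spin_unlock(&parent->lock);
	return 0;
}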
-static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
-					   struct btrfs_delayed_ref_head *locked_ref,
-					   unsigned long *run_refs)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_extent_op *extent_op;
-	struct btrfs_delayed_ref_node *ref;
-	int must_insert_reserved = 0;
-	int ret;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	lockdep_assert_held(&locked_ref->mutex);
-	lockdep_assert_held(&locked_ref->lock);
-
-	while ((ref = select_delayed_ref(locked_ref))) {
-		if (ref->seq &&
-		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
-			spin_unlock(&locked_ref->lock);
-			unselect_delayed_ref_head(delayed_refs, locked_ref);
-			return -EAGAIN;
-		}
-
-		(*run_refs)++;
-		ref->in_tree = 0;
-		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
-		RB_CLEAR_NODE(&ref->ref_node);
-		if (!list_empty(&ref->add_list))
-			list_del(&ref->add_list);
-		/*
-		 * When we play the delayed ref, also correct the ref_mod on
-		 * head
-		 */
-		switch (ref->action) {
-		case BTRFS_ADD_DELAYED_REF:
-		case BTRFS_ADD_DELAYED_EXTENT:
-			locked_ref->ref_mod -= ref->ref_mod;
-			break;
-		case BTRFS_DROP_DELAYED_REF:
-			locked_ref->ref_mod += ref->ref_mod;
-			break;
-		default:
-			WARN_ON(1);
-		}
-		atomic_dec(&delayed_refs->num_entries);
-
-		/*
-		 * Record the must_insert_reserved flag before we drop the
-		 * spin lock.
-		 */
-		must_insert_reserved = locked_ref->must_insert_reserved;
-		locked_ref->must_insert_reserved = 0;
-
-		extent_op = locked_ref->extent_op;
-		locked_ref->extent_op = NULL;
-		spin_unlock(&locked_ref->lock);
-
-		ret = run_one_delayed_ref(trans, ref, extent_op,
-					  must_insert_reserved);
-
-		btrfs_free_delayed_extent_op(extent_op);
-		if (ret) {
-			unselect_delayed_ref_head(delayed_refs, locked_ref);
-			btrfs_put_delayed_ref(ref);
-			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
-				    ret);
-			return ret;
-		}
-
-		btrfs_put_delayed_ref(ref);
-		cond_resched();
-
-		spin_lock(&locked_ref->lock);
-		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-	}
-
-	return 0;
-}
-
-/*
- * Returns 0 on success or if called with an already aborted transaction.
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
- */
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-					     unsigned long nr)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_head *locked_ref = NULL;
-	ktime_t start = ktime_get();
-	int ret;
-	unsigned long count = 0;
-	unsigned long actual_count = 0;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-	do {
-		if (!locked_ref) {
-			locked_ref = btrfs_obtain_ref_head(trans);
-			if (IS_ERR_OR_NULL(locked_ref)) {
-				if (PTR_ERR(locked_ref) == -EAGAIN) {
-					continue;
-				} else {
-					break;
-				}
-			}
-			count++;
-		}
-		/*
-		 * We need to try and merge add/drops of the same ref since we
-		 * can run into issues with relocate dropping the implicit ref
-		 * and then it being added back again before the drop can
-		 * finish. If we merged anything we need to re-loop so we can
-		 * get a good ref.
-		 * Or we can get node references of the same type that weren't
-		 * merged when created due to bumps in the tree mod seq, and
-		 * we need to merge them to prevent adding an inline extent
-		 * backref before dropping it (triggering a BUG_ON at
-		 * insert_inline_extent_backref()).
-		 */
-		spin_lock(&locked_ref->lock);
-		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-
-		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
-						      &actual_count);
-		if (ret < 0 && ret != -EAGAIN) {
-			/*
-			 * Error, btrfs_run_delayed_refs_for_head already
-			 * unlocked everything so just bail out
-			 */
-			return ret;
-		} else if (!ret) {
-			/*
-			 * Success, perform the usual cleanup of a processed
-			 * head
-			 */
-			ret = cleanup_ref_head(trans, locked_ref);
-			if (ret > 0) {
-				/* We dropped our lock, we need to loop. */
-				ret = 0;
-				continue;
-			} else if (ret) {
-				return ret;
-			}
-		}
-
-		/*
-		 * Either success case or btrfs_run_delayed_refs_for_head
-		 * returned -EAGAIN, meaning we need to select another head
-		 */
-
-		locked_ref = NULL;
-		cond_resched();
-	} while ((nr != -1 && count < nr) || locked_ref);
-
-	/*
-	 * We don't want to include ref heads since we can have empty ref heads
-	 * and those will drastically skew our runtime down since we just do
-	 * accounting, no actual extent tree updates.
-	 */
-	if (actual_count > 0) {
-		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
-		u64 avg;
-
-		/*
-		 * We weigh the current average higher than our current runtime
-		 * to avoid large swings in the average.
-		 */
-		spin_lock(&delayed_refs->lock);
-		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
-		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
-		spin_unlock(&delayed_refs->lock);
-	}
-	return 0;
-}
-
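The average update at the end keeps 3/4 of the old value and folds in 1/4 of the new sample, a cheap integer exponential moving average. Restated as a stand-alone helper (the function name is hypothetical):

#include <stdint.h>

/* new_avg = 0.75 * old_avg + 0.25 * sample, in integer arithmetic.
 * E.g. old_avg = 800ns, sample = 1200ns: (2400 + 1200) >> 2 = 900ns. */
static inline uint64_t ewma_update(uint64_t old_avg, uint64_t sample)
{
	return (old_avg * 3 + sample) >> 2;
}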
-#ifdef SCRAMBLE_DELAYED_REFS
-/*
- * Normally delayed refs get processed in ascending bytenr order. This
- * correlates in most cases to the order added. To expose dependencies on this
- * order, we start to process the tree in the middle instead of the beginning
- */
-static u64 find_middle(struct rb_root *root)
-{
-	struct rb_node *n = root->rb_node;
-	struct btrfs_delayed_ref_node *entry;
-	int alt = 1;
-	u64 middle;
-	u64 first = 0, last = 0;
-
-	n = rb_first(root);
-	if (n) {
-		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
-		first = entry->bytenr;
-	}
-	n = rb_last(root);
-	if (n) {
-		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
-		last = entry->bytenr;
-	}
-	n = root->rb_node;
-
-	while (n) {
-		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
-		WARN_ON(!entry->in_tree);
-
-		middle = entry->bytenr;
-
-		if (alt)
-			n = n->rb_left;
-		else
-			n = n->rb_right;
-
-		alt = 1 - alt;
-	}
-	return middle;
-}
-#endif
-
-static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
-{
-	u64 num_bytes;
-
-	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
-			     sizeof(struct btrfs_extent_inline_ref));
-	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
-		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
-
-	/*
-	 * We don't ever fill up leaves all the way so multiply by 2 just to be
-	 * closer to what we're really going to want to use.
-	 */
-	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
-}
-
-/*
- * Takes the number of bytes to be csumm'ed and figures out how many leaves it
- * would require to store the csums for that many bytes.
- */
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
-	u64 csum_size;
-	u64 num_csums_per_leaf;
-	u64 num_csums;
-
-	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
-	num_csums_per_leaf = div64_u64(csum_size,
-			(u64)btrfs_super_csum_size(fs_info->super_copy));
-	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
-	num_csums += num_csums_per_leaf - 1;
-	num_csums = div64_u64(num_csums, num_csums_per_leaf);
-	return num_csums;
-}
-
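As a rough sanity check of btrfs_csum_bytes_to_leaves(), here is the same rounding-up division as a self-contained user-space program. The constants are assumptions (4KiB sectorsize, 4-byte CRC32C csums, roughly 16KiB of usable leaf item space), not values read from a real superblock:

#include <stdint.h>
#include <stdio.h>

static uint64_t csum_bytes_to_leaves(uint64_t csum_bytes)
{
	const uint64_t sectorsize = 4096;	/* assumed */
	const uint64_t csum_size = 4;		/* CRC32C, assumed */
	const uint64_t max_item = 16283;	/* leaf minus headers, assumed */
	uint64_t per_leaf = max_item / csum_size;	/* ~4070 csums/leaf */
	uint64_t num_csums = csum_bytes / sectorsize;

	/* Round up: partially filled leaves still cost a whole leaf. */
	return (num_csums + per_leaf - 1) / per_leaf;
}

int main(void)
{
	/* 1GiB of data -> 262144 csums -> 65 leaves with these constants. */
	printf("%llu\n", (unsigned long long)csum_bytes_to_leaves(1ULL << 30));
	return 0;
}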
-/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far.  count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
- *
- * Returns 0 on success or if called with an aborted transaction
- * Returns <0 on error and aborts the transaction
- */
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-			   unsigned long count)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct rb_node *node;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_head *head;
-	int ret;
-	int run_all = count == (unsigned long)-1;
-
-	/* We'll clean this up in btrfs_cleanup_transaction */
-	if (trans->aborted)
-		return 0;
-
-	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
-		return 0;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-	if (count == 0)
-		count = atomic_read(&delayed_refs->num_entries) * 2;
-
-again:
-#ifdef SCRAMBLE_DELAYED_REFS
-	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
-#endif
-	ret = __btrfs_run_delayed_refs(trans, count);
-	if (ret < 0) {
-		btrfs_abort_transaction(trans, ret);
-		return ret;
-	}
-
-	if (run_all) {
-		btrfs_create_pending_block_groups(trans);
-
-		spin_lock(&delayed_refs->lock);
-		node = rb_first_cached(&delayed_refs->href_root);
-		if (!node) {
-			spin_unlock(&delayed_refs->lock);
-			goto out;
-		}
-		head = rb_entry(node, struct btrfs_delayed_ref_head,
-				href_node);
-		refcount_inc(&head->refs);
-		spin_unlock(&delayed_refs->lock);
-
-		/* Mutex was contended, block until it's released and retry. */
-		mutex_lock(&head->mutex);
-		mutex_unlock(&head->mutex);
-
-		btrfs_put_delayed_ref_head(head);
-		cond_resched();
-		goto again;
-	}
-out:
-	return 0;
-}
-
-int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
-				u64 bytenr, u64 num_bytes, u64 flags,
-				int level, int is_data)
-{
-	struct btrfs_delayed_extent_op *extent_op;
-	int ret;
-
-	extent_op = btrfs_alloc_delayed_extent_op();
-	if (!extent_op)
-		return -ENOMEM;
-
-	extent_op->flags_to_set = flags;
-	extent_op->update_flags = true;
-	extent_op->update_key = false;
-	extent_op->is_data = is_data ? true : false;
-	extent_op->level = level;
-
-	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
-	if (ret)
-		btrfs_free_delayed_extent_op(extent_op);
-	return ret;
-}
-
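Note that btrfs_set_disk_extent_flags() only queues a delayed extent_op; the flags land in the on-disk extent item later, when delayed refs for that bytenr are run. A hedged sketch of a typical call (the flag is real, but the surrounding variables trans, buf and level are assumed context):

	/* Ask for FULL_BACKREF to be set on the tree block's extent item;
	 * the update is applied when its delayed refs are processed. */
	ret = btrfs_set_disk_extent_flags(trans, buf->start, buf->len,
					  BTRFS_BLOCK_FLAG_FULL_BACKREF,
					  level, 0);
	if (ret)
		return ret;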
-static noinline int check_delayed_ref(struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      u64 objectid, u64 offset, u64 bytenr)
-{
-	struct btrfs_delayed_ref_head *head;
-	struct btrfs_delayed_ref_node *ref;
-	struct btrfs_delayed_data_ref *data_ref;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_transaction *cur_trans;
-	struct rb_node *node;
-	int ret = 0;
-
-	spin_lock(&root->fs_info->trans_lock);
-	cur_trans = root->fs_info->running_transaction;
-	if (cur_trans)
-		refcount_inc(&cur_trans->use_count);
-	spin_unlock(&root->fs_info->trans_lock);
-	if (!cur_trans)
-		return 0;
-
-	delayed_refs = &cur_trans->delayed_refs;
-	spin_lock(&delayed_refs->lock);
-	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
-	if (!head) {
-		spin_unlock(&delayed_refs->lock);
-		btrfs_put_transaction(cur_trans);
-		return 0;
-	}
-
-	if (!mutex_trylock(&head->mutex)) {
-		refcount_inc(&head->refs);
-		spin_unlock(&delayed_refs->lock);
-
-		btrfs_release_path(path);
-
-		/*
-		 * Mutex was contended, block until it's released and let
-		 * caller try again
-		 */
-		mutex_lock(&head->mutex);
-		mutex_unlock(&head->mutex);
-		btrfs_put_delayed_ref_head(head);
-		btrfs_put_transaction(cur_trans);
-		return -EAGAIN;
-	}
-	spin_unlock(&delayed_refs->lock);
-
-	spin_lock(&head->lock);
-	/*
-	 * XXX: We should replace this with a proper search function in the
-	 * future.
-	 */
-	for (node = rb_first_cached(&head->ref_tree); node;
-	     node = rb_next(node)) {
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
-		/* If it's a shared ref we know a cross reference exists */
-		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
-			ret = 1;
-			break;
-		}
-
-		data_ref = btrfs_delayed_node_to_data_ref(ref);
-
-		/*
-		 * If our ref doesn't match the one we're currently looking at
-		 * then we have a cross reference.
-		 */
-		if (data_ref->root != root->root_key.objectid ||
-		    data_ref->objectid != objectid ||
-		    data_ref->offset != offset) {
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&head->lock);
-	mutex_unlock(&head->mutex);
-	btrfs_put_transaction(cur_trans);
-	return ret;
-}
-
-static noinline int check_committed_ref(struct btrfs_root *root,
-					struct btrfs_path *path,
-					u64 objectid, u64 offset, u64 bytenr)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct extent_buffer *leaf;
-	struct btrfs_extent_data_ref *ref;
-	struct btrfs_extent_inline_ref *iref;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
-	u32 item_size;
-	int type;
-	int ret;
-
-	key.objectid = bytenr;
-	key.offset = (u64)-1;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-
-	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-	BUG_ON(ret == 0); /* Corruption */
-
-	ret = -ENOENT;
-	if (path->slots[0] == 0)
-		goto out;
-
-	path->slots[0]--;
-	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
-	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
-		goto out;
-
-	ret = 1;
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
-
-	if (item_size != sizeof(*ei) +
-	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
-		goto out;
-
-	if (btrfs_extent_generation(leaf, ei) <=
-	    btrfs_root_last_snapshot(&root->root_item))
-		goto out;
-
-	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-
-	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
-	if (type != BTRFS_EXTENT_DATA_REF_KEY)
-		goto out;
-
-	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
-	if (btrfs_extent_refs(leaf, ei) !=
-	    btrfs_extent_data_ref_count(leaf, ref) ||
-	    btrfs_extent_data_ref_root(leaf, ref) !=
-	    root->root_key.objectid ||
-	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
-	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
-		goto out;
-
-	ret = 0;
-out:
-	return ret;
-}
-
-int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
-			  u64 bytenr)
-{
-	struct btrfs_path *path;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	do {
-		ret = check_committed_ref(root, path, objectid,
-					  offset, bytenr);
-		if (ret && ret != -ENOENT)
-			goto out;
-
-		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
-	} while (ret == -EAGAIN);
-
-out:
-	btrfs_free_path(path);
-	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
-		WARN_ON(ret > 0);
-	return ret;
-}
-
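btrfs_cross_ref_exist() combines the two checks above: the committed extent tree first, then the in-memory delayed refs, retrying whenever the delayed-ref head mutex was contended (-EAGAIN). The return convention is easy to misread, so a hedged caller sketch (the helper names and surrounding context are hypothetical):

	/* 0 means the extent is provably referenced only by us, so an
	 * in-place (NOCOW) write is safe; anything else forces COW. */
	ret = btrfs_cross_ref_exist(root, ino, file_offset, disk_bytenr);
	if (ret == 0)
		do_nocow_write();	/* hypothetical */
	else
		fall_back_to_cow();	/* hypothetical; also on errors */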
-static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   struct extent_buffer *buf,
-			   int full_backref, int inc)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	u64 bytenr;
-	u64 num_bytes;
-	u64 parent;
-	u64 ref_root;
-	u32 nritems;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	struct btrfs_ref generic_ref = { 0 };
-	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
-	int i;
-	int action;
-	int level;
-	int ret = 0;
-
-	if (btrfs_is_testing(fs_info))
-		return 0;
-
-	ref_root = btrfs_header_owner(buf);
-	nritems = btrfs_header_nritems(buf);
-	level = btrfs_header_level(buf);
-
-	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
-		return 0;
-
-	if (full_backref)
-		parent = buf->start;
-	else
-		parent = 0;
-	if (inc)
-		action = BTRFS_ADD_DELAYED_REF;
-	else
-		action = BTRFS_DROP_DELAYED_REF;
-
-	for (i = 0; i < nritems; i++) {
-		if (level == 0) {
-			btrfs_item_key_to_cpu(buf, &key, i);
-			if (key.type != BTRFS_EXTENT_DATA_KEY)
-				continue;
-			fi = btrfs_item_ptr(buf, i,
-					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(buf, fi) ==
-			    BTRFS_FILE_EXTENT_INLINE)
-				continue;
-			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (bytenr == 0)
-				continue;
-
-			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
-			key.offset -= btrfs_file_extent_offset(buf, fi);
-			btrfs_init_generic_ref(&generic_ref, action, bytenr,
-					       num_bytes, parent);
-			generic_ref.real_root = root->root_key.objectid;
-			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
-					    key.offset);
-			generic_ref.skip_qgroup = for_reloc;
-			if (inc)
-				ret = btrfs_inc_extent_ref(trans, &generic_ref);
-			else
-				ret = btrfs_free_extent(trans, &generic_ref);
-			if (ret)
-				goto fail;
-		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
-			num_bytes = fs_info->nodesize;
-			btrfs_init_generic_ref(&generic_ref, action, bytenr,
-					       num_bytes, parent);
-			generic_ref.real_root = root->root_key.objectid;
-			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
-			generic_ref.skip_qgroup = for_reloc;
-			if (inc)
-				ret = btrfs_inc_extent_ref(trans, &generic_ref);
-			else
-				ret = btrfs_free_extent(trans, &generic_ref);
-			if (ret)
-				goto fail;
-		}
-	}
-	return 0;
-fail:
-	return ret;
-}
-
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref)
-{
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
-}
-
-int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref)
-{
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
-}
-
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
-				 struct btrfs_path *path,
-				 struct btrfs_block_group_cache *cache)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	int ret;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	unsigned long bi;
-	struct extent_buffer *leaf;
-
-	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
-	if (ret) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto fail;
-	}
-
-	leaf = path->nodes[0];
-	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
-	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
-	btrfs_mark_buffer_dirty(leaf);
-fail:
-	btrfs_release_path(path);
-	return ret;
-
-}
-
-static struct btrfs_block_group_cache *next_block_group(
-		struct btrfs_block_group_cache *cache)
-{
-	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct rb_node *node;
-
-	spin_lock(&fs_info->block_group_cache_lock);
-
-	/* If our block group was removed, we need a full search. */
-	if (RB_EMPTY_NODE(&cache->cache_node)) {
-		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
-
-		spin_unlock(&fs_info->block_group_cache_lock);
-		btrfs_put_block_group(cache);
-		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
-		return cache;
-	}
-	node = rb_next(&cache->cache_node);
-	btrfs_put_block_group(cache);
-	if (node) {
-		cache = rb_entry(node, struct btrfs_block_group_cache,
-				 cache_node);
-		btrfs_get_block_group(cache);
-	} else
-		cache = NULL;
-	spin_unlock(&fs_info->block_group_cache_lock);
-	return cache;
-}
-
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
-			    struct btrfs_trans_handle *trans,
-			    struct btrfs_path *path)
-{
-	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct btrfs_root *root = fs_info->tree_root;
-	struct inode *inode = NULL;
-	struct extent_changeset *data_reserved = NULL;
-	u64 alloc_hint = 0;
-	int dcs = BTRFS_DC_ERROR;
-	u64 num_pages = 0;
-	int retries = 0;
-	int ret = 0;
-
-	/*
-	 * If this block group is smaller than 100 megs don't bother caching
-	 * the block group.
-	 */
-	if (block_group->key.offset < (100 * SZ_1M)) {
-		spin_lock(&block_group->lock);
-		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
-		spin_unlock(&block_group->lock);
-		return 0;
-	}
-
-	if (trans->aborted)
-		return 0;
-again:
-	inode = lookup_free_space_inode(block_group, path);
-	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
-		ret = PTR_ERR(inode);
-		btrfs_release_path(path);
-		goto out;
-	}
-
-	if (IS_ERR(inode)) {
-		BUG_ON(retries);
-		retries++;
-
-		if (block_group->ro)
-			goto out_free;
-
-		ret = create_free_space_inode(trans, block_group, path);
-		if (ret)
-			goto out_free;
-		goto again;
-	}
-
-	/*
-	 * We want to set the generation to 0, that way if anything goes wrong
-	 * from here on out we know not to trust this cache when we load up
-	 * next time.
-	 */
-	BTRFS_I(inode)->generation = 0;
-	ret = btrfs_update_inode(trans, root, inode);
-	if (ret) {
-		/*
-		 * So theoretically we could recover from this, simply set the
-		 * super cache generation to 0 so we know to invalidate the
-		 * cache, but then we'd have to keep track of the block groups
-		 * that fail this way so we know we _have_ to reset this cache
-		 * before the next commit or risk reading stale cache.  So to
-		 * limit our exposure to horrible edge cases lets just abort the
-		 * transaction, this only happens in really bad situations
-		 * anyway.
-		 */
-		btrfs_abort_transaction(trans, ret);
-		goto out_put;
-	}
-	WARN_ON(ret);
-
-	/* We've already setup this transaction, go ahead and exit */
-	if (block_group->cache_generation == trans->transid &&
-	    i_size_read(inode)) {
-		dcs = BTRFS_DC_SETUP;
-		goto out_put;
-	}
-
-	if (i_size_read(inode) > 0) {
-		ret = btrfs_check_trunc_cache_free_space(fs_info,
-					&fs_info->global_block_rsv);
-		if (ret)
-			goto out_put;
-
-		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
-		if (ret)
-			goto out_put;
-	}
-
-	spin_lock(&block_group->lock);
-	if (block_group->cached != BTRFS_CACHE_FINISHED ||
-	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
-		/*
-		 * don't bother trying to write stuff out _if_
-		 * a) we're not cached,
-		 * b) we're with nospace_cache mount option,
-		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
-		 */
-		dcs = BTRFS_DC_WRITTEN;
-		spin_unlock(&block_group->lock);
-		goto out_put;
-	}
-	spin_unlock(&block_group->lock);
-
-	/*
-	 * We hit an ENOSPC when setting up the cache in this transaction, just
-	 * skip doing the setup, we've already cleared the cache so we're safe.
-	 */
-	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
-		ret = -ENOSPC;
-		goto out_put;
-	}
-
-	/*
-	 * Try to preallocate enough space based on how big the block group is.
-	 * Keep in mind this has to include any pinned space which could end up
-	 * taking up quite a bit since it's not folded into the other space
-	 * cache.
-	 */
-	num_pages = div_u64(block_group->key.offset, SZ_256M);
-	if (!num_pages)
-		num_pages = 1;
-
-	num_pages *= 16;
-	num_pages *= PAGE_SIZE;
-
-	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
-	if (ret)
-		goto out_put;
-
-	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
-					      num_pages, num_pages,
-					      &alloc_hint);
-	/*
-	 * Our cache requires contiguous chunks so that we don't modify a bunch
-	 * of metadata or split extents when writing the cache out, which means
-	 * we can enospc if we are heavily fragmented in addition to just normal
-	 * out of space conditions.  So if we hit this just skip setting up any
-	 * other block groups for this transaction, maybe we'll unpin enough
-	 * space the next time around.
-	 */
-	if (!ret)
-		dcs = BTRFS_DC_SETUP;
-	else if (ret == -ENOSPC)
-		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
-
-out_put:
-	iput(inode);
-out_free:
-	btrfs_release_path(path);
-out:
-	spin_lock(&block_group->lock);
-	if (!ret && dcs == BTRFS_DC_SETUP)
-		block_group->cache_generation = trans->transid;
-	block_group->disk_cache_state = dcs;
-	spin_unlock(&block_group->lock);
-
-	extent_changeset_free(data_reserved);
-	return ret;
-}
-
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_cache *cache, *tmp;
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	struct btrfs_path *path;
-
-	if (list_empty(&cur_trans->dirty_bgs) ||
-	    !btrfs_test_opt(fs_info, SPACE_CACHE))
-		return 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/* Could add new block groups, use _safe just in case */
-	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
-				 dirty_list) {
-		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-			cache_save_setup(cache, trans, path);
-	}
-
-	btrfs_free_path(path);
-	return 0;
-}
-
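Despite the variable name, num_pages in cache_save_setup() ends up holding bytes after the final multiplication: the sizing works out to 16 pages of cache space per 256MiB of block group, with a one-chunk floor. A stand-alone restatement of the arithmetic (constants assumed, helper name hypothetical):

#include <stdint.h>

/* Bytes preallocated for a block group's free-space cache file.
 * E.g. a 1GiB group with 4KiB pages: 4 * 16 * 4096 = 256KiB. */
static uint64_t cache_prealloc_bytes(uint64_t bg_bytes, uint64_t page_size)
{
	uint64_t chunks = bg_bytes / (256ULL << 20);	/* per 256MiB */

	if (!chunks)
		chunks = 1;	/* floor: small groups still get one chunk */
	return chunks * 16 * page_size;
}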
-/*
- * transaction commit does final block group cache writeback during a
- * critical section where nothing is allowed to change the FS.  This is
- * required in order for the cache to actually match the block group,
- * but can introduce a lot of latency into the commit.
- *
- * So, btrfs_start_dirty_block_groups is here to kick off block group
- * cache IO.  There's a chance we'll have to redo some of it if the
- * block group changes again during the commit, but it greatly reduces
- * the commit latency by getting rid of the easy block groups while
- * we're still allowing others to join the commit.
- */
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_cache *cache;
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	int ret = 0;
-	int should_put;
-	struct btrfs_path *path = NULL;
-	LIST_HEAD(dirty);
-	struct list_head *io = &cur_trans->io_bgs;
-	int num_started = 0;
-	int loops = 0;
-
-	spin_lock(&cur_trans->dirty_bgs_lock);
-	if (list_empty(&cur_trans->dirty_bgs)) {
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-		return 0;
-	}
-	list_splice_init(&cur_trans->dirty_bgs, &dirty);
-	spin_unlock(&cur_trans->dirty_bgs_lock);
-
-again:
-	/*
-	 * make sure all the block groups on our dirty list actually
-	 * exist
-	 */
-	btrfs_create_pending_block_groups(trans);
-
-	if (!path) {
-		path = btrfs_alloc_path();
-		if (!path)
-			return -ENOMEM;
-	}
-
-	/*
-	 * cache_write_mutex is here only to save us from balance or automatic
-	 * removal of empty block groups deleting this block group while we are
-	 * writing out the cache
-	 */
-	mutex_lock(&trans->transaction->cache_write_mutex);
-	while (!list_empty(&dirty)) {
-		bool drop_reserve = true;
-
-		cache = list_first_entry(&dirty,
-					 struct btrfs_block_group_cache,
-					 dirty_list);
-		/*
-		 * this can happen if something re-dirties a block
-		 * group that is already under IO.  Just wait for it to
-		 * finish and then do it all again
-		 */
-		if (!list_empty(&cache->io_list)) {
-			list_del_init(&cache->io_list);
-			btrfs_wait_cache_io(trans, cache, path);
-			btrfs_put_block_group(cache);
-		}
-
-
-		/*
-		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
-		 * if it should update the cache_state.  Don't delete
-		 * until after we wait.
-		 *
-		 * Since we're not running in the commit critical section
-		 * we need the dirty_bgs_lock to protect from update_block_group
-		 */
-		spin_lock(&cur_trans->dirty_bgs_lock);
-		list_del_init(&cache->dirty_list);
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-
-		should_put = 1;
-
-		cache_save_setup(cache, trans, path);
-
-		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
-			cache->io_ctl.inode = NULL;
-			ret = btrfs_write_out_cache(trans, cache, path);
-			if (ret == 0 && cache->io_ctl.inode) {
-				num_started++;
-				should_put = 0;
-
-				/*
-				 * The cache_write_mutex is protecting the
-				 * io_list, also refer to the definition of
-				 * btrfs_transaction::io_bgs for more details
-				 */
-				list_add_tail(&cache->io_list, io);
-			} else {
-				/*
-				 * if we failed to write the cache, the
-				 * generation will be bad and life goes on
-				 */
-				ret = 0;
-			}
-		}
-		if (!ret) {
-			ret = write_one_cache_group(trans, path, cache);
-			/*
-			 * Our block group might still be attached to the list
-			 * of new block groups in the transaction handle of some
-			 * other task (struct btrfs_trans_handle->new_bgs). This
-			 * means its block group item isn't yet in the extent
-			 * tree. If this happens ignore the error, as we will
-			 * try again later in the critical section of the
-			 * transaction commit.
-			 */
-			if (ret == -ENOENT) {
-				ret = 0;
-				spin_lock(&cur_trans->dirty_bgs_lock);
-				if (list_empty(&cache->dirty_list)) {
-					list_add_tail(&cache->dirty_list,
-						      &cur_trans->dirty_bgs);
-					btrfs_get_block_group(cache);
-					drop_reserve = false;
-				}
-				spin_unlock(&cur_trans->dirty_bgs_lock);
-			} else if (ret) {
-				btrfs_abort_transaction(trans, ret);
-			}
-		}
-
-		/* if it's not on the io list, we need to put the block group */
-		if (should_put)
-			btrfs_put_block_group(cache);
-		if (drop_reserve)
-			btrfs_delayed_refs_rsv_release(fs_info, 1);
-
-		if (ret)
-			break;
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
-		/*
-		 * Avoid blocking other tasks for too long. It might even save
-		 * us from writing caches for block groups that are going to be
-		 * removed.
-		 */
-		mutex_unlock(&trans->transaction->cache_write_mutex);
-		mutex_lock(&trans->transaction->cache_write_mutex);
-	}
-	mutex_unlock(&trans->transaction->cache_write_mutex);
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
-	/*
-	 * go through delayed refs for all the stuff we've just kicked off
-	 * and then loop back (just once)
-	 */
-	ret = btrfs_run_delayed_refs(trans, 0);
-	if (!ret && loops == 0) {
-		loops++;
-		spin_lock(&cur_trans->dirty_bgs_lock);
-		list_splice_init(&cur_trans->dirty_bgs, &dirty);
-		/*
-		 * dirty_bgs_lock protects us from concurrent block group
-		 * deletes too (not just cache_write_mutex).
-		 */
-		if (!list_empty(&dirty)) {
-			spin_unlock(&cur_trans->dirty_bgs_lock);
-			goto again;
-		}
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-	} else if (ret < 0) {
-		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
 	}
-	btrfs_free_path(path);
-	return ret;
-}
-
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
-{
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_cache *cache;
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	int ret = 0;
-	int should_put;
-	struct btrfs_path *path;
-	struct list_head *io = &cur_trans->io_bgs;
-	int num_started = 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/*
-	 * Even though we are in the critical section of the transaction commit,
-	 * we can still have concurrent tasks adding elements to this
-	 * transaction's list of dirty block groups. These tasks correspond to
-	 * endio free space workers started when writeback finishes for a
-	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
-	 * allocate new block groups as a result of COWing nodes of the root
-	 * tree when updating the free space inode. The writeback for the space
-	 * caches is triggered by an earlier call to
-	 * btrfs_start_dirty_block_groups() and iterations of the following
-	 * loop.
-	 * Also we want to do the cache_save_setup first and then run the
-	 * delayed refs to make sure we have the best chance at doing this all
-	 * in one shot.
-	 */
-	spin_lock(&cur_trans->dirty_bgs_lock);
-	while (!list_empty(&cur_trans->dirty_bgs)) {
-		cache = list_first_entry(&cur_trans->dirty_bgs,
-					 struct btrfs_block_group_cache,
-					 dirty_list);
+	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
+		needed = BTRFS_REF_TYPE_DATA;
+	else
+		needed = BTRFS_REF_TYPE_BLOCK;
-		/*
-		 * this can happen if cache_save_setup re-dirties a block
-		 * group that is already under IO.  Just wait for it to
-		 * finish and then do it all again
-		 */
-		if (!list_empty(&cache->io_list)) {
-			spin_unlock(&cur_trans->dirty_bgs_lock);
-			list_del_init(&cache->io_list);
-			btrfs_wait_cache_io(trans, cache, path);
-			btrfs_put_block_group(cache);
-			spin_lock(&cur_trans->dirty_bgs_lock);
+	err = -ENOENT;
+	while (1) {
+		if (ptr >= end) {
+			WARN_ON(ptr > end);
+			break;
+		}
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+		if (type == BTRFS_REF_TYPE_INVALID) {
+			err = -EUCLEAN;
+			goto out;
 		}
-		/*
-		 * don't remove from the dirty list until after we've waited
-		 * on any pending IO
-		 */
-		list_del_init(&cache->dirty_list);
-		spin_unlock(&cur_trans->dirty_bgs_lock);
-		should_put = 1;
-
-		cache_save_setup(cache, trans, path);
+		if (want < type)
+			break;
+		if (want > type) {
+			ptr += btrfs_extent_inline_ref_size(type);
+			continue;
+		}
-		if (!ret)
-			ret = btrfs_run_delayed_refs(trans,
-						     (unsigned long) -1);
-
-		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
-			cache->io_ctl.inode = NULL;
-			ret = btrfs_write_out_cache(trans, cache, path);
-			if (ret == 0 && cache->io_ctl.inode) {
-				num_started++;
-				should_put = 0;
-				list_add_tail(&cache->io_list, io);
-			} else {
-				/*
-				 * if we failed to write the cache, the
-				 * generation will be bad and life goes on
-				 */
-				ret = 0;
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			if (match_extent_data_ref(leaf, dref, root_objectid,
+						  owner, offset)) {
+				err = 0;
+				break;
 			}
-		}
-		if (!ret) {
-			ret = write_one_cache_group(trans, path, cache);
-			/*
-			 * One of the free space endio workers might have
-			 * created a new block group while updating a free space
-			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
-			 * and hasn't released its transaction handle yet, in
-			 * which case the new block group is still attached to
-			 * its transaction handle and its creation has not
-			 * finished yet (no block group item in the extent tree
-			 * yet, etc). If this is the case, wait for all free
-			 * space endio workers to finish and retry. This is a
-			 * very rare case so no need for a more efficient and
-			 * complex approach.
-			 */
-			if (ret == -ENOENT) {
-				wait_event(cur_trans->writer_wait,
-				   atomic_read(&cur_trans->num_writers) == 1);
-				ret = write_one_cache_group(trans, path, cache);
+			if (hash_extent_data_ref_item(leaf, dref) <
+			    hash_extent_data_ref(root_objectid, owner, offset))
+				break;
+		} else {
+			u64 ref_offset;
+			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+			if (parent > 0) {
+				if (parent == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < parent)
+					break;
+			} else {
+				if (root_objectid == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < root_objectid)
+					break;
 			}
-			if (ret)
-				btrfs_abort_transaction(trans, ret);
-		}
-
-		/* if it's not on the io list, we need to put the block group */
-		if (should_put)
-			btrfs_put_block_group(cache);
-		btrfs_delayed_refs_rsv_release(fs_info, 1);
-		spin_lock(&cur_trans->dirty_bgs_lock);
+		ptr += btrfs_extent_inline_ref_size(type);
 	}
-	spin_unlock(&cur_trans->dirty_bgs_lock);
-
-	/*
-	 * Refer to the definition of io_bgs member for details why it's safe
-	 * to use it without any locking
-	 */
-	while (!list_empty(io)) {
-		cache = list_first_entry(io, struct btrfs_block_group_cache,
-					 io_list);
-		list_del_init(&cache->io_list);
-		btrfs_wait_cache_io(trans, cache, path);
-		btrfs_put_block_group(cache);
+	if (err == -ENOENT && insert) {
+		if (item_size + extra_size >=
+		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+			err = -EAGAIN;
+			goto out;
+		}
+		/*
+		 * To add new inline back ref, we have to make sure
+		 * there is no corresponding back ref item.
+		 * For simplicity, we just do not add new inline back
+		 * ref if there is any kind of item for this block
+		 */
+		if (find_next_key(path, 0, &key) == 0 &&
+		    key.objectid == bytenr &&
+		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			err = -EAGAIN;
+			goto out;
+		}
 	}
-
-	btrfs_free_path(path);
-	return ret;
-}
-
-int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
-	struct btrfs_block_group_cache *block_group;
-	int readonly = 0;
-
-	block_group = btrfs_lookup_block_group(fs_info, bytenr);
-	if (!block_group || block_group->ro)
-		readonly = 1;
-	if (block_group)
-		btrfs_put_block_group(block_group);
-	return readonly;
-}
-
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
-	struct btrfs_block_group_cache *bg;
-	bool ret = true;
-
-	bg = btrfs_lookup_block_group(fs_info, bytenr);
-	if (!bg)
-		return false;
-
-	spin_lock(&bg->lock);
-	if (bg->ro)
-		ret = false;
-	else
-		atomic_inc(&bg->nocow_writers);
-	spin_unlock(&bg->lock);
-
-	/* no put on block group, done by btrfs_dec_nocow_writers */
-	if (!ret)
-		btrfs_put_block_group(bg);
-
-	return ret;
-
-}
-
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
-	struct btrfs_block_group_cache *bg;
-
-	bg = btrfs_lookup_block_group(fs_info, bytenr);
-	ASSERT(bg);
-	if (atomic_dec_and_test(&bg->nocow_writers))
-		wake_up_var(&bg->nocow_writers);
-	/*
-	 * Once for our lookup and once for the lookup done by a previous call
-	 * to btrfs_inc_nocow_writers()
-	 */
-	btrfs_put_block_group(bg);
-	btrfs_put_block_group(bg);
-}
-
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
-{
-	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
-}
-
-static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	u64 extra_flags = chunk_to_extended(flags) &
-				BTRFS_EXTENDED_PROFILE_MASK;
-
-	write_seqlock(&fs_info->profiles_lock);
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		fs_info->avail_data_alloc_bits |= extra_flags;
-	if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		fs_info->avail_metadata_alloc_bits |= extra_flags;
-	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		fs_info->avail_system_alloc_bits |= extra_flags;
-	write_sequnlock(&fs_info->profiles_lock);
-}
-
-/*
- * returns target flags in extended format or 0 if restripe for this
- * chunk_type is not in progress
- *
- * should be called with balance_lock held
- */
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
-	u64 target = 0;
-
-	if (!bctl)
-		return 0;
-
-	if (flags & BTRFS_BLOCK_GROUP_DATA &&
-	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
-	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
-		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
-	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
-		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
+out:
+	if (insert) {
+		path->keep_locks = 0;
+		btrfs_unlock_up_safe(path, 1);
 	}
-
-	return target;
+	return err;
 }
 
 /*
- * @flags: available profiles in extended format (see ctree.h)
- *
- * Returns reduced profile in chunk format.  If profile changing is in
- * progress (either running or paused) picks the target profile (if it's
- * already available), otherwise falls back to plain reducing.
+ * helper to add new inline back ref
  */
-static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
-{
-	u64 num_devices = fs_info->fs_devices->rw_devices;
-	u64 target;
-	u64 raid_type;
-	u64 allowed = 0;
-
-	/*
-	 * see if restripe for this chunk_type is in progress, if so
-	 * try to reduce to the target profile
-	 */
-	spin_lock(&fs_info->balance_lock);
-	target = get_restripe_target(fs_info, flags);
-	if (target) {
-		/* pick target profile only if it's already available */
-		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
-			spin_unlock(&fs_info->balance_lock);
-			return extended_to_chunk(target);
-		}
-	}
-	spin_unlock(&fs_info->balance_lock);
-
-	/* First, mask out the RAID levels which aren't possible */
-	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
-		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
-			allowed |= btrfs_raid_array[raid_type].bg_flag;
-	}
-	allowed &= flags;
-
-	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
-		allowed = BTRFS_BLOCK_GROUP_RAID6;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
-		allowed = BTRFS_BLOCK_GROUP_RAID5;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
-		allowed = BTRFS_BLOCK_GROUP_RAID10;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
-		allowed = BTRFS_BLOCK_GROUP_RAID1;
-	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
-		allowed = BTRFS_BLOCK_GROUP_RAID0;
-
-	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
-
-	return extended_to_chunk(flags | allowed);
-}
-
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
-	unsigned seq;
-	u64 flags;
-
-	do {
-		flags = orig_flags;
-		seq = read_seqbegin(&fs_info->profiles_lock);
-
-		if (flags & BTRFS_BLOCK_GROUP_DATA)
-			flags |= fs_info->avail_data_alloc_bits;
-		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-			flags |= fs_info->avail_system_alloc_bits;
-		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-			flags |= fs_info->avail_metadata_alloc_bits;
-	} while (read_seqretry(&fs_info->profiles_lock, seq));
-
-	return btrfs_reduce_alloc_profile(fs_info, flags);
-}
-
-static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
+static noinline_for_stack
+void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int refs_to_add,
+				 struct btrfs_delayed_extent_op *extent_op)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	u64 flags;
-	u64 ret;
-
-	if (data)
-		flags = BTRFS_BLOCK_GROUP_DATA;
-	else if (root == fs_info->chunk_root)
-		flags = BTRFS_BLOCK_GROUP_SYSTEM;
-	else
-		flags = BTRFS_BLOCK_GROUP_METADATA;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	unsigned long ptr;
+	unsigned long end;
+	unsigned long item_offset;
+	u64 refs;
+	int size;
+	int type;
 
-	ret = get_alloc_profile(fs_info, flags);
-	return ret;
-}
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	item_offset = (unsigned long)iref - (unsigned long)ei;
 
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
-{
-	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
-}
+	type = extent_ref_type(parent, owner);
+	size = btrfs_extent_inline_ref_size(type);
 
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
-{
-	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-}
+	btrfs_extend_item(path, size);
 
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
-{
-	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-}
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	refs += refs_to_add;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, ei);
 
-static void force_metadata_allocation(struct btrfs_fs_info *info)
-{
-	struct list_head *head = &info->space_info;
-	struct btrfs_space_info *found;
+	ptr = (unsigned long)ei + item_offset;
+	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+	if (ptr < end - size)
+		memmove_extent_buffer(leaf, ptr + size, ptr,
+				      end - size - ptr);
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(found, head, list) {
-		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
-			found->force_alloc = CHUNK_ALLOC_FORCE;
+	iref = (struct btrfs_extent_inline_ref *)ptr;
+	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+		struct btrfs_extent_data_ref *dref;
+		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
+		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
+		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
+		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
+	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+		struct btrfs_shared_data_ref *sref;
+		sref = (struct btrfs_shared_data_ref *)(iref + 1);
+		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
 	}
-	rcu_read_unlock();
+	btrfs_mark_buffer_dirty(leaf);
 }
 
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
-			      struct btrfs_space_info *sinfo, int force)
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref **ref_ret,
+				 u64 bytenr, u64 num_bytes, u64 parent,
+				 u64 root_objectid, u64 owner, u64 offset)
 {
-	u64 bytes_used = btrfs_space_info_used(sinfo, false);
-	u64 thresh;
+	int ret;
ret; - if (force == CHUNK_ALLOC_FORCE) - return 1; + ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, + num_bytes, parent, root_objectid, + owner, offset, 0); + if (ret != -ENOENT) + return ret; - /* - * in limited mode, we want to have some free space up to - * about 1% of the FS size. - */ - if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(fs_info->super_copy); - thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); + btrfs_release_path(path); + *ref_ret = NULL; - if (sinfo->total_bytes - bytes_used < thresh) - return 1; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset); } - - if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) - return 0; - return 1; -} - -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) -{ - u64 num_dev; - - num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; - if (!num_dev) - num_dev = fs_info->fs_devices->rw_devices; - - return num_dev; + return ret; } /* - * If @is_allocation is true, reserve space in the system space info necessary - * for allocating a chunk, otherwise if it's false, reserve space necessary for - * removing a chunk. + * helper to update/remove inline back ref */ -void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +static noinline_for_stack +void update_inline_extent_backref(struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *info; - u64 left; - u64 thresh; - int ret = 0; - u64 num_devs; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + u64 refs; + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); /* - * Needed because we can end up allocating a system chunk and for an - * atomic and race free space reservation in the chunk block reserve. + * If type is invalid, we should have bailed out after + * lookup_inline_extent_backref(). 
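+	 * (The type was already validated when the ref was looked up, so
+	 * the ASSERT below is only a sanity check against a corrupted leaf.)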
*/ - lockdep_assert_held(&fs_info->chunk_mutex); - - info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - spin_lock(&info->lock); - left = info->total_bytes - btrfs_space_info_used(info, true); - spin_unlock(&info->lock); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + ASSERT(type != BTRFS_REF_TYPE_INVALID); - num_devs = get_profile_num_devs(fs_info, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + BUG_ON(refs_to_mod != -1); + } - /* num_devs device items to update and 1 chunk item to add or remove */ - thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + - btrfs_calc_trans_metadata_size(fs_info, 1); + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; - if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", - left, thresh, type); - btrfs_dump_space_info(fs_info, info, 0, 0); + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + *last_ref = 1; + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + btrfs_truncate_item(path, item_size, 1); } + btrfs_mark_buffer_dirty(leaf); +} - if (left < thresh) { - u64 flags = btrfs_system_alloc_profile(fs_info); +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; - /* - * Ignore failure to create system chunk. We might end up not - * needing it, as we might not need to COW all nodes/leafs from - * the paths we visit in the chunk tree (they were already COWed - * or created in the current transaction for example). 
- */ - ret = btrfs_alloc_chunk(trans, flags); + ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, + num_bytes, parent, root_objectid, + owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + update_inline_extent_backref(path, iref, refs_to_add, + extent_op, NULL); + } else if (ret == -ENOENT) { + setup_inline_extent_backref(trans->fs_info, path, iref, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + ret = 0; } + return ret; +} - if (!ret) { - ret = btrfs_block_rsv_add(fs_info->chunk_root, - &fs_info->chunk_block_rsv, - thresh, BTRFS_RESERVE_NO_FLUSH); - if (!ret) - trans->chunk_bytes_reserved += thresh; +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); } + return ret; } -/* - * If force is CHUNK_ALLOC_FORCE: - * - return 1 if it successfully allocates a chunk, - * - return errors including -ENOSPC otherwise. - * If force is NOT CHUNK_ALLOC_FORCE: - * - return 0 if it doesn't need to allocate a new chunk, - * - return 1 if it successfully allocates a chunk, - * - return errors including -ENOSPC otherwise. - */ -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - enum btrfs_chunk_alloc_enum force) +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data, int *last_ref) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *space_info; - bool wait_for_alloc = false; - bool should_alloc = false; int ret = 0; - /* Don't re-enter if we're already allocating a chunk */ - if (trans->allocating_chunk) - return -ENOSPC; - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - - do { - spin_lock(&space_info->lock); - if (force < space_info->force_alloc) - force = space_info->force_alloc; - should_alloc = should_alloc_chunk(fs_info, space_info, force); - if (space_info->full) { - /* No more free physical space */ - if (should_alloc) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&space_info->lock); - return ret; - } else if (!should_alloc) { - spin_unlock(&space_info->lock); - return 0; - } else if (space_info->chunk_alloc) { - /* - * Someone is already allocating, so we need to block - * until this someone is finished and then loop to - * recheck if we should continue with our allocation - * attempt. 
- */ - wait_for_alloc = true; - spin_unlock(&space_info->lock); - mutex_lock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->chunk_mutex); - } else { - /* Proceed with allocation */ - space_info->chunk_alloc = 1; - wait_for_alloc = false; - spin_unlock(&space_info->lock); - } - - cond_resched(); - } while (wait_for_alloc); - - mutex_lock(&fs_info->chunk_mutex); - trans->allocating_chunk = true; + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + update_inline_extent_backref(path, iref, -refs_to_drop, NULL, + last_ref); + } else if (is_data) { + ret = remove_extent_data_ref(trans, path, refs_to_drop, + last_ref); + } else { + *last_ref = 1; + ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); + } + return ret; +} - /* - * If we have mixed data/metadata chunks we want to make sure we keep - * allocating mixed chunks instead of individual chunks. - */ - if (btrfs_mixed_space_info(space_info)) - flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) +{ + int j, ret = 0; + u64 bytes_left, end; + u64 aligned_start = ALIGN(start, 1 << 9); - /* - * if we're doing a data chunk, go ahead and make sure that - * we keep a reasonable number of metadata chunks allocated in the - * FS as well. - */ - if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { - fs_info->data_chunk_allocations++; - if (!(fs_info->data_chunk_allocations % - fs_info->metadata_ratio)) - force_metadata_allocation(fs_info); + if (WARN_ON(start != aligned_start)) { + len -= aligned_start - start; + len = round_down(len, 1 << 9); + start = aligned_start; } - /* - * Check if we have enough space in SYSTEM chunk because we may need - * to update devices. - */ - check_system_chunk(trans, flags); - - ret = btrfs_alloc_chunk(trans, flags); - trans->allocating_chunk = false; + *discarded_bytes = 0; - spin_lock(&space_info->lock); - if (ret < 0) { - if (ret == -ENOSPC) - space_info->full = 1; - else - goto out; - } else { - ret = 1; - space_info->max_extent_size = 0; - } + if (!len) + return 0; - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; -out: - space_info->chunk_alloc = 0; - spin_unlock(&space_info->lock); - mutex_unlock(&fs_info->chunk_mutex); - /* - * When we allocate a new chunk we reserve space in the chunk block - * reserve to make sure we can COW nodes/leafs in the chunk tree or - * add new nodes/leafs to it if we end up needing to do it when - * inserting the chunk item and updating device items as part of the - * second phase of chunk allocation, performed by - * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a - * large number of new block groups to create in our transaction - * handle's new_bgs list to avoid exhausting the chunk block reserve - * in extreme cases - like having a single transaction create many new - * block groups when starting to write out the free space caches of all - * the block groups that were made dirty during the lifetime of the - * transaction. - */ - if (trans->chunk_bytes_reserved >= (u64)SZ_2M) - btrfs_create_pending_block_groups(trans); + end = start + len; + bytes_left = len; - return ret; -} + /* Skip any superblocks on this device. 
*/ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_start = btrfs_sb_offset(j); + u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; + u64 size = sb_start - start; -static int update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc) -{ - struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_block_group_cache *cache = NULL; - u64 total = num_bytes; - u64 old_val; - u64 byte_in_group; - int factor; - int ret = 0; + if (!in_range(sb_start, start, bytes_left) && + !in_range(sb_end, start, bytes_left) && + !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) + continue; - /* block accounting for super block */ - spin_lock(&info->delalloc_root_lock); - old_val = btrfs_super_bytes_used(info->super_copy); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_root_lock); - - while (total) { - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { - ret = -ENOENT; - break; + /* + * Superblock spans beginning of range. Adjust start and + * try again. + */ + if (sb_start <= start) { + start += sb_end - start; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + continue; } - factor = btrfs_bg_type_to_factor(cache->flags); - - /* - * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. - */ - if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, 1); - - byte_in_group = bytenr - cache->key.objectid; - WARN_ON(byte_in_group > cache->key.offset); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (btrfs_test_opt(info, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; - - old_val = btrfs_block_group_used(&cache->item); - num_bytes = min(total, cache->key.offset - byte_in_group); - if (alloc) { - old_val += num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; - cache->space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - } else { - old_val -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, - cache->space_info, num_bytes); - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - trace_btrfs_space_reservation(info, "pinned", - cache->space_info->flags, - num_bytes, 1); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(info->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); + if (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; } - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - trans->delayed_ref_updates++; - btrfs_get_block_group(cache); + start = sb_end; + if (start > end) { + bytes_left = 0; + break; 
} - spin_unlock(&trans->transaction->dirty_bgs_lock); - - /* - * No longer have used bytes in this block group, queue it for - * deletion. We do this after adding the block group to the - * dirty list to avoid races between cleaner kthread and space - * cache writeout. - */ - if (!alloc && old_val == 0) - btrfs_mark_bg_unused(cache); - - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; + bytes_left = end - start; } - /* Modified block groups are accounted for in the delayed_refs_rsv. */ - btrfs_update_delayed_refs_rsv(trans); + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += bytes_left; + } return ret; } -static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) { - struct btrfs_block_group_cache *cache; - u64 bytenr; + int ret; + u64 discarded_bytes = 0; + struct btrfs_bio *bbio = NULL; - spin_lock(&fs_info->block_group_cache_lock); - bytenr = fs_info->first_logical_byte; - spin_unlock(&fs_info->block_group_cache_lock); - if (bytenr < (u64)-1) - return bytenr; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(fs_info); + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, + &bbio, 0); + /* Error condition is -ENOMEM */ + if (!ret) { + struct btrfs_bio_stripe *stripe = bbio->stripes; + int i; - cache = btrfs_lookup_first_block_group(fs_info, search_start); - if (!cache) - return 0; - bytenr = cache->key.objectid; - btrfs_put_block_group(cache); + for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 bytes; + struct request_queue *req_q; - return bytenr; -} + if (!stripe->dev->bdev) { + ASSERT(btrfs_test_opt(fs_info, DEGRADED)); + continue; + } + req_q = bdev_get_queue(stripe->dev->bdev); + if (!blk_queue_discard(req_q)) + continue; -static int pin_down_extent(struct btrfs_block_group_cache *cache, - u64 bytenr, u64 num_bytes, int reserved) -{ - struct btrfs_fs_info *fs_info = cache->fs_info; + ret = btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + stripe->length, + &bytes); + if (!ret) + discarded_bytes += bytes; + else if (ret != -EOPNOTSUPP) + break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); - if (reserved) { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. 
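+			 * (The queue was already checked with
+			 * blk_queue_discard() above, so -EOPNOTSUPP is not
+			 * expected here in practice.)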
+			 */
+			ret = 0;
+		}
+		btrfs_put_bbio(bbio);
 	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&cache->space_info->lock);
+	btrfs_bio_counter_dec(fs_info);
 
-	trace_btrfs_space_reservation(fs_info, "pinned",
-				      cache->space_info->flags, num_bytes, 1);
-	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
-			  num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
-	set_extent_dirty(fs_info->pinned_extents, bytenr,
-			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
-	return 0;
+	if (actual_bytes)
+		*actual_bytes = discarded_bytes;
+
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
+	return ret;
 }
 
-/*
- * this function must be called within transaction
- */
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
-		     u64 bytenr, u64 num_bytes, int reserved)
+/* Can return -ENOMEM */
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_ref *generic_ref)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	int old_ref_mod, new_ref_mod;
+	int ret;
 
-	cache = btrfs_lookup_block_group(fs_info, bytenr);
-	BUG_ON(!cache); /* Logic error */
+	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
+	       generic_ref->action);
+	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
+	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
 
-	pin_down_extent(cache, bytenr, num_bytes, reserved);
+	if (generic_ref->type == BTRFS_REF_METADATA)
+		ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
+				NULL, &old_ref_mod, &new_ref_mod);
+	else
+		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
+						 &old_ref_mod, &new_ref_mod);
 
-	btrfs_put_block_group(cache);
-	return 0;
+	btrfs_ref_tree_mod(fs_info, generic_ref);
+
+	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
+		sub_pinned_bytes(fs_info, generic_ref);
+
+	return ret;
 }
 
 /*
- * this function must be called within transaction
+ * __btrfs_inc_extent_ref - insert backreference for a given extent
+ *
+ * @trans:	    Handle of transaction
+ *
+ * @node:	    The delayed ref node used to get the bytenr/length for
+ *		    extent whose references are incremented.
+ *
+ * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
+ *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
+ *		    bytenr of the parent block. Since new extents are always
+ *		    created with indirect references, this will only be the case
+ *		    when relocating a shared extent. In that case, root_objectid
+ *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
+ *		    be 0.
+ *
+ * @root_objectid:  The id of the root where this modification has originated,
+ *		    this can be either one of the well-known metadata trees or
+ *		    the subvolume id which references this extent.
+ *
+ * @owner:	    For data extents it is the inode number of the owning file.
+ *		    For metadata extents this parameter holds the level in the
+ *		    tree of the extent.
+ *
+ * @offset:	    For metadata extents the offset is ignored and is currently
+ *		    always passed as 0. For data extents it is the file offset
+ *		    this extent belongs to.
+ *
+ * @refs_to_add:    Number of references to add
+ *
+ * @extent_op:      Pointer to a structure, holding information necessary when
+ *                  updating a tree block's flags
+ *
 */
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
-				    u64 bytenr, u64 num_bytes)
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_delayed_ref_node *node,
+				  u64 parent, u64 root_objectid,
+				  u64 owner, u64 offset, int refs_to_add,
+				  struct btrfs_delayed_extent_op *extent_op)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *item;
+	struct btrfs_key key;
+	u64 bytenr = node->bytenr;
+	u64 num_bytes = node->num_bytes;
+	u64 refs;
 	int ret;
 
-	cache = btrfs_lookup_block_group(fs_info, bytenr);
-	if (!cache)
-		return -EINVAL;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = READA_FORWARD;
+	path->leave_spinning = 1;
+	/* this will setup the path even if it fails to insert the back ref */
+	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
+					   parent, root_objectid, owner,
+					   offset, refs_to_add, extent_op);
+	if ((ret < 0 && ret != -EAGAIN) || !ret)
+		goto out;
 
 	/*
-	 * pull in the free space cache (if any) so that our pin
-	 * removes the free space from the cache.  We have load_only set
-	 * to one because the slow code to read in the free extents does check
-	 * the pinned extents.
+	 * Ok we had -EAGAIN which means we didn't have space to insert an
+	 * inline extent ref, so just update the reference count and add a
+	 * normal backref.
 	 */
-	cache_block_group(cache, 1);
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, item);
+	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, item);
 
-	pin_down_extent(cache, bytenr, num_bytes, 0);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
 
-	/* remove us from the free space cache (if we're there at all) */
-	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
-	btrfs_put_block_group(cache);
+	path->reada = READA_FORWARD;
+	path->leave_spinning = 1;
+	/* now insert the actual backref */
+	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
+				    owner, offset, refs_to_add);
+	if (ret)
+		btrfs_abort_transaction(trans, ret);
+out:
+	btrfs_free_path(path);
 	return ret;
 }
 
-static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
-				   u64 start, u64 num_bytes)
-{
-	int ret;
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_caching_control *caching_ctl;
+static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_delayed_ref_node *node,
+				struct btrfs_delayed_extent_op *extent_op,
+				int insert_reserved)
+{
+	int ret = 0;
+	struct btrfs_delayed_data_ref *ref;
+	struct btrfs_key ins;
+	u64 parent = 0;
+	u64 ref_root = 0;
+	u64 flags = 0;
+
+	ins.objectid = node->bytenr;
+	ins.offset = node->num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
 
-	block_group = btrfs_lookup_block_group(fs_info, start);
-	if (!block_group)
-		return -EINVAL;
+	ref = btrfs_delayed_node_to_data_ref(node);
+	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
 
-	cache_block_group(block_group, 0);
-	caching_ctl = get_caching_control(block_group);
+	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
+		parent = ref->parent;
+	ref_root = ref->root;
 
-	if (!caching_ctl) {
-		/* Logic error */
-
BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) + flags |= extent_op->flags_to_set; + ret = alloc_reserved_file_extent(trans, parent, ref_root, + flags, ref->objectid, + ref->offset, &ins, + node->ref_mod); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ref->objectid, ref->offset, + node->ref_mod, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, node, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); } else { - mutex_lock(&caching_ctl->mutex); + BUG(); + } + return ret; +} - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(fs_info, start, num_bytes); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - if (ret) - goto out_lock; +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } - num_bytes = (start + num_bytes) - - caching_ctl->progress; - start = caching_ctl->progress; - ret = add_excluded_extent(fs_info, start, num_bytes); - } -out_lock: - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); } - btrfs_put_block_group(block_group); - return ret; } -int btrfs_exclude_logged_extents(struct extent_buffer *eb) +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_file_extent_item *item; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key key; - int found_type; - int i; - int ret = 0; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; + int ret; + int err = 0; + int metadata = !extent_op->is_data; - if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) + if (trans->aborted) return 0; - for (i = 0; i < btrfs_header_nritems(eb); i++) { - btrfs_item_key_to_cpu(eb, &key, i); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(eb, item); - if (found_type == BTRFS_FILE_EXTENT_INLINE) - continue; - if (btrfs_file_extent_disk_bytenr(eb, item) == 0) - continue; - key.objectid = btrfs_file_extent_disk_bytenr(eb, item); - key.offset = btrfs_file_extent_disk_num_bytes(eb, item); - ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); - if (ret) - break; - } + if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + metadata = 0; - return ret; -} + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; -static void -btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) -{ - atomic_inc(&bg->reservations); -} + key.objectid = head->bytenr; -void 
btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, - const u64 start) -{ - struct btrfs_block_group_cache *bg; + if (metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = extent_op->level; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; + } - bg = btrfs_lookup_block_group(fs_info, start); - ASSERT(bg); - if (atomic_dec_and_test(&bg->reservations)) - wake_up_var(&bg->reservations); - btrfs_put_block_group(bg); -} +again: + path->reada = READA_FORWARD; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (metadata) { + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == head->bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == head->num_bytes) + ret = 0; + } + if (ret > 0) { + btrfs_release_path(path); + metadata = 0; -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) -{ - struct btrfs_space_info *space_info = bg->space_info; + key.objectid = head->bytenr; + key.offset = head->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } + } else { + err = -EIO; + goto out; + } + } - ASSERT(bg->ro); + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) - return; + if (unlikely(item_size < sizeof(*ei))) { + err = -EINVAL; + btrfs_print_v0_err(fs_info); + btrfs_abort_transaction(trans, err); + goto out; + } - /* - * Our block group is read only but before we set it to read only, - * some task might have had allocated an extent from it already, but it - * has not yet created a respective ordered extent (and added it to a - * root's list of ordered extents). - * Therefore wait for any task currently allocating extents, since the - * block group's reservations counter is incremented while a read lock - * on the groups' semaphore is held and decremented after releasing - * the read access on that semaphore and creating the ordered extent. - */ - down_write(&space_info->groups_sem); - up_write(&space_info->groups_sem); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); - wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; } -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters - * @cache: The cache we are manipulating - * @ram_bytes: The number of bytes of file content, and will be same to - * @num_bytes except for the compress path. - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write - * - * This is called by the allocator when it reserves space. If this is a - * reservation and the block group has become read only we cannot make the - * reservation and return -EAGAIN, otherwise this function always succeeds. 
- */ -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 ram_bytes, u64 num_bytes, int delalloc) +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) { - struct btrfs_space_info *space_info = cache->space_info; int ret = 0; + struct btrfs_delayed_tree_ref *ref; + u64 parent = 0; + u64 ref_root = 0; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) { - ret = -EAGAIN; + ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); + + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + ref_root = ref->root; + + if (node->ref_mod != 1) { + btrfs_err(trans->fs_info, + "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + node->bytenr, node->ref_mod, node->action, ref_root, + parent); + return -EIO; + } + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags); + ret = alloc_reserved_tree_block(trans, node, extent_op); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, node, parent, ref_root, + ref->level, 0, 1, extent_op); } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; + BUG(); } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); return ret; } -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write - * - * This is called by somebody who is freeing space that was never actually used - * on disk. For example if you reserve some space for a new leaf in transaction - * A and before transaction A commits you free that leaf, you call this with - * reserve set to 0 in order to clear the reservation. 
- */ - -static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc) +/* helper function to actually process a single delayed ref entry */ +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) { - struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->max_extent_size = 0; + if (trans->aborted) { + if (insert_reserved) + btrfs_pin_extent(trans->fs_info, node->bytenr, + node->num_bytes, 1); + return 0; + } - if (delalloc) - cache->delalloc_bytes -= num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); -} -void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) -{ - struct btrfs_caching_control *next; - struct btrfs_caching_control *caching_ctl; - struct btrfs_block_group_cache *cache; + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, node, extent_op, + insert_reserved); + else + BUG(); + if (ret && insert_reserved) + btrfs_pin_extent(trans->fs_info, node->bytenr, + node->num_bytes, 1); + return ret; +} - down_write(&fs_info->commit_root_sem); +static inline struct btrfs_delayed_ref_node * +select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - cache = caching_ctl->block_group; - if (block_group_cache_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - fs_info->pinned_extents = &fs_info->freed_extents[1]; - else - fs_info->pinned_extents = &fs_info->freed_extents[0]; + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); - up_write(&fs_info->commit_root_sem); + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} - btrfs_update_global_block_rsv(fs_info); +static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); } -/* - * Returns the free cluster for the given space info and sets empty_cluster to - * what it should be based on the mount options. 
- */ -static struct btrfs_free_cluster * -fetch_cluster_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 *empty_cluster) +static struct btrfs_delayed_extent_op *cleanup_extent_op( + struct btrfs_delayed_ref_head *head) { - struct btrfs_free_cluster *ret = NULL; + struct btrfs_delayed_extent_op *extent_op = head->extent_op; - *empty_cluster = 0; - if (btrfs_mixed_space_info(space_info)) - return ret; + if (!extent_op) + return NULL; - if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - ret = &fs_info->meta_alloc_cluster; - if (btrfs_test_opt(fs_info, SSD)) - *empty_cluster = SZ_2M; - else - *empty_cluster = SZ_64K; - } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && - btrfs_test_opt(fs_info, SSD_SPREAD)) { - *empty_cluster = SZ_2M; - ret = &fs_info->data_alloc_cluster; + if (head->must_insert_reserved) { + head->extent_op = NULL; + btrfs_free_delayed_extent_op(extent_op); + return NULL; } - - return ret; + return extent_op; } -static int unpin_extent_range(struct btrfs_fs_info *fs_info, - u64 start, u64 end, - const bool return_free_space) +static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) { - struct btrfs_block_group_cache *cache = NULL; - struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - struct btrfs_free_cluster *cluster = NULL; - u64 len; - u64 total_unpinned = 0; - u64 empty_cluster = 0; - bool readonly; - - while (start <= end) { - readonly = false; - if (!cache || - start >= cache->key.objectid + cache->key.offset) { - if (cache) - btrfs_put_block_group(cache); - total_unpinned = 0; - cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); /* Logic error */ + struct btrfs_delayed_extent_op *extent_op; + int ret; - cluster = fetch_cluster_info(fs_info, - cache->space_info, - &empty_cluster); - empty_cluster <<= 1; - } + extent_op = cleanup_extent_op(head); + if (!extent_op) + return 0; + head->extent_op = NULL; + spin_unlock(&head->lock); + ret = run_delayed_extent_op(trans, head, extent_op); + btrfs_free_delayed_extent_op(extent_op); + return ret ? ret : 1; +} - len = cache->key.objectid + cache->key.offset - start; - len = min(len, end + 1 - start); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + int nr_items = 1; /* Dropping this ref head update. */ - if (start < cache->last_byte_to_unpin) { - len = min(len, cache->last_byte_to_unpin - start); - if (return_free_space) - btrfs_add_free_space(cache, start, len); - } + if (head->total_ref_mod < 0) { + struct btrfs_space_info *space_info; + u64 flags; - start += len; - total_unpinned += len; - space_info = cache->space_info; + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -head->num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); /* - * If this space cluster has been marked as fragmented and we've - * unpinned enough in this block group to potentially allow a - * cluster to be created inside of it go ahead and clear the - * fragmented check. + * We had csum deletions accounted for in our delayed refs rsv, + * we need to drop the csum leaves for this update from our + * delayed_refs_rsv. 
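+		 * (btrfs_csum_bytes_to_leaves() below converts the extent's
+		 * byte count into the worst-case number of csum leaves that
+		 * the reservation covered.)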
*/ - if (cluster && cluster->fragmented && - total_unpinned > empty_cluster) { - spin_lock(&cluster->lock); - cluster->fragmented = 0; - spin_unlock(&cluster->lock); - } - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); - - trace_btrfs_space_reservation(fs_info, "pinned", - space_info->flags, len, 0); - space_info->max_extent_size = 0; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); - if (cache->ro) { - space_info->bytes_readonly += len; - readonly = true; - } - spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - u64 to_add = len; - - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - to_add = min(len, global_rsv->size - - global_rsv->reserved); - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - trace_btrfs_space_reservation(fs_info, - "space_info", - space_info->flags, - to_add, 1); - len -= to_add; - } - spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_space_info_add_new_bytes(fs_info, - space_info, len); + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, + head->num_bytes); } - spin_unlock(&space_info->lock); } - if (cache) - btrfs_put_block_group(cache); - return 0; + btrfs_delayed_refs_rsv_release(fs_info, nr_items); } -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) +static int cleanup_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) { + struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; - struct list_head *deleted_bgs; - struct extent_io_tree *unpin; - u64 start; - u64 end; + struct btrfs_delayed_ref_root *delayed_refs; int ret; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; - - while (!trans->aborted) { - struct extent_state *cached_state = NULL; - - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - break; - } - - if (btrfs_test_opt(fs_info, DISCARD)) - ret = btrfs_discard_extent(fs_info, start, - end + 1 - start, NULL); - - clear_extent_dirty(unpin, start, end, &cached_state); - unpin_extent_range(fs_info, start, end, true); - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - free_extent_state(cached_state); - cond_resched(); - } - - /* - * Transaction is finished. We don't need the lock anymore. We - * do need to clean up the block groups in case of a transaction - * abort. 
- */ - deleted_bgs = &trans->transaction->deleted_bgs; - list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { - u64 trimmed = 0; + delayed_refs = &trans->transaction->delayed_refs; - ret = -EROFS; - if (!trans->aborted) - ret = btrfs_discard_extent(fs_info, - block_group->key.objectid, - block_group->key.offset, - &trimmed); + ret = run_and_cleanup_extent_op(trans, head); + if (ret < 0) { + unselect_delayed_ref_head(delayed_refs, head); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + return ret; + } else if (ret) { + return ret; + } - list_del_init(&block_group->bg_list); - btrfs_put_block_group_trimming(block_group); - btrfs_put_block_group(block_group); + /* + * Need to drop our head ref lock and re-acquire the delayed ref lock + * and then re-check to make sure nobody got added. + */ + spin_unlock(&head->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + return 1; + } + btrfs_delete_ref_head(delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); - if (ret) { - const char *errstr = btrfs_decode_error(ret); - btrfs_warn(fs_info, - "discard failed while removing blockgroup: errno=%d %s", - ret, errstr); + if (head->must_insert_reserved) { + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, fs_info, head->bytenr, + head->num_bytes); } } + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + + trace_run_delayed_ref_head(fs_info, head, 0); + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); return 0; } -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) +static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( + struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_root *extent_root = info->extent_root; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_delayed_ref_head *head = NULL; int ret; - int is_data; - int extent_slot = 0; - int found_extent = 0; - int num_to_del = 1; - u32 item_size; - u64 refs; - u64 bytenr = node->bytenr; - u64 num_bytes = node->num_bytes; - int last_ref = 0; - bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = READA_FORWARD; - path->leave_spinning = 1; - - is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); - if (is_data) - skinny_metadata = false; + spin_lock(&delayed_refs->lock); + head = btrfs_select_ref_head(delayed_refs); + if (!head) { + spin_unlock(&delayed_refs->lock); + return head; + } - ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, - parent, root_objectid, owner_objectid, - owner_offset); - if (ret == 0) { - extent_slot = path->slots[0]; - while (extent_slot >= 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - extent_slot); - if (key.objectid != bytenr) - break; - if (key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == num_bytes) { - found_extent = 1; - 
break; - } - if (key.type == BTRFS_METADATA_ITEM_KEY && - key.offset == owner_objectid) { - found_extent = 1; - break; - } - if (path->slots[0] - extent_slot > 5) - break; - extent_slot--; - } + /* + * Grab the lock that says we are going to process all the refs for + * this head + */ + ret = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); - if (!found_extent) { - BUG_ON(iref); - ret = remove_extent_backref(trans, path, NULL, - refs_to_drop, - is_data, &last_ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - btrfs_release_path(path); - path->leave_spinning = 1; + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (ret == -EAGAIN) + head = ERR_PTR(-EAGAIN); - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; + return head; +} - if (!is_data && skinny_metadata) { - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = owner_objectid; - } +static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *locked_ref, + unsigned long *run_refs) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_delayed_ref_node *ref; + int must_insert_reserved = 0; + int ret; - ret = btrfs_search_slot(trans, extent_root, - &key, path, -1, 1); - if (ret > 0 && skinny_metadata && path->slots[0]) { - /* - * Couldn't find our skinny metadata item, - * see if we have ye olde extent item. - */ - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, - path->slots[0]); - if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == num_bytes) - ret = 0; - } + delayed_refs = &trans->transaction->delayed_refs; - if (ret > 0 && skinny_metadata) { - skinny_metadata = false; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - btrfs_release_path(path); - ret = btrfs_search_slot(trans, extent_root, - &key, path, -1, 1); - } + lockdep_assert_held(&locked_ref->mutex); + lockdep_assert_held(&locked_ref->lock); - if (ret) { - btrfs_err(info, - "umm, got %d back from search, was looking for %llu", - ret, bytenr); - if (ret > 0) - btrfs_print_leaf(path->nodes[0]); - } - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - extent_slot = path->slots[0]; + while ((ref = select_delayed_ref(locked_ref))) { + if (ref->seq && + btrfs_check_delayed_seq(fs_info, ref->seq)) { + spin_unlock(&locked_ref->lock); + unselect_delayed_ref_head(delayed_refs, locked_ref); + return -EAGAIN; } - } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(path->nodes[0]); - btrfs_err(info, - "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - bytenr, parent, root_objectid, owner_objectid, - owner_offset); - btrfs_abort_transaction(trans, ret); - goto out; - } else { - btrfs_abort_transaction(trans, ret); - goto out; - } - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); - if (unlikely(item_size < sizeof(*ei))) { - ret = -EINVAL; - btrfs_print_v0_err(info); - btrfs_abort_transaction(trans, ret); - goto out; - } - ei = btrfs_item_ptr(leaf, extent_slot, - struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && - key.type == BTRFS_EXTENT_ITEM_KEY) { - struct btrfs_tree_block_info *bi; - 
BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); - bi = (struct btrfs_tree_block_info *)(ei + 1); - WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); - } - refs = btrfs_extent_refs(leaf, ei); - if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", - refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; - } - refs -= refs_to_drop; + (*run_refs)++; + ref->in_tree = 0; + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + /* + * When we play the delayed ref, also correct the ref_mod on + * head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); + } + atomic_dec(&delayed_refs->num_entries); - if (refs > 0) { - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); /* - * In the case of inline back ref, reference count will - * be updated by remove_extent_backref + * Record the must_insert_reserved flag before we drop the + * spin lock. */ - if (iref) { - BUG_ON(!found_extent); - } else { - btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); - } - if (found_extent) { - ret = remove_extent_backref(trans, path, iref, - refs_to_drop, is_data, - &last_ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - } else { - if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); - if (iref) { - BUG_ON(path->slots[0] != extent_slot); - } else { - BUG_ON(path->slots[0] != extent_slot + 1); - path->slots[0] = extent_slot; - num_to_del = 2; - } - } + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; - last_ref = 1; - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], - num_to_del); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - btrfs_release_path(path); + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + spin_unlock(&locked_ref->lock); - if (is_data) { - ret = btrfs_del_csums(trans, info, bytenr, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } + ret = run_one_delayed_ref(trans, ref, extent_op, + must_insert_reserved); - ret = add_to_free_space_tree(trans, bytenr, num_bytes); + btrfs_free_delayed_extent_op(extent_op); if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; + unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_put_delayed_ref(ref); + btrfs_debug(fs_info, "run_one_delayed_ref returned %d", + ret); + return ret; } - ret = update_block_group(trans, bytenr, num_bytes, 0); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } + btrfs_put_delayed_ref(ref); + cond_resched(); + + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); } - btrfs_release_path(path); -out: - btrfs_free_path(path); - return ret; + return 0; } /* - * when we free an block, it is possible (and likely) that we free the last - * delayed ref for that extent as well. This searches the delayed ref tree for - * a given extent, and if there are no other delayed refs to be processed, it - * removes it from the tree. + * Returns 0 on success or if called with an already aborted transaction. 
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ -static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, - u64 bytenr) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + unsigned long nr) { - struct btrfs_delayed_ref_head *head; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; - int ret = 0; + struct btrfs_delayed_ref_head *locked_ref = NULL; + ktime_t start = ktime_get(); + int ret; + unsigned long count = 0; + unsigned long actual_count = 0; delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); - if (!head) - goto out_delayed_unlock; + do { + if (!locked_ref) { + locked_ref = btrfs_obtain_ref_head(trans); + if (IS_ERR_OR_NULL(locked_ref)) { + if (PTR_ERR(locked_ref) == -EAGAIN) { + continue; + } else { + break; + } + } + count++; + } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); - spin_lock(&head->lock); - if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - goto out; + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, + &actual_count); + if (ret < 0 && ret != -EAGAIN) { + /* + * Error, btrfs_run_delayed_refs_for_head already + * unlocked everything so just bail out + */ + return ret; + } else if (!ret) { + /* + * Success, perform the usual cleanup of a processed + * head + */ + ret = cleanup_ref_head(trans, locked_ref); + if (ret > 0 ) { + /* We dropped our lock, we need to loop. */ + ret = 0; + continue; + } else if (ret) { + return ret; + } + } - if (cleanup_extent_op(head) != NULL) - goto out; + /* + * Either success case or btrfs_run_delayed_refs_for_head + * returned -EAGAIN, meaning we need to select another head + */ + + locked_ref = NULL; + cond_resched(); + } while ((nr != -1 && count < nr) || locked_ref); /* - * waiting for the lock here would deadlock. If someone else has it - * locked they are already in the process of dropping it anyway + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. */ - if (!mutex_trylock(&head->mutex)) - goto out; - - btrfs_delete_ref_head(delayed_refs, head); - head->processing = 0; - - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - - BUG_ON(head->extent_op); - if (head->must_insert_reserved) - ret = 1; - - btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref_head(head); - return ret; -out: - spin_unlock(&head->lock); + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; -out_delayed_unlock: - spin_unlock(&delayed_refs->lock); + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. 
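+		 * In other words avg = (3 * old_avg + runtime) / 4, an
+		 * exponential moving average in which a new sample has weight
+		 * 1/4: e.g. an old average of 8 and a runtime of 4 give 7.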
+ */ + spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ + spin_unlock(&delayed_refs->lock); + } return 0; } -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - u64 parent, int last_ref) +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ref generic_ref = { 0 }; - int pin = 1; - int ret; - - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root->root_key.objectid); - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - int old_ref_mod, new_ref_mod; + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; - btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, - &old_ref_mod, &new_ref_mod); - BUG_ON(ret); /* -ENOMEM */ - pin = old_ref_mod >= 0 && new_ref_mod < 0; + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; - if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group_cache *cache; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, buf->start); - if (!ret) - goto out; - } - - pin = 0; - cache = btrfs_lookup_block_group(fs_info, buf->start); - - if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); - goto out; - } + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); - WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + middle = entry->bytenr; - btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_free_reserved_bytes(cache, buf->len, 0); - btrfs_put_block_group(cache); - trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); - } -out: - if (pin) - add_pinned_bytes(fs_info, &generic_ref); + if (alt) + n = n->rb_left; + else + n = n->rb_right; - if (last_ref) { - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + alt = 1 - alt; } + return middle; } +#endif -/* Can return -ENOMEM */ -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) +static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) { - struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; - int ret; + u64 num_bytes; - if (btrfs_is_testing(fs_info)) - return 0; + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); /* - * tree log blocks never actually go into the extent allocation - * tree, just update pinning info and exit early. 
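The average update above is a shift-based weighted mean, three parts old average to one part new sample, so a single slow batch only moves the estimate a quarter of the way. A minimal illustration in plain C with made-up numbers:

#include <stdio.h>
#include <stdint.h>

static uint64_t update_avg(uint64_t avg, uint64_t runtime)
{
	/* new = (3 * old + sample) / 4, done with a shift as above */
	return (avg * 3 + runtime) >> 2;
}

int main(void)
{
	uint64_t avg = 1000000;			/* 1 ms in ns */

	avg = update_avg(avg, 5000000);		/* one 5 ms outlier */
	printf("%llu\n", (unsigned long long)avg);	/* 2000000, not 5000000 */
	return 0;
}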
+ * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to use. */ - if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { - /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); - old_ref_mod = new_ref_mod = 0; - ret = 0; - } else if (ref->type == BTRFS_REF_METADATA) { - ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, - &old_ref_mod, &new_ref_mod); - } else { - ret = btrfs_add_delayed_data_ref(trans, ref, 0, - &old_ref_mod, &new_ref_mod); - } - - if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) - btrfs_ref_tree_mod(fs_info, ref); + return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); +} - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref); +/* + * Takes the number of bytes to be checksummed and figures out how many leaves it + * would require to store the csums for that many bytes. + */ +u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + u64 csum_size; + u64 num_csums_per_leaf; + u64 num_csums; - return ret; + csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); + num_csums_per_leaf = div64_u64(csum_size, + (u64)btrfs_super_csum_size(fs_info->super_copy)); + num_csums = div64_u64(csum_bytes, fs_info->sectorsize); + num_csums += num_csums_per_leaf - 1; + num_csums = div64_u64(num_csums, num_csums_per_leaf); + return num_csums; } /* - * when we wait for progress in the block group caching, its because - * our allocation attempt failed at least once. So, we must sleep - * and let some progress happen before we try again. - * - * This function will sleep at least once waiting for new free space to - * show up, and then it will check the block group free space numbers - * for our min num_bytes. Another option is to have it go ahead - * and look in the rbtree for a free extent of a given size, but this - * is a good start. - * - * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using - * any of the information in this block group.
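To get a feel for the arithmetic in btrfs_csum_bytes_to_leaves(), here is a userspace sketch; the geometry constants (4 KiB sectors, 4-byte crc32c csums, an assumed 16000 usable bytes per leaf item) are illustrative, not values read from a real filesystem:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumed geometry, for illustration only */
	uint64_t sectorsize = 4096;		/* bytes covered per csum */
	uint64_t csum_size = 4;			/* e.g. crc32c */
	uint64_t max_item_size = 16000;		/* usable bytes in a leaf item */
	uint64_t csum_bytes = 1ULL << 30;	/* checksum 1 GiB of data */

	uint64_t csums_per_leaf = max_item_size / csum_size;	/* 4000 */
	uint64_t num_csums = csum_bytes / sectorsize;		/* 262144 */
	/* Round up: a partially filled last leaf still costs a leaf */
	uint64_t leaves = (num_csums + csums_per_leaf - 1) / csums_per_leaf;

	printf("%llu leaves\n", (unsigned long long)leaves);	/* 66 */
	return 0;
}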
+ * Returns 0 on success or if called with an aborted transaction + * Returns <0 on error and aborts the transaction */ -static noinline void -wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, - u64 num_bytes) +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + unsigned long count) { - struct btrfs_caching_control *caching_ctl; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; + int ret; + int run_all = count == (unsigned long)-1; - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return; + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) + return 0; - wait_event(caching_ctl->wait, block_group_cache_done(cache) || - (cache->free_space_ctl->free_space >= num_bytes)); + if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) + return 0; - put_caching_control(caching_ctl); -} + delayed_refs = &trans->transaction->delayed_refs; + if (count == 0) + count = atomic_read(&delayed_refs->num_entries) * 2; -static noinline int -wait_block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - struct btrfs_caching_control *caching_ctl; - int ret = 0; +again: +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + ret = __btrfs_run_delayed_refs(trans, count); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; + if (run_all) { + btrfs_create_pending_block_groups(trans); - wait_event(caching_ctl->wait, block_group_cache_done(cache)); - if (cache->cached == BTRFS_CACHE_ERROR) - ret = -EIO; - put_caching_control(caching_ctl); - return ret; -} + spin_lock(&delayed_refs->lock); + node = rb_first_cached(&delayed_refs->href_root); + if (!node) { + spin_unlock(&delayed_refs->lock); + goto out; + } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); -enum btrfs_loop_type { - LOOP_CACHING_NOWAIT, - LOOP_CACHING_WAIT, - LOOP_ALLOC_CHUNK, - LOOP_NO_EMPTY_SIZE, -}; + /* Mutex was contended, block until it's released and retry. */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); -static inline void -btrfs_lock_block_group(struct btrfs_block_group_cache *cache, - int delalloc) -{ - if (delalloc) - down_read(&cache->data_rwsem); + btrfs_put_delayed_ref_head(head); + cond_resched(); + goto again; + } +out: + return 0; } -static inline void -btrfs_grab_block_group(struct btrfs_block_group_cache *cache, - int delalloc) +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 flags, + int level, int is_data) { - btrfs_get_block_group(cache); - if (delalloc) - down_read(&cache->data_rwsem); + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->is_data = is_data ? 
true : false; + extent_op->level = level; + + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + if (ret) + btrfs_free_delayed_extent_op(extent_op); + return ret; } -static struct btrfs_block_group_cache * -btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - int delalloc) +static noinline int check_delayed_ref(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) { - struct btrfs_block_group_cache *used_bg = NULL; - - spin_lock(&cluster->refill_lock); - while (1) { - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_transaction *cur_trans; + struct rb_node *node; + int ret = 0; - if (used_bg == block_group) - return used_bg; + spin_lock(&root->fs_info->trans_lock); + cur_trans = root->fs_info->running_transaction; + if (cur_trans) + refcount_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + if (!cur_trans) + return 0; - btrfs_get_block_group(used_bg); + delayed_refs = &cur_trans->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (!head) { + spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); + return 0; + } - if (!delalloc) - return used_bg; + if (!mutex_trylock(&head->mutex)) { + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + btrfs_release_path(path); - spin_unlock(&cluster->refill_lock); + /* + * Mutex was contended, block until it's released and let + * caller try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + btrfs_put_transaction(cur_trans); + return -EAGAIN; + } + spin_unlock(&delayed_refs->lock); - /* We should only have one-level nested. */ - down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); + spin_lock(&head->lock); + /* + * XXX: We should replace this with a proper search function in the + * future. + */ + for (node = rb_first_cached(&head->ref_tree); node; + node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - spin_lock(&cluster->refill_lock); - if (used_bg == cluster->block_group) - return used_bg; + data_ref = btrfs_delayed_node_to_data_ref(ref); - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + btrfs_put_transaction(cur_trans); + return ret; } -static inline void -btrfs_release_block_group(struct btrfs_block_group_cache *cache, - int delalloc) +static noinline int check_committed_ref(struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) { - if (delalloc) - up_read(&cache->data_rwsem); - btrfs_put_block_group(cache); -} - -/* - * Structure used internally for find_free_extent() function. Wraps needed - * parameters. 
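check_delayed_ref() above calls an extent shared as soon as one queued data ref disagrees with the (root, objectid, offset) triple being written. A compact sketch of that predicate, with a simplified stand-in type:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct data_ref { uint64_t root, objectid, offset; };

/* Any mismatch in the triple means some other tree/file references it. */
static bool is_cross_ref(const struct data_ref *ref, uint64_t root,
			 uint64_t objectid, uint64_t offset)
{
	return ref->root != root || ref->objectid != objectid ||
	       ref->offset != offset;
}

int main(void)
{
	struct data_ref mine = { .root = 5, .objectid = 257, .offset = 0 };

	assert(!is_cross_ref(&mine, 5, 257, 0));	/* same owner: not shared */
	assert(is_cross_ref(&mine, 7, 257, 0));		/* other root: shared */
	return 0;
}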
- */ -struct find_free_extent_ctl { - /* Basic allocation info */ - u64 ram_bytes; - u64 num_bytes; - u64 empty_size; - u64 flags; - int delalloc; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u32 item_size; + int type; + int ret; - /* Where to start the search inside the bg */ - u64 search_start; + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; - /* For clustered allocation */ - u64 empty_cluster; + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); /* Corruption */ - bool have_caching_bg; - bool orig_have_caching_bg; + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; - /* RAID index, converted from flags */ - int index; + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - /* - * Current loop number, check find_free_extent_update_loop() for details - */ - int loop; + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; - /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. - */ - bool retry_clustered; + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; + /* If extent item has more than 1 inline ref then it's shared */ + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; - /* If current block group is cached */ - int cached; + /* If extent created before last snapshot => it's definitely shared */ + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; - /* Max contiguous hole found */ - u64 max_extent_size; + iref = (struct btrfs_extent_inline_ref *)(ei + 1); - /* Total free space from free space cache, not always contiguous */ - u64 total_free_space; + /* If this extent has SHARED_DATA_REF then it's shared */ + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (type != BTRFS_EXTENT_DATA_REF_KEY) + goto out; - /* Found result */ - u64 found_offset; -}; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + ret = 0; +out: + return ret; +} -/* - * Helper function for find_free_extent(). - * - * Return -ENOENT to inform caller that we need fallback to unclustered mode. - * Return -EAGAIN to inform caller that we need to re-search this block group - * Return >0 to inform caller that we find nothing - * Return 0 means we have found a location and set ffe_ctl->found_offset. 
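The item-size test in check_committed_ref() above leans on a fully exclusive extent carrying exactly one inline data ref. A sketch of that size check, with made-up layouts standing in for the on-disk items (the real sizes come from the btrfs disk format, not these structs):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Made-up layouts standing in for the on-disk btrfs items */
struct extent_item { uint64_t refs, generation, flags; };
struct extent_data_ref { uint64_t root, objectid, offset; uint32_t count; };

/* One inline ref = one type byte plus the data-ref payload (in this model) */
static bool maybe_exclusive(size_t item_size)
{
	return item_size == sizeof(struct extent_item) + 1 +
			    sizeof(struct extent_data_ref);
}

int main(void)
{
	size_t single = sizeof(struct extent_item) + 1 +
			sizeof(struct extent_data_ref);

	assert(maybe_exclusive(single));
	assert(!maybe_exclusive(single + 13));	/* extra refs => shared */
	return 0;
}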
- */ -static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group_cache **cluster_bg_ret) +int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, + u64 bytenr) { - struct btrfs_block_group_cache *cluster_bg; - u64 aligned_cluster; - u64 offset; + struct btrfs_path *path; int ret; - cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); - if (!cluster_bg) - goto refill_cluster; - if (cluster_bg != bg && (cluster_bg->ro || - !block_group_bits(cluster_bg, ffe_ctl->flags))) - goto release_cluster; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; - offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, - ffe_ctl->num_bytes, cluster_bg->key.objectid, - &ffe_ctl->max_extent_size); - if (offset) { - /* We have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(cluster_bg, - ffe_ctl->search_start, ffe_ctl->num_bytes); - *cluster_bg_ret = cluster_bg; - ffe_ctl->found_offset = offset; - return 0; - } - WARN_ON(last_ptr->block_group != cluster_bg); + do { + ret = check_committed_ref(root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) + goto out; -release_cluster: - /* - * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so - * lets just skip it and let the allocator find whatever block it can - * find. If we reach this point, we will have tried the cluster - * allocator plenty of times and not have found anything, so we are - * likely way too fragmented for the clustering stuff to find anything. - * - * However, if the cluster is taken from the current block group, - * release the cluster first, so that we stand a better chance of - * succeeding in the unclustered allocation. 
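The return-code contract spelled out just above (0, -EAGAIN, -ENOENT, >0) is what the caller dispatches on. A toy caller in plain C; try_clustered() is a stand-in for find_free_extent_clustered(), not kernel API:

#include <errno.h>
#include <stdio.h>

/* Pretend helper: first pass asks for a re-search, second succeeds */
static int try_clustered(int attempt)
{
	return attempt == 0 ? -EAGAIN : 0;
}

int main(void)
{
	int attempt = 0;

	for (;;) {
		int ret = try_clustered(attempt++);

		if (ret == 0) {			/* found_offset is set */
			puts("allocated from cluster");
			break;
		}
		if (ret == -EAGAIN)		/* re-search this block group */
			continue;
		if (ret == -ENOENT) {		/* fall back to unclustered */
			puts("fall back to unclustered");
			break;
		}
		puts("nothing here, next block group");	/* ret > 0 */
		break;
	}
	return 0;
}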
- */ - if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { - spin_unlock(&last_ptr->refill_lock); - btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); - return -ENOENT; - } + ret = check_delayed_ref(root, path, objectid, offset, bytenr); + } while (ret == -EAGAIN); - /* This cluster didn't work out, free it and start over */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); +out: + btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); + return ret; +} - if (cluster_bg != bg) - btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + int full_backref, int inc) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 ref_root; + u32 nritems; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct btrfs_ref generic_ref = { 0 }; + bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); + int i; + int action; + int level; + int ret = 0; -refill_cluster: - if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - return -ENOENT; - } + if (btrfs_is_testing(fs_info)) + return 0; - aligned_cluster = max_t(u64, - ffe_ctl->empty_cluster + ffe_ctl->empty_size, - bg->full_stripe_len); - ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, - ffe_ctl->num_bytes, aligned_cluster); - if (ret == 0) { - /* Now pull our allocation out of this cluster */ - offset = btrfs_alloc_from_cluster(bg, last_ptr, - ffe_ctl->num_bytes, ffe_ctl->search_start, - &ffe_ctl->max_extent_size); - if (offset) { - /* We found one, proceed */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(bg, - ffe_ctl->search_start, - ffe_ctl->num_bytes); - ffe_ctl->found_offset = offset; - return 0; - } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && - !ffe_ctl->retry_clustered) { - spin_unlock(&last_ptr->refill_lock); + ref_root = btrfs_header_owner(buf); + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); - ffe_ctl->retry_clustered = true; - wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_cluster + ffe_ctl->empty_size); - return -EAGAIN; - } - /* - * At this point we either didn't find a cluster or we weren't able to - * allocate a block from our cluster. Free the cluster we've been - * trying to use, and go to the next block group. - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - spin_unlock(&last_ptr->refill_lock); - return 1; -} + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) + return 0; -/* - * Return >0 to inform caller that we find nothing - * Return 0 when we found an free extent and set ffe_ctrl->found_offset - * Return -EAGAIN to inform caller that we need to re-search this block group - */ -static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl) -{ - u64 offset; + if (full_backref) + parent = buf->start; + else + parent = 0; + if (inc) + action = BTRFS_ADD_DELAYED_REF; + else + action = BTRFS_DROP_DELAYED_REF; - /* - * We are doing an unclustered allocation, set the fragmented flag so - * we don't bother trying to setup a cluster again until we get more - * space. 
- */ - if (unlikely(last_ptr)) { - spin_lock(&last_ptr->lock); - last_ptr->fragmented = 1; - spin_unlock(&last_ptr->lock); - } - if (ffe_ctl->cached) { - struct btrfs_free_space_ctl *free_space_ctl; + for (i = 0; i < nritems; i++) { + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; - free_space_ctl = bg->free_space_ctl; - spin_lock(&free_space_ctl->tree_lock); - if (free_space_ctl->free_space < - ffe_ctl->num_bytes + ffe_ctl->empty_cluster + - ffe_ctl->empty_size) { - ffe_ctl->total_free_space = max_t(u64, - ffe_ctl->total_free_space, - free_space_ctl->free_space); - spin_unlock(&free_space_ctl->tree_lock); - return 1; + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, + key.offset); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = fs_info->nodesize; + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); + if (ret) + goto fail; } - spin_unlock(&free_space_ctl->tree_lock); } + return 0; +fail: + return ret; +} - offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, - ffe_ctl->num_bytes, ffe_ctl->empty_size, - &ffe_ctl->max_extent_size); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} - /* - * If we didn't find a chunk, and we haven't failed on this block group - * before, and this block group is in the middle of caching and we are - * ok with waiting, then go ahead and wait for progress to be made, and - * set @retry_unclustered to true. - * - * If @retry_unclustered is true then we've already waited on this - * block group once and should move on to the next block group. - */ - if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && - ffe_ctl->loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_size); - ffe_ctl->retry_unclustered = true; - return -EAGAIN; - } else if (!offset) { - return 1; - } - ffe_ctl->found_offset = offset; - return 0; +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } -/* - * Return >0 means caller needs to re-search for free extent - * Return 0 means we have the needed free extent. - * Return <0 means we failed to locate any free extent. 
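__btrfs_mod_ref() above only queues ref updates for leaf items that reference a real disk extent; non-data keys, inline extents and holes contribute nothing. A standalone sketch of that filter over a simplified item model:

#include <stdio.h>
#include <stdint.h>

enum { KEY_OTHER, KEY_EXTENT_DATA };

struct item {
	int key_type;
	int inline_extent;	/* inline data has no separate disk extent */
	uint64_t disk_bytenr;	/* 0 means a hole */
};

int main(void)
{
	struct item leaf[] = {
		{ KEY_OTHER, 0, 0 },
		{ KEY_EXTENT_DATA, 1, 0 },		/* inline: skip */
		{ KEY_EXTENT_DATA, 0, 0 },		/* hole: skip */
		{ KEY_EXTENT_DATA, 0, 12582912 },	/* real extent */
	};
	int updates = 0;

	for (unsigned int i = 0; i < sizeof(leaf) / sizeof(leaf[0]); i++) {
		if (leaf[i].key_type != KEY_EXTENT_DATA)
			continue;
		if (leaf[i].inline_extent || leaf[i].disk_bytenr == 0)
			continue;
		updates++;	/* would queue one add/drop delayed ref */
	}
	printf("%d ref update(s)\n", updates);	/* 1 */
	return 0;
}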
- */ -static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - struct btrfs_free_cluster *last_ptr, - struct btrfs_key *ins, - struct find_free_extent_ctl *ffe_ctl, - int full_search, bool use_cluster) +int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_root *root = fs_info->extent_root; - int ret; + struct btrfs_block_group_cache *block_group; + int readonly = 0; - if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && - ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) - ffe_ctl->orig_have_caching_bg = true; + block_group = btrfs_lookup_block_group(fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} - if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && - ffe_ctl->have_caching_bg) - return 1; +static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 flags; + u64 ret; - if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) - return 1; + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; - if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } - return 0; - } + ret = btrfs_get_alloc_profile(fs_info, flags); + return ret; +} - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { - ffe_ctl->index = 0; - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (ffe_ctl->orig_have_caching_bg || !full_search) - ffe_ctl->loop = LOOP_CACHING_WAIT; - else - ffe_ctl->loop = LOOP_ALLOC_CHUNK; - } else { - ffe_ctl->loop++; - } +static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; - if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; + spin_lock(&fs_info->block_group_cache_lock); + bytenr = fs_info->first_logical_byte; + spin_unlock(&fs_info->block_group_cache_lock); - trans = current->journal_info; - if (trans) - exist = 1; - else - trans = btrfs_join_transaction(root); + if (bytenr < (u64)-1) + return bytenr; - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - return ret; - } + cache = btrfs_lookup_first_block_group(fs_info, search_start); + if (!cache) + return 0; - ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + bytenr = cache->key.objectid; + btrfs_put_block_group(cache); - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + return bytenr; +} - /* Do not bail out on ENOSPC since we can do more. 
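The escalation between the loop stages described above can be sketched as a small state machine; the skip_wait flag stands in for "no uncached block groups were seen and a full search already happened", an assumption of this sketch:

#include <stdio.h>

enum loop_stage {
	CACHING_NOWAIT,	/* only poke already-cached groups */
	CACHING_WAIT,	/* wait for caching to make progress */
	ALLOC_CHUNK,	/* force-allocate a new chunk */
	NO_EMPTY_SIZE,	/* drop alignment slack, last resort */
};

static enum loop_stage next_stage(enum loop_stage cur, int skip_wait)
{
	if (cur == CACHING_NOWAIT)
		return skip_wait ? ALLOC_CHUNK : CACHING_WAIT;
	return cur + 1;
}

int main(void)
{
	enum loop_stage s = CACHING_NOWAIT;

	while (s != NO_EMPTY_SIZE) {
		s = next_stage(s, 1);
		printf("escalate to stage %d\n", s);
	}
	return 0;
}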
*/ - if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - else - ret = 0; - if (!exist) - btrfs_end_transaction(trans); - if (ret) - return ret; - } +static int pin_down_extent(struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; - if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { - /* - * Don't loop again if we already have no empty_size and - * no empty_cluster. - */ - if (ffe_ctl->empty_size == 0 && - ffe_ctl->empty_cluster == 0) - return -ENOSPC; - ffe_ctl->empty_size = 0; - ffe_ctl->empty_cluster = 0; - } - return 1; + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, + num_bytes); + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; } - return -ENOSPC; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, + num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + set_extent_dirty(fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + return 0; } /* - * walks the btree of allocated extents and find a hole of a given size. - * The key ins is changed to record the hole: - * ins->objectid == start position - * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == the size of the hole. - * Any available blocks before search_start are skipped. - * - * If there is no suitable free space, we will record the max size of - * the free space extent currently. - * - * The overall logic and call chain: - * - * find_free_extent() - * |- Iterate through all block groups - * | |- Get a valid block group - * | |- Try to do clustered allocation in that block group - * | |- Try to do unclustered allocation in that block group - * | |- Check if the result is valid - * | | |- If valid, then exit - * | |- Jump to next block group - * | - * |- Push harder to find free extents - * |- If not found, re-iterate all block groups + * this function must be called within transaction */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, - u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 flags, int delalloc) +int btrfs_pin_extent(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, int reserved) { - int ret = 0; - struct btrfs_free_cluster *last_ptr = NULL; - struct btrfs_block_group_cache *block_group = NULL; - struct find_free_extent_ctl ffe_ctl = {0}; - struct btrfs_space_info *space_info; - bool use_cluster = true; - bool full_search = false; + struct btrfs_block_group_cache *cache; - WARN_ON(num_bytes < fs_info->sectorsize); + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); /* Logic error */ - ffe_ctl.ram_bytes = ram_bytes; - ffe_ctl.num_bytes = num_bytes; - ffe_ctl.empty_size = empty_size; - ffe_ctl.flags = flags; - ffe_ctl.search_start = 0; - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - ffe_ctl.delalloc = delalloc; - ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); - ffe_ctl.have_caching_bg = false; - ffe_ctl.orig_have_caching_bg = false; - ffe_ctl.found_offset = 0; + pin_down_extent(cache, bytenr, num_bytes, reserved); - ins->type = BTRFS_EXTENT_ITEM_KEY; - ins->objectid = 0; - ins->offset = 0; + btrfs_put_block_group(cache); + return 0; +} - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); +/* + * this function must be called 
within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + int ret; - space_info = btrfs_find_space_info(fs_info, flags); - if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", flags); - return -ENOSPC; - } + cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!cache) + return -EINVAL; /* - * If our free space is heavily fragmented we may not be able to make - * big contiguous allocations, so instead of doing the expensive search - * for free space, simply return ENOSPC with our max_extent_size so we - * can go ahead and search for a more manageable chunk. - * - * If our max_extent_size is large enough for our allocation simply - * disable clustering since we will likely not be able to find enough - * space to create a cluster and induce latency trying. + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. */ - if (unlikely(space_info->max_extent_size)) { - spin_lock(&space_info->lock); - if (space_info->max_extent_size && - num_bytes > space_info->max_extent_size) { - ins->offset = space_info->max_extent_size; - spin_unlock(&space_info->lock); - return -ENOSPC; - } else if (space_info->max_extent_size) { - use_cluster = false; - } - spin_unlock(&space_info->lock); - } + btrfs_cache_block_group(cache, 1); - last_ptr = fetch_cluster_info(fs_info, space_info, - &ffe_ctl.empty_cluster); - if (last_ptr) { - spin_lock(&last_ptr->lock); - if (last_ptr->block_group) - hint_byte = last_ptr->window_start; - if (last_ptr->fragmented) { - /* - * We still set window_start so we can keep track of the - * last place we found an allocation to try and save - * some time. - */ - hint_byte = last_ptr->window_start; - use_cluster = false; - } - spin_unlock(&last_ptr->lock); - } + pin_down_extent(cache, bytenr, num_bytes, 0); - ffe_ctl.search_start = max(ffe_ctl.search_start, - first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); - if (ffe_ctl.search_start == hint_byte) { - block_group = btrfs_lookup_block_group(fs_info, - ffe_ctl.search_start); - /* - * we don't want to use the block group if it doesn't match our - * allocation bits, or if its not cached. - * - * However if we are re-searching with an ideal block group - * picked out then we don't care that the block group is cached. - */ - if (block_group && block_group_bits(block_group, flags) && - block_group->cached != BTRFS_CACHE_NO) { - down_read(&space_info->groups_sem); - if (list_empty(&block_group->list) || - block_group->ro) { - /* - * someone is removing this block group, - * we can't jump into the have_block_group - * target because our list pointers are not - * valid - */ - btrfs_put_block_group(block_group); - up_read(&space_info->groups_sem); - } else { - ffe_ctl.index = btrfs_bg_flags_to_raid_index( - block_group->flags); - btrfs_lock_block_group(block_group, delalloc); - goto have_block_group; - } - } else if (block_group) { - btrfs_put_block_group(block_group); - } - } -search: - ffe_ctl.have_caching_bg = false; - if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || - ffe_ctl.index == 0) - full_search = true; - down_read(&space_info->groups_sem); - list_for_each_entry(block_group, - &space_info->block_groups[ffe_ctl.index], list) { - /* If the block group is read-only, we can skip it entirely. 
*/ - if (unlikely(block_group->ro)) - continue; + /* remove us from the free space cache (if we're there at all) */ + ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return ret; +} - btrfs_grab_block_group(block_group, delalloc); - ffe_ctl.search_start = block_group->key.objectid; +static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; - /* - * this can happen if we end up cycling through all the - * raid types, but we want to make sure we only allocate - * for the proper type. - */ - if (!block_group_bits(block_group, flags)) { - u64 extra = BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID56_MASK | - BTRFS_BLOCK_GROUP_RAID10; + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) + return -EINVAL; - /* - * if they asked for extra copies and this block group - * doesn't provide them, bail. This does allow us to - * fill raid0 from raid1. - */ - if ((flags & extra) && !(block_group->flags & extra)) - goto loop; - } + btrfs_cache_block_group(block_group, 0); + caching_ctl = btrfs_get_caching_control(block_group); -have_block_group: - ffe_ctl.cached = block_group_cache_done(block_group); - if (unlikely(!ffe_ctl.cached)) { - ffe_ctl.have_caching_bg = true; - ret = cache_block_group(block_group, 0); - BUG_ON(ret < 0); - ret = 0; + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!btrfs_block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = btrfs_add_excluded_extent(fs_info, start, + num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = btrfs_add_excluded_extent(fs_info, start, + num_bytes); } +out_lock: + mutex_unlock(&caching_ctl->mutex); + btrfs_put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} - if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) - goto loop; +int btrfs_exclude_logged_extents(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + int ret = 0; - /* - * Ok we want to try and use the cluster allocator, so - * lets look there - */ - if (last_ptr && use_cluster) { - struct btrfs_block_group_cache *cluster_bg = NULL; + if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) + return 0; - ret = find_free_extent_clustered(block_group, last_ptr, - &ffe_ctl, &cluster_bg); + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); + if 
(ret) + break; + } - if (ret == 0) { - if (cluster_bg && cluster_bg != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = cluster_bg; - } - goto checks; - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { - goto loop; - } - /* ret == -ENOENT case falls through */ - } + return ret; +} - ret = find_free_extent_unclustered(block_group, last_ptr, - &ffe_ctl); - if (ret == -EAGAIN) - goto have_block_group; - else if (ret > 0) - goto loop; - /* ret == 0 case falls through */ -checks: - ffe_ctl.search_start = round_up(ffe_ctl.found_offset, - fs_info->stripesize); +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} - /* move on to the next group */ - if (ffe_ctl.search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); - goto loop; - } +void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) +{ + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; - if (ffe_ctl.found_offset < ffe_ctl.search_start) - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + down_write(&fs_info->commit_root_sem); - ret = btrfs_add_reserved_bytes(block_group, ram_bytes, - num_bytes, delalloc); - if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); - goto loop; + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (btrfs_block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + btrfs_put_caching_control(caching_ctl); + } else { + cache->last_byte_to_unpin = caching_ctl->progress; } - btrfs_inc_block_group_reservations(block_group); + } - /* we are all good, lets return */ - ins->objectid = ffe_ctl.search_start; - ins->offset = num_bytes; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; - trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, - num_bytes); - btrfs_release_block_group(block_group, delalloc); - break; -loop: - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - ffe_ctl.index); - btrfs_release_block_group(block_group, delalloc); - cond_resched(); - } - up_read(&space_info->groups_sem); + up_write(&fs_info->commit_root_sem); - ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, - full_search, use_cluster); - if (ret > 0) - goto search; + btrfs_update_global_block_rsv(fs_info); +} - if (ret == -ENOSPC) { - /* - * Use ffe_ctl->total_free_space as fallback if we can't find - * any contiguous hole. - */ - if (!ffe_ctl.max_extent_size) - ffe_ctl.max_extent_size = ffe_ctl.total_free_space; - spin_lock(&space_info->lock); - space_info->max_extent_size = ffe_ctl.max_extent_size; - spin_unlock(&space_info->lock); - ins->offset = ffe_ctl.max_extent_size; +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. 
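The caching-progress split in __exclude_logged_extent() a little above divides a logged extent at the caching cursor: the already-cached part is removed from the free space cache, the rest is marked excluded. Worked numbers in a standalone sketch:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Illustrative numbers: a logged extent straddling 'progress' */
	uint64_t start = 1 << 20, num_bytes = 1 << 20;	/* [1M, 2M) */
	uint64_t progress = 1536 * 1024;		/* cached up to 1.5M */

	uint64_t below = progress - start;		/* already cached part */
	uint64_t above = start + num_bytes - progress;	/* still uncached part */

	assert(below == 512 * 1024 && above == 512 * 1024);
	/* 'below' leaves the free space cache, 'above' is marked excluded */
	return 0;
}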
+ */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &fs_info->meta_alloc_cluster; + if (btrfs_test_opt(fs_info, SSD)) + *empty_cluster = SZ_2M; + else + *empty_cluster = SZ_64K; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && + btrfs_test_opt(fs_info, SSD_SPREAD)) { + *empty_cluster = SZ_2M; + ret = &fs_info->data_alloc_cluster; } + return ret; } -/* - * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a - * hole that is at least as big as @num_bytes. - * - * @root - The root that will contain this extent - * - * @ram_bytes - The amount of space in ram that @num_bytes take. This - * is used for accounting purposes. This value differs - * from @num_bytes only in the case of compressed extents. - * - * @num_bytes - Number of bytes to allocate on-disk. - * - * @min_alloc_size - Indicates the minimum amount of space that the - * allocator should try to satisfy. In some cases - * @num_bytes may be larger than what is required and if - * the filesystem is fragmented then allocation fails. - * However, the presence of @min_alloc_size gives a - * chance to try and satisfy the smaller allocation. - * - * @empty_size - A hint that you plan on doing more COW. This is the - * size in bytes the allocator should try to find free - * next to the block it returns. This is just a hint and - * may be ignored by the allocator. - * - * @hint_byte - Hint to the allocator to start searching above the byte - * address passed. It might be ignored. - * - * @ins - This key is modified to record the found hole. It will - * have the following values: - * ins->objectid == start position - * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == the size of the hole. - * - * @is_data - Boolean flag indicating whether an extent is - * allocated for data (true) or metadata (false) - * - * @delalloc - Boolean flag indicating whether this allocation is for - * delalloc or not. If 'true' data_rwsem of block groups - * is going to be acquired. - * - * - * Returns 0 when an allocation succeeded or < 0 when an error occurred. In - * case -ENOSPC is returned then @ins->offset will contain the size of the - * largest available hole the allocator managed to find. 
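fetch_cluster_info() above picks both the cluster and its slack. A plain-C sketch of that selection (mixed block groups, which get no cluster at all, are omitted; the SZ_* values mirror the ones used above):

#include <stdio.h>
#include <stdint.h>

#define SZ_64K (64 * 1024)
#define SZ_2M (2 * 1024 * 1024)

/* Metadata clusters always exist (sized by the ssd option); data
 * clusters only with ssd_spread. */
static uint64_t pick_empty_cluster(int is_metadata, int ssd, int ssd_spread)
{
	if (is_metadata)
		return ssd ? SZ_2M : SZ_64K;
	return ssd_spread ? SZ_2M : 0;	/* 0: no cluster for this space */
}

int main(void)
{
	printf("metadata+ssd: %llu\n",
	       (unsigned long long)pick_empty_cluster(1, 1, 0));	/* 2M */
	printf("data, no ssd_spread: %llu\n",
	       (unsigned long long)pick_empty_cluster(0, 1, 0));	/* 0 */
	return 0;
}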
- */ -int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc) +static int unpin_extent_range(struct btrfs_fs_info *fs_info, + u64 start, u64 end, + const bool return_free_space) { - struct btrfs_fs_info *fs_info = root->fs_info; - bool final_tried = num_bytes == min_alloc_size; - u64 flags; - int ret; + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; + u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; + bool readonly; + + while (start <= end) { + readonly = false; + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + total_unpinned = 0; + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(fs_info, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; + } + + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + if (return_free_space) + btrfs_add_free_space(cache, start, len); + } - flags = get_alloc_profile_by_root(root, is_data); -again: - WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, - hint_byte, ins, flags, delalloc); - if (!ret && !is_data) { - btrfs_dec_block_group_reservations(fs_info, ins->objectid); - } else if (ret == -ENOSPC) { - if (!final_tried && ins->offset) { - num_bytes = min(num_bytes >> 1, ins->offset); - num_bytes = round_down(num_bytes, - fs_info->sectorsize); - num_bytes = max(num_bytes, min_alloc_size); - ram_bytes = num_bytes; - if (num_bytes == min_alloc_size) - final_tried = true; - goto again; - } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - struct btrfs_space_info *sinfo; + start += len; + total_unpinned += len; + space_info = cache->space_info; - sinfo = btrfs_find_space_info(fs_info, flags); - btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu", - flags, num_bytes); - if (sinfo) - btrfs_dump_space_info(fs_info, sinfo, - num_bytes, 1); + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. 
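The ENOSPC path above retries with progressively smaller requests: halve, cap at the largest hole reported back in ins->offset, round down to the sector size, and never go below min_alloc_size. A standalone trace with illustrative sizes:

#include <stdio.h>
#include <stdint.h>

static uint64_t round_down_to(uint64_t n, uint64_t align)
{
	return n - (n % align);
}

int main(void)
{
	uint64_t sectorsize = 4096, min_alloc = 64 * 1024;
	uint64_t num_bytes = 1 << 20;		/* first ask: 1 MiB */
	uint64_t largest_hole = 300 * 1024;	/* what ENOSPC reported back */

	while (num_bytes > min_alloc) {
		uint64_t next = num_bytes >> 1;

		if (next > largest_hole)
			next = largest_hole;	/* min(num_bytes/2, hole) */
		next = round_down_to(next, sectorsize);
		if (next < min_alloc)
			next = min_alloc;	/* final attempt */
		num_bytes = next;
		printf("retry with %llu\n", (unsigned long long)num_bytes);
	}
	return 0;
}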
+ */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + space_info->max_extent_size = 0; + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); + if (cache->ro) { + space_info->bytes_readonly += len; + readonly = true; + } + spin_unlock(&cache->lock); + if (!readonly && return_free_space && + global_rsv->space_info == space_info) { + u64 to_add = len; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + /* Add to any tickets we may have */ + if (len) + btrfs_try_granting_tickets(fs_info, + space_info); } + spin_unlock(&space_info->lock); } - return ret; + if (cache) + btrfs_put_block_group(cache); + return 0; } -static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, - int pin, int delalloc) +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { - struct btrfs_block_group_cache *cache; - int ret = 0; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + struct list_head *deleted_bgs; + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; - cache = btrfs_lookup_block_group(fs_info, start); - if (!cache) { - btrfs_err(fs_info, "Unable to find block group for %llu", - start); - return -ENOSPC; - } + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + + while (!trans->aborted) { + struct extent_state *cached_state = NULL; + + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + break; + } - if (pin) - pin_down_extent(cache, start, len, 1); - else { if (btrfs_test_opt(fs_info, DISCARD)) - ret = btrfs_discard_extent(fs_info, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); - trace_btrfs_reserved_extent_free(fs_info, start, len); + ret = btrfs_discard_extent(fs_info, start, + end + 1 - start, NULL); + + clear_extent_dirty(unpin, start, end, &cached_state); + unpin_extent_range(fs_info, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + free_extent_state(cached_state); + cond_resched(); } - btrfs_put_block_group(cache); - return ret; -} + /* + * Transaction is finished. We don't need the lock anymore. We + * do need to clean up the block groups in case of a transaction + * abort. 
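The refill above tops up the global reserve first and only hands the remainder to waiting tickets. A minimal sketch of that split (simplified struct, not the kernel's block_rsv):

#include <assert.h>
#include <stdint.h>

struct rsv { uint64_t size, reserved; int full; };

/* Returns how much of 'len' is left for ticket holders after topping
 * up the reserve, as in the unpin path above. */
static uint64_t refill_rsv(struct rsv *rsv, uint64_t len)
{
	uint64_t to_add = rsv->size - rsv->reserved;

	if (to_add > len)
		to_add = len;
	rsv->reserved += to_add;
	if (rsv->reserved >= rsv->size)
		rsv->full = 1;
	return len - to_add;
}

int main(void)
{
	struct rsv global = { .size = 100, .reserved = 90, .full = 0 };

	assert(refill_rsv(&global, 25) == 15);	/* 10 absorbed, 15 left over */
	assert(global.full == 1);
	return 0;
}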
+ */ + deleted_bgs = &trans->transaction->deleted_bgs; + list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { + u64 trimmed = 0; -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) -{ - return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); -} + ret = -EROFS; + if (!trans->aborted) + ret = btrfs_discard_extent(fs_info, + block_group->key.objectid, + block_group->key.offset, + &trimmed); -int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len) -{ - return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); + list_del_init(&block_group->bg_list); + btrfs_put_block_group_trimming(block_group); + btrfs_put_block_group(block_group); + + if (ret) { + const char *errstr = btrfs_decode_error(ret); + btrfs_warn(fs_info, + "discard failed while removing blockgroup: errno=%d %s", + ret, errstr); + } + } + + return 0; } -static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 parent, u64 root_objectid, - u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; - struct btrfs_extent_item *extent_item; - struct btrfs_extent_inline_ref *iref; + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_key key; struct btrfs_path *path; + struct btrfs_root *extent_root = info->extent_root; struct extent_buffer *leaf; - int type; - u32 size; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + int ret; + int is_data; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + u32 item_size; + u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; + int last_ref = 0; + bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = READA_FORWARD; + path->leave_spinning = 1; + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + if (is_data) + skinny_metadata = false; + + ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, + parent, root_objectid, owner_objectid, + owner_offset); + if (ret == 0) { + extent_slot = path->slots[0]; + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + extent_slot); + if (key.objectid != bytenr) + break; + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { + found_extent = 1; + break; + } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.offset == owner_objectid) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + extent_slot--; + } + + if (!found_extent) { + BUG_ON(iref); + ret = remove_extent_backref(trans, path, NULL, + refs_to_drop, + is_data, &last_ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_release_path(path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + if (!is_data && skinny_metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner_objectid; + } - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + ret = 
btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret > 0 && skinny_metadata && path->slots[0]) { + /* + * Couldn't find our skinny metadata item, + * see if we have ye olde extent item. + */ + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + } - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); - if (ret) { - btrfs_free_path(path); - return ret; + if (ret) { + btrfs_err(info, + "umm, got %d back from search, was looking for %llu", + ret, bytenr); + if (ret > 0) + btrfs_print_leaf(path->nodes[0]); + } + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } + extent_slot = path->slots[0]; + } + } else if (WARN_ON(ret == -ENOENT)) { + btrfs_print_leaf(path->nodes[0]); + btrfs_err(info, + "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", + bytenr, parent, root_objectid, owner_objectid, + owner_offset); + btrfs_abort_transaction(trans, ret); + goto out; + } else { + btrfs_abort_transaction(trans, ret); + goto out; } leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, ref_mod); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_DATA); + item_size = btrfs_item_size_nr(leaf, extent_slot); + if (unlikely(item_size < sizeof(*ei))) { + ret = -EINVAL; + btrfs_print_v0_err(info); + btrfs_abort_transaction(trans, ret); + goto out; + } + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && + key.type == BTRFS_EXTENT_ITEM_KEY) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } - iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (parent > 0) { - struct btrfs_shared_data_ref *ref; - ref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); - } else { - struct btrfs_extent_data_ref *ref; - ref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + refs = btrfs_extent_refs(leaf, ei); + if (refs < refs_to_drop) { + btrfs_err(info, + "trying to drop %d refs but we only have %Lu for bytenr %Lu", + refs_to_drop, refs, bytenr); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; } + refs -= refs_to_drop; - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by 
remove_extent_backref + */ + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, path, iref, + refs_to_drop, is_data, + &last_ref); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } + } else { + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } + } - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); - if (ret) - return ret; + last_ref = 1; + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_release_path(path); - ret = update_block_group(trans, ins->objectid, ins->offset, 1); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); - BUG(); + if (is_data) { + ret = btrfs_del_csums(trans, info, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); + btrfs_release_path(path); + +out: + btrfs_free_path(path); return ret; } -static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) +/* + * when we free a block, it is possible (and likely) that we free the last + * delayed ref for that extent as well. This searches the delayed ref tree for + * a given extent, and if there are no other delayed refs to be processed, it + * removes it from the tree.
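+ *
+ * Illustrative shape of the fast path (a sketch, not part of this
+ * patch): a block COW'ed and freed within the same transaction usually
+ * leaves a single delayed ref head with an empty ref tree, e.g.
+ *
+ *	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ *	if (head && RB_EMPTY_ROOT(&head->ref_tree.rb_root))
+ *		the head can be dropped now instead of being run later
+ *
+ * the function below re-checks all of this under the relevant locks
+ * before deleting anything.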
+ */ +static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, + u64 bytenr) { - struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; - struct btrfs_extent_item *extent_item; - struct btrfs_key extent_key; - struct btrfs_tree_block_info *block_info; - struct btrfs_extent_inline_ref *iref; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_delayed_tree_ref *ref; - u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes; - u64 flags = extent_op->flags_to_set; - bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - - ref = btrfs_delayed_node_to_tree_ref(node); - - extent_key.objectid = node->bytenr; - if (skinny_metadata) { - extent_key.offset = ref->level; - extent_key.type = BTRFS_METADATA_ITEM_KEY; - num_bytes = fs_info->nodesize; - } else { - extent_key.offset = node->num_bytes; - extent_key.type = BTRFS_EXTENT_ITEM_KEY; - size += sizeof(*block_info); - num_bytes = node->num_bytes; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + int ret = 0; - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - &extent_key, size); - if (ret) { - btrfs_free_path(path); - return ret; - } + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (!head) + goto out_delayed_unlock; - leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, 1); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + spin_lock(&head->lock); + if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + goto out; - if (skinny_metadata) { - iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - } else { - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); - btrfs_set_tree_block_level(leaf, block_info, ref->level); - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); - } + if (cleanup_extent_op(head) != NULL) + goto out; - if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); - } else { - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); - } + /* + * waiting for the lock here would deadlock. 
If someone else has it + * locked they are already in the process of dropping it anyway + */ + if (!mutex_trylock(&head->mutex)) + goto out; - btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); + btrfs_delete_ref_head(delayed_refs, head); + head->processing = 0; - ret = remove_from_free_space_tree(trans, extent_key.objectid, - num_bytes); - if (ret) - return ret; + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); - ret = update_block_group(trans, extent_key.objectid, - fs_info->nodesize, 1); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - extent_key.objectid, extent_key.offset); - BUG(); - } + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, - fs_info->nodesize); + btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); return ret; +out: + spin_unlock(&head->lock); + +out_delayed_unlock: + spin_unlock(&delayed_refs->lock); + return 0; } -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner, - u64 offset, u64 ram_bytes, - struct btrfs_key *ins) +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ref generic_ref = { 0 }; + int pin = 1; int ret; - BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root->root_key.objectid); - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); - btrfs_ref_tree_mod(root->fs_info, &generic_ref); - ret = btrfs_add_delayed_data_ref(trans, &generic_ref, - ram_bytes, NULL, NULL); - return ret; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + int old_ref_mod, new_ref_mod; + + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, + &old_ref_mod, &new_ref_mod); + BUG_ON(ret); /* -ENOMEM */ + pin = old_ref_mod >= 0 && new_ref_mod < 0; + } + + if (last_ref && btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, buf->start); + if (!ret) + goto out; + } + + pin = 0; + cache = btrfs_lookup_block_group(fs_info, buf->start); + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + btrfs_free_reserved_bytes(cache, buf->len, 0); + btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + } +out: + if (pin) + add_pinned_bytes(fs_info, &generic_ref); + + if (last_ref) { + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); + } } -/* - * this is used by the tree logging recovery code. 
It records that - * an extent has been allocated and makes sure to clear the free - * space cache bits as well - */ -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins) +/* Can return -ENOMEM */ +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; + int old_ref_mod, new_ref_mod; int ret; - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; + + if (btrfs_is_testing(fs_info)) + return 0; /* - * Mixed block groups will exclude before processing the log so we only - * need to do the exclude dance if this fs isn't mixed. + * tree log blocks never actually go into the extent allocation + * tree, just update pinning info and exit early. */ - if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = __exclude_logged_extent(fs_info, ins->objectid, - ins->offset); - if (ret) - return ret; + if ((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { + /* unlocks the pinned mutex */ + btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); + old_ref_mod = new_ref_mod = 0; + ret = 0; + } else if (ref->type == BTRFS_REF_METADATA) { + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, + &old_ref_mod, &new_ref_mod); + } else { + ret = btrfs_add_delayed_data_ref(trans, ref, 0, + &old_ref_mod, &new_ref_mod); } - block_group = btrfs_lookup_block_group(fs_info, ins->objectid); - if (!block_group) - return -EINVAL; + if (!((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + btrfs_ref_tree_mod(fs_info, ref); - space_info = block_group->space_info; - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - space_info->bytes_reserved += ins->offset; - block_group->reserved += ins->offset; - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, ref); - ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, - offset, ins, 1); - btrfs_put_block_group(block_group); return ret; } -static struct extent_buffer * -btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *buf; - - buf = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(buf)) - return buf; +enum btrfs_loop_type { + LOOP_CACHING_NOWAIT, + LOOP_CACHING_WAIT, + LOOP_ALLOC_CHUNK, + LOOP_NO_EMPTY_SIZE, +}; - /* - * Extra safety check in case the extent tree is corrupted and extent - * allocator chooses to use a tree block which is already used and - * locked. 
- */ - if (buf->lock_owner == current->pid) { - btrfs_err_rl(fs_info, -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", - buf->start, btrfs_header_owner(buf), current->pid); - free_extent_buffer(buf); - return ERR_PTR(-EUCLEAN); - } +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} - btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); - btrfs_tree_lock(buf); - btrfs_clean_tree_block(buf); - clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} - btrfs_set_lock_blocking_write(buf); - set_extent_buffer_uptodate(buf); +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg = NULL; - memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); - btrfs_set_header_level(buf, level); - btrfs_set_header_bytenr(buf, buf->start); - btrfs_set_header_generation(buf, trans->transid); - btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(buf, owner); - write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); - write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - buf->log_index = root->log_transid % 2; - /* - * we allow two log transactions at a time, use different - * EXTENT bit to differentiate dirty pages. - */ - if (buf->log_index == 0) - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); - else - set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1); - } else { - buf->log_index = -1; - set_extent_dirty(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + spin_lock(&cluster->refill_lock); + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + + /* We should only have one-level nested. */ + down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); + + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); } - trans->dirty = true; - /* this returns a buffer locked for blocking */ - return buf; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); } /* - * finds a free extent and does all the dirty work required for allocation - * returns the tree buffer or an ERR_PTR on error. + * Structure used internally for find_free_extent() function. Wraps needed + * parameters. 
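+ *
+ * Illustrative initialization (a sketch mirroring what find_free_extent()
+ * below actually does, not additional API):
+ *
+ *	struct find_free_extent_ctl ffe_ctl = {0};
+ *
+ *	ffe_ctl.ram_bytes = ram_bytes;
+ *	ffe_ctl.num_bytes = num_bytes;
+ *	ffe_ctl.empty_size = empty_size;
+ *	ffe_ctl.flags = flags;
+ *	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
+ *
+ * after which the struct is threaded through the clustered and
+ * unclustered helpers to accumulate search state.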
*/ -struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - const struct btrfs_disk_key *key, - int level, u64 hint, - u64 empty_size) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key ins; - struct btrfs_block_rsv *block_rsv; - struct extent_buffer *buf; - struct btrfs_delayed_extent_op *extent_op; - struct btrfs_ref generic_ref = { 0 }; - u64 flags = 0; - int ret; - u32 blocksize = fs_info->nodesize; - bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 empty_size; + u64 flags; + int delalloc; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (btrfs_is_testing(fs_info)) { - buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); - if (!IS_ERR(buf)) - root->alloc_bytenr += blocksize; - return buf; - } -#endif + /* Where to start the search inside the bg */ + u64 search_start; - block_rsv = btrfs_use_block_rsv(trans, root, blocksize); - if (IS_ERR(block_rsv)) - return ERR_CAST(block_rsv); + /* For clustered allocation */ + u64 empty_cluster; - ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, - empty_size, hint, &ins, 0, 0); - if (ret) - goto out_unuse; + bool have_caching_bg; + bool orig_have_caching_bg; - buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto out_free_reserved; - } + /* RAID index, converted from flags */ + int index; - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins.objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - extent_op = btrfs_alloc_delayed_extent_op(); - if (!extent_op) { - ret = -ENOMEM; - goto out_free_buf; - } - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = skinny_metadata ? false : true; - extent_op->update_flags = true; - extent_op->is_data = false; - extent_op->level = level; + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level, root_objectid); - btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, - extent_op, NULL, NULL); - if (ret) - goto out_free_delayed; - } - return buf; + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. 
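+ *
+ * (Illustrative: when a helper returns -EAGAIN after setting one of the
+ * retry flags, find_free_extent() jumps back to have_block_group and
+ * re-searches the same group; because the flag stays set, at most one
+ * such retry happens per block group before moving on.)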
+ */ + bool retry_unclustered; -out_free_delayed: - btrfs_free_delayed_extent_op(extent_op); -out_free_buf: - free_extent_buffer(buf); -out_free_reserved: - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); -out_unuse: - btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); - return ERR_PTR(ret); -} + /* If current block group is cached */ + int cached; -struct walk_control { - u64 refs[BTRFS_MAX_LEVEL]; - u64 flags[BTRFS_MAX_LEVEL]; - struct btrfs_key update_progress; - struct btrfs_key drop_progress; - int drop_level; - int stage; - int level; - int shared_level; - int update_ref; - int keep_locks; - int reada_slot; - int reada_count; - int restarted; + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; }; -#define DROP_REFERENCE 1 -#define UPDATE_BACKREF 2 -static noinline void reada_walk_down(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct walk_control *wc, - struct btrfs_path *path) +/* + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. + * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we found nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group_cache **cluster_bg_ret) { - struct btrfs_fs_info *fs_info = root->fs_info; - u64 bytenr; - u64 generation; - u64 refs; - u64 flags; - u32 nritems; - struct btrfs_key key; - struct extent_buffer *eb; + struct btrfs_block_group_cache *cluster_bg; + u64 aligned_cluster; + u64 offset; int ret; - int slot; - int nread = 0; - - if (path->slots[wc->level] < wc->reada_slot) { - wc->reada_count = wc->reada_count * 2 / 3; - wc->reada_count = max(wc->reada_count, 2); - } else { - wc->reada_count = wc->reada_count * 3 / 2; - wc->reada_count = min_t(int, wc->reada_count, - BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - } - - eb = path->nodes[wc->level]; - nritems = btrfs_header_nritems(eb); - for (slot = path->slots[wc->level]; slot < nritems; slot++) { - if (nread >= wc->reada_count) - break; + cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); + if (!cluster_bg) + goto refill_cluster; + if (cluster_bg != bg && (cluster_bg->ro || + !block_group_bits(cluster_bg, ffe_ctl->flags))) + goto release_cluster; - cond_resched(); - bytenr = btrfs_node_blockptr(eb, slot); - generation = btrfs_node_ptr_generation(eb, slot); + offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, + ffe_ctl->num_bytes, cluster_bg->key.objectid, + &ffe_ctl->max_extent_size); + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(cluster_bg, + ffe_ctl->search_start, ffe_ctl->num_bytes); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; + } + WARN_ON(last_ptr->block_group != cluster_bg); - if (slot == path->slots[wc->level]) - goto reada; +release_cluster: + /* + * If we are on LOOP_NO_EMPTY_SIZE, we can't set up new clusters, so + * let's just skip it and let the allocator find whatever block it can + * find.
If we reach this point, we will have tried the cluster + * allocator plenty of times and not have found anything, so we are + * likely way too fragmented for the clustering stuff to find anything. + * + * However, if the cluster is taken from the current block group, + * release the cluster first, so that we stand a better chance of + * succeeding in the unclustered allocation. + */ + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { + spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + return -ENOENT; + } - if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) - continue; + /* This cluster didn't work out, free it and start over */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); - /* We don't lock the tree block, it's OK to be racy here */ - ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, - wc->level - 1, 1, &refs, - &flags); - /* We don't care about errors in readahead. */ - if (ret < 0) - continue; - BUG_ON(refs == 0); + if (cluster_bg != bg) + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); - if (wc->stage == DROP_REFERENCE) { - if (refs == 1) - goto reada; +refill_cluster: + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + return -ENOENT; + } - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; - if (!wc->update_ref || - generation <= root->root_key.offset) - continue; - btrfs_node_key_to_cpu(eb, &key, slot); - ret = btrfs_comp_cpu_keys(&key, - &wc->update_progress); - if (ret < 0) - continue; - } else { - if (wc->level == 1 && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - continue; + aligned_cluster = max_t(u64, + ffe_ctl->empty_cluster + ffe_ctl->empty_size, + bg->full_stripe_len); + ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, + ffe_ctl->num_bytes, aligned_cluster); + if (ret == 0) { + /* Now pull our allocation out of this cluster */ + offset = btrfs_alloc_from_cluster(bg, last_ptr, + ffe_ctl->num_bytes, ffe_ctl->search_start, + &ffe_ctl->max_extent_size); + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(bg, + ffe_ctl->search_start, + ffe_ctl->num_bytes); + ffe_ctl->found_offset = offset; + return 0; } -reada: - readahead_tree_block(fs_info, bytenr); - nread++; + } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_clustered) { + spin_unlock(&last_ptr->refill_lock); + + ffe_ctl->retry_clustered = true; + btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + ffe_ctl->empty_size); + return -EAGAIN; } - wc->reada_slot = slot; + /* + * At this point we either didn't find a cluster or we weren't able to + * allocate a block from our cluster. Free the cluster we've been + * trying to use, and go to the next block group. + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + return 1; } /* - * helper to process tree block while walking down the tree. - * - * when wc->stage == UPDATE_BACKREF, this function updates - * back refs for pointers in the block. - * - * NOTE: return value 1 means we should stop walking down. 
+ * Return >0 to inform caller that we found nothing + * Return 0 when we found a free extent and set ffe_ctl->found_offset + * Return -EAGAIN to inform caller that we need to re-search this block group */ -static noinline int walk_down_proc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc, int lookup_info) +static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl) { - struct btrfs_fs_info *fs_info = root->fs_info; - int level = wc->level; - struct extent_buffer *eb = path->nodes[level]; - u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; - int ret; - - if (wc->stage == UPDATE_BACKREF && - btrfs_header_owner(eb) != root->root_key.objectid) - return 1; + u64 offset; /* - * when reference count of tree block is 1, it won't increase - * again. once full backref flag is set, we never clear it. + * We are doing an unclustered allocation, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get more + * space. */ - if (lookup_info && - ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { - BUG_ON(!path->locks[level]); - ret = btrfs_lookup_extent_info(trans, fs_info, - eb->start, level, 1, - &wc->refs[level], - &wc->flags[level]); - BUG_ON(ret == -ENOMEM); - if (ret) - return ret; - BUG_ON(wc->refs[level] == 0); + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); } + if (ffe_ctl->cached) { + struct btrfs_free_space_ctl *free_space_ctl; - if (wc->stage == DROP_REFERENCE) { - if (wc->refs[level] > 1) + free_space_ctl = bg->free_space_ctl; + spin_lock(&free_space_ctl->tree_lock); + if (free_space_ctl->free_space < + ffe_ctl->num_bytes + ffe_ctl->empty_cluster + + ffe_ctl->empty_size) { + ffe_ctl->total_free_space = max_t(u64, + ffe_ctl->total_free_space, + free_space_ctl->free_space); + spin_unlock(&free_space_ctl->tree_lock); return 1; - - if (path->locks[level] && !wc->keep_locks) { - btrfs_tree_unlock_rw(eb, path->locks[level]); - path->locks[level] = 0; } - return 0; + spin_unlock(&free_space_ctl->tree_lock); } - /* wc->stage == UPDATE_BACKREF */ - if (!(wc->flags[level] & flag)) { - BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb->start, - eb->len, flag, - btrfs_header_level(eb), 0); - BUG_ON(ret); /* -ENOMEM */ - wc->flags[level] |= flag; - } + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); /* - * the block is shared by multiple trees, so it's not good to - * keep the tree lock + * If we didn't find a chunk, and we haven't failed on this block group + * before, and this block group is in the middle of caching and we are + * ok with waiting, then go ahead and wait for progress to be made, and + * set @retry_unclustered to true. + * + * If @retry_unclustered is true then we've already waited on this + * block group once and should move on to the next block group.
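+ *
+ * An illustrative timeline for a block group that is still caching
+ * (hypothetical, not part of this patch):
+ *
+ *	pass 1: btrfs_find_space_for_alloc() finds nothing
+ *		-> wait for num_bytes + empty_size to be cached,
+ *		   set retry_unclustered, return -EAGAIN
+ *	pass 2: still nothing
+ *		-> return 1, caller moves to the next block group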
*/ - if (path->locks[level] && level > 0) { - btrfs_tree_unlock_rw(eb, path->locks[level]); - path->locks[level] = 0; + if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && + ffe_ctl->loop > LOOP_CACHING_NOWAIT) { + btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_size); + ffe_ctl->retry_unclustered = true; + return -EAGAIN; + } else if (!offset) { + return 1; } + ffe_ctl->found_offset = offset; return 0; } /* - * This is used to verify a ref exists for this root to deal with a bug where we - * would have a drop_progress key that hadn't been updated properly. + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. + * Return <0 means we failed to locate any free extent. */ -static int check_ref_exists(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 parent, - int level) +static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + struct btrfs_free_cluster *last_ptr, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl, + int full_search, bool use_cluster) { - struct btrfs_path *path; - struct btrfs_extent_inline_ref *iref; + struct btrfs_root *root = fs_info->extent_root; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && + ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) + ffe_ctl->orig_have_caching_bg = true; + + if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && + ffe_ctl->have_caching_bg) + return 1; + + if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) + return 1; + + if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } + return 0; + } + + /* + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any uncached bgs and we've already done a + * full search through. + */ + if (ffe_ctl->orig_have_caching_bg || !full_search) + ffe_ctl->loop = LOOP_CACHING_WAIT; + else + ffe_ctl->loop = LOOP_ALLOC_CHUNK; + } else { + ffe_ctl->loop++; + } + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + return ret; + } + + ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - ret = lookup_extent_backref(trans, path, &iref, bytenr, - root->fs_info->nodesize, parent, - root->root_key.objectid, level, 0); - btrfs_free_path(path); - if (ret == -ENOENT) - return 0; - if (ret < 0) - return ret; - return 1; + /* Do not bail out on ENOSPC since we can do more. 
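+ *
+ * (Illustrative progression on a nearly full filesystem: the chunk
+ * allocation above fails with -ENOSPC, so the next pass runs
+ * LOOP_NO_EMPTY_SIZE with empty_size and empty_cluster forced to 0,
+ * and only if that also fails does the search return -ENOSPC.)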
*/ + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans); + if (ret) + return ret; + } + + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (ffe_ctl->empty_size == 0 && + ffe_ctl->empty_cluster == 0) + return -ENOSPC; + ffe_ctl->empty_size = 0; + ffe_ctl->empty_cluster = 0; + } + return 1; + } + return -ENOSPC; } /* - * helper to process tree block pointer. + * walks the btree of allocated extents and finds a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == start position + * ins->type == BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * Any available blocks before search_start are skipped. * - * when wc->stage == DROP_REFERENCE, this function checks - * reference count of the block pointed to. if the block - * is shared and we need update back refs for the subtree - * rooted at the block, this function changes wc->stage to - * UPDATE_BACKREF. if the block is shared and there is no - * need to update back, this function drops the reference - * to the block. + * If there is no suitable free space, we will record the max size of + * the free space extent found during the search. * - * NOTE: return value 1 means we should stop walking down. + * The overall logic and call chain: + * + * find_free_extent() + * |- Iterate through all block groups + * | |- Get a valid block group + * | |- Try to do clustered allocation in that block group + * | |- Try to do unclustered allocation in that block group + * | |- Check if the result is valid + * | | |- If valid, then exit + * | |- Jump to next block group + * | + * |- Push harder to find free extents + * |- If not found, re-iterate all block groups */ -static noinline int do_walk_down(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc, int *lookup_info) +static noinline int find_free_extent(struct btrfs_fs_info *fs_info, + u64 ram_bytes, u64 num_bytes, u64 empty_size, + u64 hint_byte, struct btrfs_key *ins, + u64 flags, int delalloc) { - struct btrfs_fs_info *fs_info = root->fs_info; - u64 bytenr; - u64 generation; - u64 parent; - struct btrfs_key key; - struct btrfs_key first_key; - struct btrfs_ref ref = { 0 }; - struct extent_buffer *next; - int level = wc->level; - int reada = 0; int ret = 0; - bool need_account = false; - - generation = btrfs_node_ptr_generation(path->nodes[level], - path->slots[level]); - /* - * if the lower level block was created before the snapshot - * was created, we know there is no need to update back refs - * for the subtree - */ - if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { - *lookup_info = 1; - return 1; - } + struct btrfs_free_cluster *last_ptr = NULL; + struct btrfs_block_group_cache *block_group = NULL; + struct find_free_extent_ctl ffe_ctl = {0}; + struct btrfs_space_info *space_info; + bool use_cluster = true; + bool full_search = false; - bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, - path->slots[level]); + WARN_ON(num_bytes < fs_info->sectorsize); - next = find_extent_buffer(fs_info, bytenr); - if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(next)) - return PTR_ERR(next); + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; +
ffe_ctl.search_start = 0; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.delalloc = delalloc; + ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); + ffe_ctl.have_caching_bg = false; + ffe_ctl.orig_have_caching_bg = false; + ffe_ctl.found_offset = 0; - btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, - level - 1); - reada = 1; - } - btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); + ins->type = BTRFS_EXTENT_ITEM_KEY; + ins->objectid = 0; + ins->offset = 0; - ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, - &wc->refs[level - 1], - &wc->flags[level - 1]); - if (ret < 0) - goto out_unlock; + trace_find_free_extent(fs_info, num_bytes, empty_size, flags); - if (unlikely(wc->refs[level - 1] == 0)) { - btrfs_err(fs_info, "Missing references."); - ret = -EIO; - goto out_unlock; + space_info = btrfs_find_space_info(fs_info, flags); + if (!space_info) { + btrfs_err(fs_info, "No space info for %llu", flags); + return -ENOSPC; } - *lookup_info = 0; - - if (wc->stage == DROP_REFERENCE) { - if (wc->refs[level - 1] > 1) { - need_account = true; - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; - - if (!wc->update_ref || - generation <= root->root_key.offset) - goto skip; - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); - if (ret < 0) - goto skip; - wc->stage = UPDATE_BACKREF; - wc->shared_level = level - 1; + /* + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. + */ + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; } - } else { - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; + spin_unlock(&space_info->lock); } - if (!btrfs_buffer_uptodate(next, generation, 0)) { - btrfs_tree_unlock(next); - free_extent_buffer(next); - next = NULL; - *lookup_info = 1; + last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl.empty_cluster); + if (last_ptr) { + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. 
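+ *
+ * e.g. (illustrative): on the next call the code below turns the
+ * remembered position into the starting point of the scan,
+ *
+ *	hint_byte = last_ptr->window_start;
+ *	search_start = max(search_start, hint_byte);
+ *
+ * rather than rescanning the space_info from its first byte.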
+ */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } + spin_unlock(&last_ptr->lock); } - if (!next) { - if (reada && level == 1) - reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, generation, level - 1, - &first_key); - if (IS_ERR(next)) { - return PTR_ERR(next); - } else if (!extent_buffer_uptodate(next)) { - free_extent_buffer(next); - return -EIO; + ffe_ctl.search_start = max(ffe_ctl.search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); + if (ffe_ctl.search_start == hint_byte) { + block_group = btrfs_lookup_block_group(fs_info, + ffe_ctl.search_start); + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if it's not cached. + * + * However if we are re-searching with an ideal block group + * picked out then we don't care that the block group is cached. + */ + if (block_group && block_group_bits(block_group, flags) && + block_group->cached != BTRFS_CACHE_NO) { + down_read(&space_info->groups_sem); + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else { + ffe_ctl.index = btrfs_bg_flags_to_raid_index( + block_group->flags); + btrfs_lock_block_group(block_group, delalloc); + goto have_block_group; + } + } else if (block_group) { + btrfs_put_block_group(block_group); + } } +search: + ffe_ctl.have_caching_bg = false; + if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || + ffe_ctl.index == 0) + full_search = true; + down_read(&space_info->groups_sem); + list_for_each_entry(block_group, + &space_info->block_groups[ffe_ctl.index], list) { + /* If the block group is read-only, we can skip it entirely. */ + if (unlikely(block_group->ro)) + continue; - btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); + btrfs_grab_block_group(block_group, delalloc); + ffe_ctl.search_start = block_group->key.objectid; /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type.
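+ *
+ * e.g. (illustrative): a DATA|RAID1 allocation that has exhausted its
+ * own raid index may iterate over RAID0 groups here; block_group_bits()
+ * fails for those because the RAID1 bit is missing, and the checks
+ * below skip them instead of allocating with the wrong redundancy.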
*/ - if (wc->restarted) { - ret = check_ref_exists(trans, root, bytenr, parent, - level - 1); - if (ret < 0) - goto out_unlock; - if (ret == 0) - goto no_delete; + if (!block_group_bits(block_group, flags)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((flags & extra) && !(block_group->flags & extra)) + goto loop; + + /* + * This block group has different flags than we want. + * It's possible that we have MIXED_GROUP flag but no + * block group is mixed. Just skip such block group. + */ + btrfs_release_block_group(block_group, delalloc); + continue; + } + +have_block_group: + ffe_ctl.cached = btrfs_block_group_cache_done(block_group); + if (unlikely(!ffe_ctl.cached)) { + ffe_ctl.have_caching_bg = true; + ret = btrfs_cache_block_group(block_group, 0); + BUG_ON(ret < 0); ret = 0; - wc->restarted = 0; } + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; + /* - * Reloc tree doesn't contribute to qgroup numbers, and we have - * already accounted them at merge time (replace_path), - * thus we could skip expensive subtree trace here. + * Ok we want to try and use the cluster allocator, so + * let's look there */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - need_account) { - ret = btrfs_qgroup_trace_subtree(trans, next, - generation, level - 1); - if (ret) { - btrfs_err_rl(fs_info, - "Error %d accounting shared subtree. Quota is out of sync, rescan required.", - ret); + if (last_ptr && use_cluster) { + struct btrfs_block_group_cache *cluster_bg = NULL; + + ret = find_free_extent_clustered(block_group, last_ptr, + &ffe_ctl, &cluster_bg); + + if (ret == 0) { + if (cluster_bg && cluster_bg != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = cluster_bg; + } + goto checks; + } else if (ret == -EAGAIN) { + goto have_block_group; + } else if (ret > 0) { + goto loop; + } + /* ret == -ENOENT case falls through */ } - /* - * We need to update the next key in our walk control so we can - * update the drop_progress key accordingly. We don't care if - * find_next_key doesn't find a key because that means we're at - * the end and are going to clean up now.
- */ - wc->drop_level = level; - find_next_key(path, level, &wc->drop_progress); + ret = find_free_extent_unclustered(block_group, last_ptr, + &ffe_ctl); + if (ret == -EAGAIN) + goto have_block_group; + else if (ret > 0) + goto loop; + /* ret == 0 case falls through */ +checks: + ffe_ctl.search_start = round_up(ffe_ctl.found_offset, + fs_info->stripesize); - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); - ret = btrfs_free_extent(trans, &ref); - if (ret) - goto out_unlock; + /* move on to the next group */ + if (ffe_ctl.search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); + goto loop; + } + + if (ffe_ctl.found_offset < ffe_ctl.search_start) + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); + + ret = btrfs_add_reserved_bytes(block_group, ram_bytes, + num_bytes, delalloc); + if (ret == -EAGAIN) { + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); + goto loop; + } + btrfs_inc_block_group_reservations(block_group); + + /* we are all good, let's return */ + ins->objectid = ffe_ctl.search_start; + ins->offset = num_bytes; + + trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, + num_bytes); + btrfs_release_block_group(block_group, delalloc); + break; +loop: + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + ffe_ctl.index); + btrfs_release_block_group(block_group, delalloc); + cond_resched(); } -no_delete: - *lookup_info = 1; - ret = 1; + up_read(&space_info->groups_sem); -out_unlock: - btrfs_tree_unlock(next); - free_extent_buffer(next); + ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, + full_search, use_cluster); + if (ret > 0) + goto search; + if (ret == -ENOSPC) { + /* + * Use ffe_ctl->total_free_space as fallback if we can't find + * any contiguous hole. + */ + if (!ffe_ctl.max_extent_size) + ffe_ctl.max_extent_size = ffe_ctl.total_free_space; + spin_lock(&space_info->lock); + space_info->max_extent_size = ffe_ctl.max_extent_size; + spin_unlock(&space_info->lock); + ins->offset = ffe_ctl.max_extent_size; + } return ret; } /* - * helper to process tree block while walking up the tree. + * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a + * hole that is at least as big as @num_bytes. * - * when wc->stage == DROP_REFERENCE, this function drops - * reference count on the block. + * @root - The root that will contain this extent + * + * @ram_bytes - The amount of space in RAM that @num_bytes takes. This + * is used for accounting purposes. This value differs + * from @num_bytes only in the case of compressed extents. + * + * @num_bytes - Number of bytes to allocate on-disk. + * + * @min_alloc_size - Indicates the minimum amount of space that the + * allocator should try to satisfy. In some cases + * @num_bytes may be larger than what is required and if + * the filesystem is fragmented then allocation fails. + * However, the presence of @min_alloc_size gives a + * chance to try and satisfy the smaller allocation. + * + * @empty_size - A hint that you plan on doing more COW. This is the + * size in bytes the allocator should try to find free space + * next to the block it returns. This is just a hint and + * may be ignored by the allocator.
+ * + * @hint_byte - Hint to the allocator to start searching above the byte + * address passed. It might be ignored. + * + * @ins - This key is modified to record the found hole. It will + * have the following values: + * ins->objectid == start position + * ins->type == BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * + * @is_data - Boolean flag indicating whether an extent is + * allocated for data (true) or metadata (false) + * + * @delalloc - Boolean flag indicating whether this allocation is for + * delalloc or not. If 'true', data_rwsem of block groups + * is going to be acquired. * - * when wc->stage == UPDATE_BACKREF, this function changes - * wc->stage back to DROP_REFERENCE if we changed wc->stage - * to UPDATE_BACKREF previously while processing the block. * - * NOTE: return value 1 means we should stop walking up. + * Returns 0 when an allocation succeeded or < 0 when an error occurred. In + * case -ENOSPC is returned then @ins->offset will contain the size of the + * largest available hole the allocator managed to find. */ -static noinline int walk_up_proc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc) +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; + bool final_tried = num_bytes == min_alloc_size; + u64 flags; int ret; - int level = wc->level; - struct extent_buffer *eb = path->nodes[level]; - u64 parent = 0; - - if (wc->stage == UPDATE_BACKREF) { - BUG_ON(wc->shared_level < level); - if (level < wc->shared_level) - goto out; - - ret = find_next_key(path, level + 1, &wc->update_progress); - if (ret > 0) - wc->update_ref = 0; - - wc->stage = DROP_REFERENCE; - wc->shared_level = -1; - path->slots[level] = 0; - /* - * check reference count again if the block isn't locked. - * we should start walking down the tree again if reference - * count is one.
- */ - if (!path->locks[level]) { - BUG_ON(level == 0); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + flags = get_alloc_profile_by_root(root, is_data); +again: + WARN_ON(num_bytes < fs_info->sectorsize); + ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, + hint_byte, ins, flags, delalloc); + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(fs_info, ins->objectid); + } else if (ret == -ENOSPC) { + if (!final_tried && ins->offset) { + num_bytes = min(num_bytes >> 1, ins->offset); + num_bytes = round_down(num_bytes, + fs_info->sectorsize); + num_bytes = max(num_bytes, min_alloc_size); + ram_bytes = num_bytes; + if (num_bytes == min_alloc_size) + final_tried = true; + goto again; + } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; - ret = btrfs_lookup_extent_info(trans, fs_info, - eb->start, level, 1, - &wc->refs[level], - &wc->flags[level]); - if (ret < 0) { - btrfs_tree_unlock_rw(eb, path->locks[level]); - path->locks[level] = 0; - return ret; - } - BUG_ON(wc->refs[level] == 0); - if (wc->refs[level] == 1) { - btrfs_tree_unlock_rw(eb, path->locks[level]); - path->locks[level] = 0; - return 1; - } + sinfo = btrfs_find_space_info(fs_info, flags); + btrfs_err(fs_info, + "allocation failed flags %llu, wanted %llu", + flags, num_bytes); + if (sinfo) + btrfs_dump_space_info(fs_info, sinfo, + num_bytes, 1); } } - /* wc->stage == DROP_REFERENCE */ - BUG_ON(wc->refs[level] > 1 && !path->locks[level]); + return ret; +} - if (wc->refs[level] == 1) { - if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1); - else - ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ - if (is_fstree(root->root_key.objectid)) { - ret = btrfs_qgroup_trace_leaf_items(trans, eb); - if (ret) { - btrfs_err_rl(fs_info, - "error %d accounting leaf items, quota is out of sync, rescan required", - ret); - } - } - } - /* make block locked assertion in btrfs_clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { - btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - } - btrfs_clean_tree_block(eb); +static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, + int pin, int delalloc) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "Unable to find block group for %llu", + start); + return -ENOSPC; } - if (eb == root->node) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - parent = eb->start; - else if (root->root_key.objectid != btrfs_header_owner(eb)) - goto owner_mismatch; - } else { - if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - parent = path->nodes[level + 1]->start; - else if (root->root_key.objectid != - btrfs_header_owner(path->nodes[level + 1])) - goto owner_mismatch; + if (pin) + pin_down_extent(cache, start, len, 1); + else { + if (btrfs_test_opt(fs_info, DISCARD)) + ret = btrfs_discard_extent(fs_info, start, len, NULL); + btrfs_add_free_space(cache, start, len); + btrfs_free_reserved_bytes(cache, len, delalloc); + trace_btrfs_reserved_extent_free(fs_info, start, len); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); -out: - wc->refs[level] = 0; - wc->flags[level] = 0; - return 0; + btrfs_put_block_group(cache); + return ret; +} 
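+
+/*
+ * Illustrative use of the two wrappers below (hypothetical caller,
+ * mirroring the error path of btrfs_alloc_tree_block()): a reservation
+ * taken with btrfs_reserve_extent() that can no longer be used is
+ * returned with
+ *
+ *	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
+ *
+ * while callers that must keep the range unusable until the current
+ * transaction commits (e.g. tree-log replay) use
+ * btrfs_free_and_pin_reserved_extent() instead.
+ */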
-owner_mismatch: - btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", - btrfs_header_owner(eb), root->root_key.objectid); - return -EUCLEAN; +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc) +{ + return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); } -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc) +int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len) { - int level = wc->level; - int lookup_info = 1; + return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); +} + +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + int type; + u32 size; - while (level >= 0) { - ret = walk_down_proc(trans, root, path, wc, lookup_info); - if (ret > 0) - break; + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; - if (level == 0) - break; + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); - if (path->slots[level] >= - btrfs_header_nritems(path->nodes[level])) - break; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; - ret = do_walk_down(trans, root, path, wc, &lookup_info); - if (ret > 0) { - path->slots[level]++; - continue; - } else if (ret < 0) - return ret; - level = wc->level; + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); + if (ret) { + btrfs_free_path(path); + return ret; } - return 0; -} -static noinline int walk_up_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc, int max_level) -{ - int level = wc->level; - int ret; + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); - path->slots[level] = btrfs_header_nritems(path->nodes[level]); - while (level < max_level && path->nodes[level]) { - wc->level = level; - if (path->slots[level] + 1 < - btrfs_header_nritems(path->nodes[level])) { - path->slots[level]++; - return 0; - } else { - ret = walk_up_proc(trans, root, path, wc); - if (ret > 0) - return 0; - if (ret < 0) - return ret; + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); - if (path->locks[level]) { - 
btrfs_tree_unlock_rw(path->nodes[level], - path->locks[level]); - path->locks[level] = 0; - } - free_extent_buffer(path->nodes[level]); - path->nodes[level] = NULL; - level++; - } + ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + ins->objectid, ins->offset); + BUG(); } - return 1; + trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); + return ret; } -/* - * drop a subvolume tree. - * - * this function traverses the tree freeing any blocks that only - * referenced by the tree. - * - * when a shared tree block is found. this function decreases its - * reference count by one. if update_ref is true, this function - * also make sure backrefs for the shared block and all lower level - * blocks are properly updated. - * - * If called with for_reloc == 0, may exit early with -EAGAIN - */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root_item *root_item = &root->root_item; - struct walk_control *wc; - struct btrfs_key key; - int err = 0; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - int level; - bool root_dropped = false; + struct btrfs_extent_item *extent_item; + struct btrfs_key extent_key; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_delayed_tree_ref *ref; + u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes; + u64 flags = extent_op->flags_to_set; + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); + ref = btrfs_delayed_node_to_tree_ref(node); - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; + extent_key.objectid = node->bytenr; + if (skinny_metadata) { + extent_key.offset = ref->level; + extent_key.type = BTRFS_METADATA_ITEM_KEY; + num_bytes = fs_info->nodesize; + } else { + extent_key.offset = node->num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; + size += sizeof(*block_info); + num_bytes = node->num_bytes; } - wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); - err = -ENOMEM; - goto out; - } + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; - trans = btrfs_start_transaction(tree_root, 0); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + &extent_key, size); + if (ret) { + btrfs_free_path(path); + return ret; } - err = btrfs_run_delayed_items(trans); - if (err) - goto out_end_trans; + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - if (block_rsv) - trans->block_rsv = block_rsv; + if 
(skinny_metadata) { + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + } else { + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); + btrfs_set_tree_block_level(leaf, block_info, ref->level); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + } - /* - * This will help us catch people modifying the fs tree while we're - * dropping it. It is unsafe to mess with the fs tree while it's being - * dropped as we unlock the root node and parent nodes as we walk down - * the tree, assuming nothing will change. If something does change - * then we'll have stale information and drop references to blocks we've - * already dropped. - */ - set_bit(BTRFS_ROOT_DELETING, &root->state); - if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - level = btrfs_header_level(root->node); - path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(path->nodes[level]); - path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - memset(&wc->update_progress, 0, - sizeof(wc->update_progress)); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); } else { - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - memcpy(&wc->update_progress, &key, - sizeof(wc->update_progress)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); + } - level = root_item->drop_level; - BUG_ON(level == 0); - path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - path->lowest_level = 0; - if (ret < 0) { - err = ret; - goto out_end_trans; - } - WARN_ON(ret > 0); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); - /* - * unlock our path, this is safe because only this - * function is allowed to delete this snapshot - */ - btrfs_unlock_up_safe(path, 0); + ret = remove_from_free_space_tree(trans, extent_key.objectid, + num_bytes); + if (ret) + return ret; - level = btrfs_header_level(root->node); - while (1) { - btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking_write(path->nodes[level]); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + ret = btrfs_update_block_group(trans, extent_key.objectid, + fs_info->nodesize, 1); + if (ret) { /* -ENOENT, logic error */ + btrfs_err(fs_info, "update block group failed for %llu %llu", + extent_key.objectid, extent_key.offset); + BUG(); + } - ret = btrfs_lookup_extent_info(trans, fs_info, - path->nodes[level]->start, - level, 1, &wc->refs[level], - &wc->flags[level]); - if (ret < 0) { - err = ret; - goto out_end_trans; - } - BUG_ON(wc->refs[level] == 0); + trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, + fs_info->nodesize); + return ret; +} - if (level == root_item->drop_level) - break; +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) +{ + struct btrfs_ref generic_ref = { 0 }; + int ret; - btrfs_tree_unlock(path->nodes[level]); - path->locks[level] = 0; - WARN_ON(wc->refs[level] != 1); - level--; - } - } + BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); - wc->level = level; - wc->shared_level = -1; - wc->stage = DROP_REFERENCE; 
- wc->update_ref = update_ref; - wc->keep_locks = 0; - wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins->objectid, ins->offset, 0); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_ref_tree_mod(root->fs_info, &generic_ref); + ret = btrfs_add_delayed_data_ref(trans, &generic_ref, + ram_bytes, NULL, NULL); + return ret; +} - while (1) { +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; - ret = walk_down_tree(trans, root, path, wc); - if (ret < 0) { - err = ret; - break; - } + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exclude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(fs_info, ins->objectid, + ins->offset); + if (ret) + return ret; + } - ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); - if (ret < 0) { - err = ret; - break; - } + block_group = btrfs_lookup_block_group(fs_info, ins->objectid); + if (!block_group) + return -EINVAL; - if (ret > 0) { - BUG_ON(wc->stage != DROP_REFERENCE); - break; - } + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); - if (wc->stage == DROP_REFERENCE) { - wc->drop_level = wc->level; - btrfs_node_key_to_cpu(path->nodes[wc->drop_level], - &wc->drop_progress, - path->slots[wc->drop_level]); - } - btrfs_cpu_key_to_disk(&root_item->drop_progress, - &wc->drop_progress); - root_item->drop_level = wc->drop_level; + ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, + offset, ins, 1); + btrfs_put_block_group(block_group); + return ret; +} - BUG_ON(wc->level == 0); - if (btrfs_should_end_transaction(trans) || - (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { - ret = btrfs_update_root(trans, tree_root, - &root->root_key, - root_item); - if (ret) { - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_end_trans; - } +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, int level, u64 owner) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *buf; - btrfs_end_transaction_throttle(trans); - if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { - btrfs_debug(fs_info, - "drop snapshot early exit"); - err = -EAGAIN; - goto out_free; - } + buf = btrfs_find_create_tree_block(fs_info, bytenr); + if (IS_ERR(buf)) + return buf; - trans = btrfs_start_transaction(tree_root, 0); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; - } - if (block_rsv) - trans->block_rsv = block_rsv; - } + /* + * Extra safety check in case the extent tree is corrupted and extent + * allocator chooses to use a tree block which is already used and + * locked. 
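+	 * If the recorded lock owner is our own pid, taking btrfs_tree_lock()
+	 * below would deadlock on a lock we already hold, so refuse the
+	 * block with -EUCLEAN instead.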
+	 */
+	if (buf->lock_owner == current->pid) {
+		btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+			buf->start, btrfs_header_owner(buf), current->pid);
+		free_extent_buffer(buf);
+		return ERR_PTR(-EUCLEAN);
 	}
 
-	btrfs_release_path(path);
-	if (err)
-		goto out_end_trans;
-	ret = btrfs_del_root(trans, &root->root_key);
-	if (ret) {
-		btrfs_abort_transaction(trans, ret);
-		err = ret;
-		goto out_end_trans;
-	}
+	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
+	btrfs_tree_lock(buf);
+	btrfs_clean_tree_block(buf);
+	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
-	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
-		ret = btrfs_find_root(tree_root, &root->root_key, path,
-				      NULL, NULL);
-		if (ret < 0) {
-			btrfs_abort_transaction(trans, ret);
-			err = ret;
-			goto out_end_trans;
-		} else if (ret > 0) {
-			/* if we fail to delete the orphan item this time
-			 * around, it'll get picked up the next time.
-			 *
-			 * The most common failure here is just -ENOENT.
-			 */
-			btrfs_del_orphan_item(trans, tree_root,
-					      root->root_key.objectid);
-		}
-	}
+	btrfs_set_lock_blocking_write(buf);
+	set_extent_buffer_uptodate(buf);
 
-	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
-		btrfs_add_dropped_root(trans, root);
+	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_level(buf, level);
+	btrfs_set_header_bytenr(buf, buf->start);
+	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(buf, owner);
+	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
+	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		buf->log_index = root->log_transid % 2;
+		/*
+		 * we allow two log transactions at a time, so use different
+		 * EXTENT bits to differentiate dirty pages.
+		 */
+		if (buf->log_index == 0)
+			set_extent_dirty(&root->dirty_log_pages, buf->start,
+					buf->start + buf->len - 1, GFP_NOFS);
+		else
+			set_extent_new(&root->dirty_log_pages, buf->start,
+					buf->start + buf->len - 1);
 	} else {
-		free_extent_buffer(root->node);
-		free_extent_buffer(root->commit_root);
-		btrfs_put_fs_root(root);
+		buf->log_index = -1;
+		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+				 buf->start + buf->len - 1, GFP_NOFS);
 	}
-	root_dropped = true;
-out_end_trans:
-	btrfs_end_transaction_throttle(trans);
-out_free:
-	kfree(wc);
-	btrfs_free_path(path);
-out:
-	/*
-	 * So if we need to stop dropping the snapshot for whatever reason we
-	 * need to make sure to add it back to the dead root list so that we
-	 * keep trying to do the work later.  This also cleans up roots if we
-	 * don't have it in the radix (like when we recover after a power fail
-	 * or unmount) so we don't leak memory.
-	 */
-	if (!for_reloc && !root_dropped)
-		btrfs_add_dead_root(root);
-	if (err && err != -EAGAIN)
-		btrfs_handle_fs_error(fs_info, err, NULL);
-	return err;
+	trans->dirty = true;
+	/* this returns a buffer locked for blocking */
+	return buf;
 }
 
 /*
- * drop subtree rooted at tree block 'node'.
- *
- * NOTE: this function will unlock and release tree block 'node'
- * only used by relocation code
+ * finds a free extent and does all the dirty work required for allocation.
+ * Returns the tree buffer or an ERR_PTR on error.
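+ *
+ * On failure, everything reserved along the way is unwound in reverse
+ * order: the delayed extent op, the buffer, the reserved extent and
+ * finally the block reservation (see the out_* labels below).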
*/ -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent) +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - struct walk_control *wc; - int level; - int parent_level; - int ret = 0; - int wret; - - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + struct btrfs_key ins; + struct btrfs_block_rsv *block_rsv; + struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { 0 }; + u64 flags = 0; + int ret; + u32 blocksize = fs_info->nodesize; + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); - return -ENOMEM; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(fs_info)) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + level, root_objectid); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; } +#endif - btrfs_assert_tree_locked(parent); - parent_level = btrfs_header_level(parent); - extent_buffer_get(parent); - path->nodes[parent_level] = parent; - path->slots[parent_level] = btrfs_header_nritems(parent); + block_rsv = btrfs_use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); - btrfs_assert_tree_locked(node); - level = btrfs_header_level(node); - path->nodes[level] = node; - path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, + empty_size, hint, &ins, 0, 0); + if (ret) + goto out_unuse; - wc->refs[parent_level] = 1; - wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; - wc->level = level; - wc->shared_level = -1; - wc->stage = DROP_REFERENCE; - wc->update_ref = 0; - wc->keep_locks = 1; - wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, + root_objectid); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } - while (1) { - wret = walk_down_tree(trans, root, path, wc); - if (wret < 0) { - ret = wret; - break; + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + extent_op = btrfs_alloc_delayed_extent_op(); + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; } + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = skinny_metadata ? 
false : true; + extent_op->update_flags = true; + extent_op->is_data = false; + extent_op->level = level; - wret = walk_up_tree(trans, root, path, wc, parent_level); - if (wret < 0) - ret = wret; - if (wret != 0) - break; + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins.objectid, ins.offset, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level, root_objectid); + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, + extent_op, NULL, NULL); + if (ret) + goto out_free_delayed; } + return buf; - kfree(wc); - btrfs_free_path(path); - return ret; +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); +out_unuse: + btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); + return ERR_PTR(ret); } -static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 num_devices; - u64 stripped; - - /* - * if restripe for this chunk_type is on pick target profile and - * return, otherwise do the usual balance - */ - stripped = get_restripe_target(fs_info, flags); - if (stripped) - return extended_to_chunk(stripped); - - num_devices = fs_info->fs_devices->rw_devices; - - stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | - BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + struct btrfs_key drop_progress; + int drop_level; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; + int restarted; +}; - if (num_devices == 1) { - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 - /* turn raid0 into single device chunks */ - if (flags & BTRFS_BLOCK_GROUP_RAID0) - return stripped; +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 generation; + u64 refs; + u64 flags; + u32 nritems; + struct btrfs_key key; + struct extent_buffer *eb; + int ret; + int slot; + int nread = 0; - /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID10)) - return stripped | BTRFS_BLOCK_GROUP_DUP; + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); } else { - /* they already had raid on here, just return */ - if (flags & stripped) - return flags; - - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* switch duplicated blocks with raid1 */ - if (flags & BTRFS_BLOCK_GROUP_DUP) - return stripped | BTRFS_BLOCK_GROUP_RAID1; - - /* this is drive concat, leave it alone */ + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); } - return flags; -} + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); -static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) -{ - struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; - u64 sinfo_used; - u64 min_allocable_bytes; - int ret = -ENOSPC; + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + 
if (nread >= wc->reada_count) + break; - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. - */ - if ((sinfo->flags & - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = SZ_1M; - else - min_allocable_bytes = 0; + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); + if (slot == path->slots[wc->level]) + goto reada; - if (cache->ro) { - cache->ro++; - ret = 0; - goto out; - } + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) + continue; - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo_used = btrfs_space_info_used(sinfo, true); + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, + wc->level - 1, 1, &refs, + &flags); + /* We don't care about errors in readahead. */ + if (ret < 0) + continue; + BUG_ON(refs == 0); - if (sinfo_used + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { - sinfo->bytes_readonly += num_bytes; - cache->ro++; - list_add_tail(&cache->ro_list, &sinfo->ro_bgs); - ret = 0; - } -out: - spin_unlock(&cache->lock); - spin_unlock(&sinfo->lock); - if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { - btrfs_info(cache->fs_info, - "unable to make block group %llu ro", - cache->key.objectid); - btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", - sinfo_used, num_bytes, min_allocable_bytes); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; + + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + } +reada: + readahead_tree_block(fs_info, bytenr); + nread++; } - return ret; + wc->reada_slot = slot; } -int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) - +/* + * helper to process tree block while walking down the tree. + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int lookup_info) { - struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_trans_handle *trans; - u64 alloc_flags; + struct btrfs_fs_info *fs_info = root->fs_info; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; -again: - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; /* - * we're not allowed to set block groups readonly after the dirty - * block groups cache has started writing. 
If it already started, - * back off and let this transaction commit + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. */ - mutex_lock(&fs_info->ro_block_group_mutex); - if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { - u64 transid = trans->transid; - - mutex_unlock(&fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans); - - ret = btrfs_wait_for_commit(fs_info, transid); + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, fs_info, + eb->start, level, 1, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret == -ENOMEM); if (ret) return ret; - goto again; - } - - /* - * if we are changing raid levels, try to allocate a corresponding - * block group with the new raid level. - */ - alloc_flags = update_block_group_flags(fs_info, cache->flags); - if (alloc_flags != cache->flags) { - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); - /* - * ENOSPC is allowed here, we may have enough space - * already allocated at the new raid level to - * carry on - */ - if (ret == -ENOSPC) - ret = 0; - if (ret < 0) - goto out; - } - - ret = inc_block_group_ro(cache, 0); - if (!ret) - goto out; - alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - ret = inc_block_group_ro(cache, 0); -out: - if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - alloc_flags = update_block_group_flags(fs_info, cache->flags); - mutex_lock(&fs_info->chunk_mutex); - check_system_chunk(trans, alloc_flags); - mutex_unlock(&fs_info->chunk_mutex); + BUG_ON(wc->refs[level] == 0); } - mutex_unlock(&fs_info->ro_block_group_mutex); - - btrfs_end_transaction(trans); - return ret; -} - -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) -{ - u64 alloc_flags = get_alloc_profile(trans->fs_info, type); - - return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); -} - -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. 
-	 */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
-	struct btrfs_block_group_cache *block_group;
-	u64 free_bytes = 0;
-	int factor;
-
-	/* It's df, we don't care if it's racy */
-	if (list_empty(&sinfo->ro_bgs))
-		return 0;
-
-	spin_lock(&sinfo->lock);
-	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
-		spin_lock(&block_group->lock);
+	if (wc->stage == DROP_REFERENCE) {
+		if (wc->refs[level] > 1)
+			return 1;
 
-		if (!block_group->ro) {
-			spin_unlock(&block_group->lock);
-			continue;
+		if (path->locks[level] && !wc->keep_locks) {
+			btrfs_tree_unlock_rw(eb, path->locks[level]);
+			path->locks[level] = 0;
 		}
+		return 0;
+	}
 
-		factor = btrfs_bg_type_to_factor(block_group->flags);
-		free_bytes += (block_group->key.offset -
-			       btrfs_block_group_used(&block_group->item)) *
-			       factor;
-
-		spin_unlock(&block_group->lock);
+	/* wc->stage == UPDATE_BACKREF */
+	if (!(wc->flags[level] & flag)) {
+		BUG_ON(!path->locks[level]);
+		ret = btrfs_inc_ref(trans, root, eb, 1);
+		BUG_ON(ret); /* -ENOMEM */
+		ret = btrfs_dec_ref(trans, root, eb, 0);
+		BUG_ON(ret); /* -ENOMEM */
+		ret = btrfs_set_disk_extent_flags(trans, eb->start,
+						  eb->len, flag,
+						  btrfs_header_level(eb), 0);
+		BUG_ON(ret); /* -ENOMEM */
+		wc->flags[level] |= flag;
 	}
-	spin_unlock(&sinfo->lock);
 
-	return free_bytes;
+	/*
+	 * the block is shared by multiple trees, so it's not good to
+	 * keep the tree lock
+	 */
+	if (path->locks[level] && level > 0) {
+		btrfs_tree_unlock_rw(eb, path->locks[level]);
+		path->locks[level] = 0;
+	}
+	return 0;
 }
 
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
+/*
+ * This is used to verify a ref exists for this root to deal with a bug where we
+ * would have a drop_progress key that hadn't been updated properly.
+ */
+static int check_ref_exists(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr, u64 parent,
+			    int level)
 {
-	struct btrfs_space_info *sinfo = cache->space_info;
-	u64 num_bytes;
+	struct btrfs_path *path;
+	struct btrfs_extent_inline_ref *iref;
+	int ret;
 
-	BUG_ON(!cache->ro);
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
-	spin_lock(&sinfo->lock);
-	spin_lock(&cache->lock);
-	if (!--cache->ro) {
-		num_bytes = cache->key.offset - cache->reserved -
-			    cache->pinned - cache->bytes_super -
-			    btrfs_block_group_used(&cache->item);
-		sinfo->bytes_readonly -= num_bytes;
-		list_del_init(&cache->ro_list);
-	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&sinfo->lock);
+	ret = lookup_extent_backref(trans, path, &iref, bytenr,
+				    root->fs_info->nodesize, parent,
+				    root->root_key.objectid, level, 0);
+	btrfs_free_path(path);
+	if (ret == -ENOENT)
+		return 0;
+	if (ret < 0)
+		return ret;
+	return 1;
 }
 
 /*
- * Checks to see if it's even possible to relocate this block group.
+ * helper to process tree block pointer.
  *
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
- * ok to go ahead and try.
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need to update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back refs, this function drops the reference
+ * to the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
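+ *
+ * return value 1 is also used after this function has dropped its own
+ * reference to the block (the no_delete path); the caller then just
+ * advances to the next slot.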
*/ -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) { - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *device; - u64 min_free; - u64 dev_min = 1; - u64 dev_nr = 0; - u64 target; - int debug; - int index; - int full = 0; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 generation; + u64 parent; + struct btrfs_key key; + struct btrfs_key first_key; + struct btrfs_ref ref = { 0 }; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; int ret = 0; + bool need_account = false; - debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); - - block_group = btrfs_lookup_block_group(fs_info, bytenr); - - /* odd, couldn't find the block group, leave it alone */ - if (!block_group) { - if (debug) - btrfs_warn(fs_info, - "can't find block group for bytenr %llu", - bytenr); - return -1; - } - - min_free = btrfs_block_group_used(&block_group->item); - - /* no bytes used, we're good */ - if (!min_free) - goto out; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - - full = space_info->full; - + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); /* - * if this is the last block group we have in this space, we can't - * relocate it unless we're able to allocate a new chunk below. - * - * Otherwise, we need to make sure we have room in the space to handle - * all of the extents from this block group. If we can, we're good + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree */ - if ((space_info->total_bytes != block_group->key.offset) && - (btrfs_space_info_used(space_info, false) + min_free < - space_info->total_bytes)) { - spin_unlock(&space_info->lock); - goto out; + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; } - spin_unlock(&space_info->lock); - /* - * ok we don't have enough space, but maybe we have free space on our - * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. if this block - * group is going to be restriped, run checks against the target - * profile instead of the current one. 
- */ - ret = -1; + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + btrfs_node_key_to_cpu(path->nodes[level], &first_key, + path->slots[level]); - /* - * index: - * 0: raid10 - * 1: raid1 - * 2: dup - * 3: raid0 - * 4: single - */ - target = get_restripe_target(fs_info, block_group->flags); - if (target) { - index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); - } else { - /* - * this is just a balance, so if we were marked as full - * we know there is no space for a new chunk - */ - if (full) { - if (debug) - btrfs_warn(fs_info, - "no space to alloc new chunk for block group %llu", - block_group->key.objectid); - goto out; - } + next = find_extent_buffer(fs_info, bytenr); + if (!next) { + next = btrfs_find_create_tree_block(fs_info, bytenr); + if (IS_ERR(next)) + return PTR_ERR(next); - index = btrfs_bg_flags_to_raid_index(block_group->flags); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, + level - 1); + reada = 1; } + btrfs_tree_lock(next); + btrfs_set_lock_blocking_write(next); + + ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, + &wc->refs[level - 1], + &wc->flags[level - 1]); + if (ret < 0) + goto out_unlock; - if (index == BTRFS_RAID_RAID10) { - dev_min = 4; - /* Divide by 2 */ - min_free >>= 1; - } else if (index == BTRFS_RAID_RAID1) { - dev_min = 2; - } else if (index == BTRFS_RAID_DUP) { - /* Multiply by 2 */ - min_free <<= 1; - } else if (index == BTRFS_RAID_RAID0) { - dev_min = fs_devices->rw_devices; - min_free = div64_u64(min_free, dev_min); + if (unlikely(wc->refs[level - 1] == 0)) { + btrfs_err(fs_info, "Missing references."); + ret = -EIO; + goto out_unlock; } + *lookup_info = 0; - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 dev_offset; + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + need_account = true; + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; - /* - * check to make sure we can actually find a chunk with enough - * space to fit our block group in. 
- */ - if (device->total_bytes > device->bytes_used + min_free && - !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = find_free_dev_extent(device, min_free, - &dev_offset, NULL); - if (!ret) - dev_nr++; + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; - if (dev_nr >= dev_min) - break; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; - ret = -1; + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; } - if (debug && ret == -1) - btrfs_warn(fs_info, - "no space to allocate a new chunk for block group %llu", - block_group->key.objectid); - mutex_unlock(&fs_info->chunk_mutex); -out: - btrfs_put_block_group(block_group); - return ret; -} - -static int find_first_block_group(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - struct btrfs_key *key) -{ - struct btrfs_root *root = fs_info->extent_root; - int ret = 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_block_group_item bg; - u64 flags; - int slot; - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - goto out; + if (!btrfs_buffer_uptodate(next, generation, 0)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(fs_info, bytenr, generation, level - 1, + &first_key); + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { + free_extent_buffer(next); + return -EIO; } - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - if (found_key.objectid >= key->objectid && - found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - em_tree = &root->fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, found_key.objectid, - found_key.offset); - read_unlock(&em_tree->lock); - if (!em) { - btrfs_err(fs_info, - "logical %llu len %llu found bg but no related chunk", - found_key.objectid, found_key.offset); - ret = -ENOENT; - } else if (em->start != found_key.objectid || - em->len != found_key.offset) { - btrfs_err(fs_info, - "block group %llu len %llu mismatch with chunk %llu len %llu", - found_key.objectid, found_key.offset, - em->start, em->len); - ret = -EUCLEAN; - } else { - read_extent_buffer(leaf, &bg, - btrfs_item_ptr_offset(leaf, slot), - sizeof(bg)); - flags = btrfs_block_group_flags(&bg) & - BTRFS_BLOCK_GROUP_TYPE_MASK; - - if (flags != (em->map_lookup->type & - BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", - found_key.objectid, - found_key.offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & - em->map_lookup->type)); - ret = -EUCLEAN; - } else { - ret = 0; - } + btrfs_tree_lock(next); + btrfs_set_lock_blocking_write(next); + } + + level--; + ASSERT(level == btrfs_header_level(next)); + if (level != btrfs_header_level(next)) { + btrfs_err(root->fs_info, "mismatched level"); + ret = -EIO; + goto out_unlock; + } + path->nodes[level] = next; + path->slots[level] = 0; + 
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + ASSERT(root->root_key.objectid == + btrfs_header_owner(path->nodes[level])); + if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level])) { + btrfs_err(root->fs_info, + "mismatched block owner"); + ret = -EIO; + goto out_unlock; + } + parent = 0; + } + + /* + * If we had a drop_progress we need to verify the refs are set + * as expected. If we find our ref then we know that from here + * on out everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, bytenr, parent, + level - 1); + if (ret < 0) + goto out_unlock; + if (ret == 0) + goto no_delete; + ret = 0; + wc->restarted = 0; + } + + /* + * Reloc tree doesn't contribute to qgroup numbers, and we have + * already accounted them at merge time (replace_path), + * thus we could skip expensive subtree trace here. + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + need_account) { + ret = btrfs_qgroup_trace_subtree(trans, next, + generation, level - 1); + if (ret) { + btrfs_err_rl(fs_info, + "Error %d accounting shared subtree. Quota is out of sync, rescan required.", + ret); } - free_extent_map(em); - goto out; } - path->slots[0]++; - } -out: - return ret; -} -void btrfs_put_block_group_cache(struct btrfs_fs_info *info) -{ - struct btrfs_block_group_cache *block_group; - u64 last = 0; + /* + * We need to update the next key in our walk control so we can + * update the drop_progress key accordingly. We don't care if + * find_next_key doesn't find a key because that means we're at + * the end and are going to clean up now. + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); - while (1) { - struct inode *inode; + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + fs_info->nodesize, parent); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + ret = btrfs_free_extent(trans, &ref); + if (ret) + goto out_unlock; + } +no_delete: + *lookup_info = 1; + ret = 1; - block_group = btrfs_lookup_first_block_group(info, last); - while (block_group) { - wait_block_group_cache_done(block_group); - spin_lock(&block_group->lock); - if (block_group->iref) - break; - spin_unlock(&block_group->lock); - block_group = next_block_group(block_group); - } - if (!block_group) { - if (last == 0) - break; - last = 0; - continue; - } +out_unlock: + btrfs_tree_unlock(next); + free_extent_buffer(next); - inode = block_group->inode; - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); - last = block_group->key.objectid + block_group->key.offset; - btrfs_put_block_group(block_group); - } + return ret; } /* - * Must be called only after stopping all workers, since we could have block - * group caching kthreads running, and therefore they could race with us if we - * freed the block groups before stopping them. + * helper to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. 
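+ * the block itself is passed to btrfs_free_tree_block(), which frees
+ * it if ours was the last reference.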
+ * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. */ -int btrfs_free_block_groups(struct btrfs_fs_info *info) +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) { - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_caching_control *caching_ctl; - struct rb_node *n; - - down_write(&info->commit_root_sem); - while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); - list_del(&caching_ctl->list); - put_caching_control(caching_ctl); - } - up_write(&info->commit_root_sem); - - spin_lock(&info->unused_bgs_lock); - while (!list_empty(&info->unused_bgs)) { - block_group = list_first_entry(&info->unused_bgs, - struct btrfs_block_group_cache, - bg_list); - list_del_init(&block_group->bg_list); - btrfs_put_block_group(block_group); - } - spin_unlock(&info->unused_bgs_lock); - - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { - block_group = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); - RB_CLEAR_NODE(&block_group->cache_node); - spin_unlock(&info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); - up_write(&block_group->space_info->groups_sem); - - /* - * We haven't cached this block group, which means we could - * possibly have excluded extents on this block group. - */ - if (block_group->cached == BTRFS_CACHE_NO || - block_group->cached == BTRFS_CACHE_ERROR) - free_excluded_extents(block_group); - - btrfs_remove_free_space_cache(block_group); - ASSERT(block_group->cached != BTRFS_CACHE_STARTED); - ASSERT(list_empty(&block_group->dirty_list)); - ASSERT(list_empty(&block_group->io_list)); - ASSERT(list_empty(&block_group->bg_list)); - ASSERT(atomic_read(&block_group->count) == 1); - btrfs_put_block_group(block_group); - - spin_lock(&info->block_group_cache_lock); - } - spin_unlock(&info->block_group_cache_lock); - - /* now that all the block groups are freed, go through and - * free all the space_info structs. This is only called during - * the final stages of unmount, and so we know nobody is - * using them. We call synchronize_rcu() once before we start, - * just to be on the safe side. - */ - synchronize_rcu(); + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; - btrfs_release_global_block_rsv(info); + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; - while (!list_empty(&info->space_info)) { - int i; + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; /* - * Do not hide this behind enospc_debug, this is actually - * important and indicates a real bug if this happens. + * check reference count again if the block isn't locked. 
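+		 * the count may have changed while we didn't hold the lock.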
+ * we should start walking down the tree again if reference + * count is one. */ - if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - list_del(&space_info->list); - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { - struct kobject *kobj; - kobj = space_info->block_group_kobjs[i]; - space_info->block_group_kobjs[i] = NULL; - if (kobj) { - kobject_del(kobj); - kobject_put(kobj); + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking_write(eb); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + + ret = btrfs_lookup_extent_info(trans, fs_info, + eb->start, level, 1, + &wc->refs[level], + &wc->flags[level]); + if (ret < 0) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return ret; + } + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + return 1; } } - kobject_del(&space_info->kobj); - kobject_put(&space_info->kobj); } - return 0; -} -static void link_block_group(struct btrfs_block_group_cache *cache) -{ - struct btrfs_space_info *space_info = cache->space_info; - struct btrfs_fs_info *fs_info = cache->fs_info; - int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; - - down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); - - if (first) { - struct raid_kobject *rkobj; - unsigned int nofs_flag; - int ret; + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); - /* - * Setup a NOFS context because kobject_add(), deep in its call - * chain, does GFP_KERNEL allocations, and we are often called - * in a context where if reclaim is triggered we can deadlock - * (we are either holding a transaction handle or some lock - * required for a transaction commit). 
- */ - nofs_flag = memalloc_nofs_save(); - rkobj = kzalloc(sizeof(*rkobj), GFP_KERNEL); - if (!rkobj) { - memalloc_nofs_restore(nofs_flag); - btrfs_warn(cache->fs_info, - "couldn't alloc memory for raid level kobject"); - return; + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); /* -ENOMEM */ + if (is_fstree(root->root_key.objectid)) { + ret = btrfs_qgroup_trace_leaf_items(trans, eb); + if (ret) { + btrfs_err_rl(fs_info, + "error %d accounting leaf items, quota is out of sync, rescan required", + ret); + } + } } - rkobj->flags = cache->flags; - kobject_init(&rkobj->kobj, &btrfs_raid_ktype); - ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", - btrfs_bg_type_to_raid_name(rkobj->flags)); - memalloc_nofs_restore(nofs_flag); - if (ret) { - kobject_put(&rkobj->kobj); - btrfs_warn(fs_info, - "failed to add kobject for block cache, ignoring"); - return; + /* make block locked assertion in btrfs_clean_tree_block happy */ + if (!path->locks[level] && + btrfs_header_generation(eb) == trans->transid) { + btrfs_tree_lock(eb); + btrfs_set_lock_blocking_write(eb); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - space_info->block_group_kobjs[index] = &rkobj->kobj; + btrfs_clean_tree_block(eb); } -} - -static struct btrfs_block_group_cache * -btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, - u64 start, u64 size) -{ - struct btrfs_block_group_cache *cache; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else if (root->root_key.objectid != btrfs_header_owner(eb)) + goto owner_mismatch; + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])) + goto owner_mismatch; } - cache->key.objectid = start; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - - cache->fs_info = fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); - set_free_space_tree_thresholds(cache); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - init_rwsem(&cache->data_rwsem); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->bg_list); - INIT_LIST_HEAD(&cache->ro_list); - INIT_LIST_HEAD(&cache->dirty_list); - INIT_LIST_HEAD(&cache->io_list); - btrfs_init_free_space_ctl(cache); - atomic_set(&cache->trimming, 0); - mutex_init(&cache->free_space_lock); - btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); - - return cache; -} - - -/* - * Iterate all chunks and verify that each of them has the corresponding block - * group - */ -static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) -{ - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct btrfs_block_group_cache *bg; - u64 start = 0; - int ret = 0; - - while (1) { - read_lock(&map_tree->lock); - /* - * lookup_extent_mapping will return the first extent map - * intersecting the range, so setting @len to 1 is enough to - * get the first chunk. 
- */ - em = lookup_extent_mapping(map_tree, start, 1); - read_unlock(&map_tree->lock); - if (!em) - break; + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return 0; - bg = btrfs_lookup_block_group(fs_info, em->start); - if (!bg) { - btrfs_err(fs_info, - "chunk start=%llu len=%llu doesn't have corresponding block group", - em->start, em->len); - ret = -EUCLEAN; - free_extent_map(em); - break; - } - if (bg->key.objectid != em->start || - bg->key.offset != em->len || - (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != - (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", - em->start, em->len, - em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, - bg->key.objectid, bg->key.offset, - bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); - ret = -EUCLEAN; - free_extent_map(em); - btrfs_put_block_group(bg); - break; - } - start = em->start + em->len; - free_extent_map(em); - btrfs_put_block_group(bg); - } - return ret; +owner_mismatch: + btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", + btrfs_header_owner(eb), root->root_key.objectid); + return -EUCLEAN; } -int btrfs_read_block_groups(struct btrfs_fs_info *info) +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) { - struct btrfs_path *path; + int level = wc->level; + int lookup_info = 1; int ret; - struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; - struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *leaf; - int need_clear = 0; - u64 cache_gen; - u64 feature; - int mixed; - - feature = btrfs_super_incompat_flags(info->super_copy); - mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); - - key.objectid = 0; - key.offset = 0; - key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = READA_FORWARD; - cache_gen = btrfs_super_cache_generation(info->super_copy); - if (btrfs_test_opt(info, SPACE_CACHE) && - btrfs_super_generation(info->super_copy) != cache_gen) - need_clear = 1; - if (btrfs_test_opt(info, CLEAR_CACHE)) - need_clear = 1; - - while (1) { - ret = find_first_block_group(info, path, &key); + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; - if (ret != 0) - goto error; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - cache = btrfs_create_block_group_cache(info, found_key.objectid, - found_key.offset); - if (!cache) { - ret = -ENOMEM; - goto error; - } - - if (need_clear) { - /* - * When we mount with old space cache, we need to - * set BTRFS_DC_CLEAR and set dirty flag. - * - * a) Setting 'BTRFS_DC_CLEAR' makes sure that we - * truncate the old free space cache inode and - * setup a new one. - * b) Setting 'dirty flag' makes sure that we flush - * the new space cache info onto disk. 
- */ - if (btrfs_test_opt(info, SPACE_CACHE)) - cache->disk_cache_state = BTRFS_DC_CLEAR; - } - - read_extent_buffer(leaf, &cache->item, - btrfs_item_ptr_offset(leaf, path->slots[0]), - sizeof(cache->item)); - cache->flags = btrfs_block_group_flags(&cache->item); - if (!mixed && - ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && - (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(info, -"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", - cache->key.objectid); - ret = -EINVAL; - goto error; - } - - key.objectid = found_key.objectid + found_key.offset; - btrfs_release_path(path); - - /* - * We need to exclude the super stripes now so that the space - * info has super bytes accounted for, otherwise we'll think - * we have more space than we actually do. - */ - ret = exclude_super_stripes(cache); - if (ret) { - /* - * We may have excluded something, so call this just in - * case. - */ - free_excluded_extents(cache); - btrfs_put_block_group(cache); - goto error; - } - - /* - * check for two cases, either we are full, and therefore - * don't need to bother with the caching work since we won't - * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _a_lot_ of - * time, particularly in the full case. - */ - if (found_key.offset == btrfs_block_group_used(&cache->item)) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - free_excluded_extents(cache); - } else if (btrfs_block_group_used(&cache->item) == 0) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, found_key.objectid, - found_key.objectid + - found_key.offset); - free_excluded_extents(cache); - } - - ret = btrfs_add_block_group_cache(info, cache); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - goto error; - } - trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - - cache->space_info = space_info; + if (level == 0) + break; - link_block_group(cache); + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; - set_avail_alloc_bits(info, cache->flags); - if (btrfs_chunk_readonly(info, cache->key.objectid)) { - inc_block_group_ro(cache, 1); - } else if (btrfs_block_group_used(&cache->item) == 0) { - ASSERT(list_empty(&cache->bg_list)); - btrfs_mark_bg_unused(cache); - } + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } else if (ret < 0) + return ret; + level = wc->level; } - - list_for_each_entry_rcu(space_info, &info->space_info, list) { - if (!(get_alloc_profile(info, space_info->flags) & - (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID56_MASK | - BTRFS_BLOCK_GROUP_DUP))) - continue; - /* - * avoid allocating from un-mirrored block group if there are - * mirrored block groups. 
-		 */
-		list_for_each_entry(cache,
-				&space_info->block_groups[BTRFS_RAID_RAID0],
-				list)
-			inc_block_group_ro(cache, 1);
-		list_for_each_entry(cache,
-				&space_info->block_groups[BTRFS_RAID_SINGLE],
-				list)
-			inc_block_group_ro(cache, 1);
-	}
-
-	btrfs_init_global_block_rsv(info);
-	ret = check_chunk_block_group_mappings(info);
-error:
-	btrfs_free_path(path);
-	return ret;
+	return 0;
 }
 
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct walk_control *wc, int max_level)
 {
-	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_block_group_item item;
-	struct btrfs_key key;
-	int ret = 0;
-
-	if (!trans->can_flush_pending_bgs)
-		return;
-
-	while (!list_empty(&trans->new_bgs)) {
-		block_group = list_first_entry(&trans->new_bgs,
-					       struct btrfs_block_group_cache,
-					       bg_list);
-		if (ret)
-			goto next;
+	int level = wc->level;
+	int ret;
 
-		spin_lock(&block_group->lock);
-		memcpy(&item, &block_group->item, sizeof(item));
-		memcpy(&key, &block_group->key, sizeof(key));
-		spin_unlock(&block_group->lock);
+	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+	while (level < max_level && path->nodes[level]) {
+		wc->level = level;
+		if (path->slots[level] + 1 <
+		    btrfs_header_nritems(path->nodes[level])) {
+			path->slots[level]++;
+			return 0;
+		} else {
+			ret = walk_up_proc(trans, root, path, wc);
+			if (ret > 0)
+				return 0;
+			if (ret < 0)
+				return ret;
 
-		ret = btrfs_insert_item(trans, extent_root, &key, &item,
-					sizeof(item));
-		if (ret)
-			btrfs_abort_transaction(trans, ret);
-		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
-		if (ret)
-			btrfs_abort_transaction(trans, ret);
-		add_block_group_free_space(trans, block_group);
-		/* already aborted the transaction if it failed. */
-next:
-		btrfs_delayed_refs_rsv_release(fs_info, 1);
-		list_del_init(&block_group->bg_list);
+			if (path->locks[level]) {
+				btrfs_tree_unlock_rw(path->nodes[level],
+						     path->locks[level]);
+				path->locks[level] = 0;
+			}
+			free_extent_buffer(path->nodes[level]);
+			path->nodes[level] = NULL;
+			level++;
+		}
 	}
-	btrfs_trans_release_chunk_metadata(trans);
+	return 1;
 }
 
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
-			   u64 type, u64 chunk_offset, u64 size)
+/*
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree, freeing any blocks that are only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found, this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also makes sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
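+ *
+ * the current position is recorded in the root item's drop_progress
+ * key after each batch of work, so an interrupted drop can resume
+ * where it stopped.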
+ * + * If called with for_reloc == 0, may exit early with -EAGAIN + */ +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; int ret; + int level; + bool root_dropped = false; - btrfs_set_log_full_commit(trans); - - cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); - if (!cache) - return -ENOMEM; + btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); - btrfs_set_block_group_used(&cache->item, bytes_used); - btrfs_set_block_group_chunk_objectid(&cache->item, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_block_group_flags(&cache->item, type); + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } - cache->flags = type; - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - cache->needs_free_space = 1; - ret = exclude_super_stripes(cache); - if (ret) { - /* - * We may have excluded something, so call this just in - * case. - */ - free_excluded_extents(cache); - btrfs_put_block_group(cache); - return ret; + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + err = -ENOMEM; + goto out; } - add_new_free_space(cache, chunk_offset, chunk_offset + size); + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } - free_excluded_extents(cache); + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; + if (block_rsv) + trans->block_rsv = block_rsv; - bytes_used += new_bytes_used >> 1; - fragment_free_space(cache); - } -#endif /* - * Ensure the corresponding space_info object is created and - * assigned to our block group. We want our bg to be added to the rbtree - * with its ->space_info set. + * This will help us catch people modifying the fs tree while we're + * dropping it. It is unsafe to mess with the fs tree while it's being + * dropped as we unlock the root node and parent nodes as we walk down + * the tree, assuming nothing will change. If something does change + * then we'll have stale information and drop references to blocks we've + * already dropped. 
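+	 * The BTRFS_ROOT_DELETING bit set below gives other code paths a
+	 * way to detect that this root is in the middle of being dropped.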
*/ - cache->space_info = btrfs_find_space_info(fs_info, cache->flags); - ASSERT(cache->space_info); - - ret = btrfs_add_block_group_cache(fs_info, cache); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; - } + set_bit(BTRFS_ROOT_DELETING, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_header_level(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking_write(path->nodes[level]); + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); - /* - * Now that our block group has its ->space_info set and is inserted in - * the rbtree, update the space info's counters. - */ - trace_btrfs_add_block_group(fs_info, cache, 1); - btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); - btrfs_update_global_block_rsv(fs_info); + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out_end_trans; + } + WARN_ON(ret > 0); - link_block_group(cache); + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + btrfs_unlock_up_safe(path, 0); - list_add_tail(&cache->bg_list, &trans->new_bgs); - trans->delayed_ref_updates++; - btrfs_update_delayed_refs_rsv(trans); + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + btrfs_set_lock_blocking_write(path->nodes[level]); + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - set_avail_alloc_bits(fs_info, type); - return 0; -} + ret = btrfs_lookup_extent_info(trans, fs_info, + path->nodes[level]->start, + level, 1, &wc->refs[level], + &wc->flags[level]); + if (ret < 0) { + err = ret; + goto out_end_trans; + } + BUG_ON(wc->refs[level] == 0); -static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = chunk_to_extended(flags) & - BTRFS_EXTENDED_PROFILE_MASK; - - write_seqlock(&fs_info->profiles_lock); - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits &= ~extra_flags; - write_sequnlock(&fs_info->profiles_lock); -} + if (level == root_item->drop_level) + break; -/* - * Clear incompat bits for the following feature(s): - * - * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group - * in the whole filesystem - */ -static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { - struct list_head *head = &fs_info->space_info; - struct btrfs_space_info *sinfo; - - list_for_each_entry_rcu(sinfo, head, list) { - bool found = false; - - down_read(&sinfo->groups_sem); - if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) - found = true; - if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) - found = true; - up_read(&sinfo->groups_sem); - - if (found) - return; + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; + WARN_ON(wc->refs[level] != 1); + level--; } - btrfs_clear_fs_incompat(fs_info, 
RAID56); } -} - -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_path *path; - struct btrfs_block_group_cache *block_group; - struct btrfs_free_cluster *cluster; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_key key; - struct inode *inode; - struct kobject *kobj = NULL; - int ret; - int index; - int factor; - struct btrfs_caching_control *caching_ctl = NULL; - bool remove_em; - bool remove_rsv = false; - block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!block_group); - BUG_ON(!block_group->ro); - - trace_btrfs_remove_block_group(block_group); - /* - * Free the reserved super bytes from this block group before - * remove it. - */ - free_excluded_extents(block_group); - btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, - block_group->key.offset); + wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); - memcpy(&key, &block_group->key, sizeof(key)); - index = btrfs_bg_flags_to_raid_index(block_group->flags); - factor = btrfs_bg_type_to_factor(block_group->flags); + while (1) { - /* make sure this block group isn't part of an allocation cluster */ - cluster = &fs_info->data_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + err = ret; + break; + } - /* - * make sure this block group isn't part of a metadata - * allocation cluster - */ - cluster = &fs_info->meta_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + err = ret; + break; + } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); + break; + } - /* - * get the inode first so any iput calls done for the io_list - * aren't the final iput (no unlinks allowed now) - */ - inode = lookup_free_space_inode(block_group, path); + if (wc->stage == DROP_REFERENCE) { + wc->drop_level = wc->level; + btrfs_node_key_to_cpu(path->nodes[wc->drop_level], + &wc->drop_progress, + path->slots[wc->drop_level]); + } + btrfs_cpu_key_to_disk(&root_item->drop_progress, + &wc->drop_progress); + root_item->drop_level = wc->drop_level; - mutex_lock(&trans->transaction->cache_write_mutex); - /* - * Make sure our free space cache IO is done before removing the - * free space inode - */ - spin_lock(&trans->transaction->dirty_bgs_lock); - if (!list_empty(&block_group->io_list)) { - list_del_init(&block_group->io_list); + BUG_ON(wc->level == 0); + if (btrfs_should_end_transaction(trans) || + (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + if (ret) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } - WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + btrfs_end_transaction_throttle(trans); + if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { + btrfs_debug(fs_info, + "drop snapshot early exit"); + err = -EAGAIN; + 
goto out_free; + } - spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_wait_cache_io(trans, block_group, path); - btrfs_put_block_group(block_group); - spin_lock(&trans->transaction->dirty_bgs_lock); + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_free; + } + if (block_rsv) + trans->block_rsv = block_rsv; + } } + btrfs_release_path(path); + if (err) + goto out_end_trans; - if (!list_empty(&block_group->dirty_list)) { - list_del_init(&block_group->dirty_list); - remove_rsv = true; - btrfs_put_block_group(block_group); + ret = btrfs_del_root(trans, &root->root_key); + if (ret) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; } - spin_unlock(&trans->transaction->dirty_bgs_lock); - mutex_unlock(&trans->transaction->cache_write_mutex); - if (!IS_ERR(inode)) { - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) { - btrfs_add_delayed_iput(inode); - goto out; - } - clear_nlink(inode); - /* One for the block groups ref */ - spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - iput(inode); - } else { - spin_unlock(&block_group->lock); + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + err = ret; + goto out_end_trans; + } else if (ret > 0) { + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); } - /* One for our lookup ref */ - btrfs_add_delayed_iput(inode); } - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = block_group->key.objectid; - key.type = 0; - - ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0) - btrfs_release_path(path); - if (ret == 0) { - ret = btrfs_del_item(trans, tree_root, path); - if (ret) - goto out; - btrfs_release_path(path); + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + btrfs_add_dropped_root(trans, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + btrfs_put_fs_root(root); } - - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &fs_info->block_group_cache_tree); - RB_CLEAR_NODE(&block_group->cache_node); - - if (fs_info->first_logical_byte == block_group->key.objectid) - fs_info->first_logical_byte = (u64)-1; - spin_unlock(&fs_info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); + root_dropped = true; +out_end_trans: + btrfs_end_transaction_throttle(trans); +out_free: + kfree(wc); + btrfs_free_path(path); +out: /* - * we must use list_del_init so people can check to see if they - * are still on the list after taking the semaphore + * So if we need to stop dropping the snapshot for whatever reason we + * need to make sure to add it back to the dead root list so that we + * keep trying to do the work later. This also cleans up roots if we + * don't have it in the radix (like when we recover after a power fail + * or unmount) so we don't leak memory. 
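+	 * Note that -EAGAIN is deliberately not escalated below: it only
+	 * means the drop exited early (see the btrfs_need_cleaner_sleep()
+	 * check above) and will be retried from drop_progress later.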
*/ - list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) { - kobj = block_group->space_info->block_group_kobjs[index]; - block_group->space_info->block_group_kobjs[index] = NULL; - clear_avail_alloc_bits(fs_info, block_group->flags); - } - up_write(&block_group->space_info->groups_sem); - clear_incompat_bg_bits(fs_info, block_group->flags); - if (kobj) { - kobject_del(kobj); - kobject_put(kobj); - } - - if (block_group->has_caching_ctl) - caching_ctl = get_caching_control(block_group); - if (block_group->cached == BTRFS_CACHE_STARTED) - wait_block_group_cache_done(block_group); - if (block_group->has_caching_ctl) { - down_write(&fs_info->commit_root_sem); - if (!caching_ctl) { - struct btrfs_caching_control *ctl; - - list_for_each_entry(ctl, - &fs_info->caching_block_groups, list) - if (ctl->block_group == block_group) { - caching_ctl = ctl; - refcount_inc(&caching_ctl->count); - break; - } - } - if (caching_ctl) - list_del_init(&caching_ctl->list); - up_write(&fs_info->commit_root_sem); - if (caching_ctl) { - /* Once for the caching bgs list and once for us. */ - put_caching_control(caching_ctl); - put_caching_control(caching_ctl); - } - } + if (!for_reloc && !root_dropped) + btrfs_add_dead_root(root); + if (err && err != -EAGAIN) + btrfs_handle_fs_error(fs_info, err, NULL); + return err; +} - spin_lock(&trans->transaction->dirty_bgs_lock); - WARN_ON(!list_empty(&block_group->dirty_list)); - WARN_ON(!list_empty(&block_group->io_list)); - spin_unlock(&trans->transaction->dirty_bgs_lock); +/* + * drop subtree rooted at tree block 'node'. + * + * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code + */ +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + struct walk_control *wc; + int level; + int parent_level; + int ret = 0; + int wret; - btrfs_remove_free_space_cache(block_group); + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - spin_lock(&block_group->space_info->lock); - list_del_init(&block_group->ro_list); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - WARN_ON(block_group->space_info->total_bytes - < block_group->key.offset); - WARN_ON(block_group->space_info->bytes_readonly - < block_group->key.offset); - WARN_ON(block_group->space_info->disk_total - < block_group->key.offset * factor); + wc = kzalloc(sizeof(*wc), GFP_NOFS); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; } - block_group->space_info->total_bytes -= block_group->key.offset; - block_group->space_info->bytes_readonly -= block_group->key.offset; - block_group->space_info->disk_total -= block_group->key.offset * factor; - - spin_unlock(&block_group->space_info->lock); - - memcpy(&key, &block_group->key, sizeof(key)); - mutex_lock(&fs_info->chunk_mutex); - spin_lock(&block_group->lock); - block_group->removed = 1; - /* - * At this point trimming can't start on this block group, because we - * removed the block group from the tree fs_info->block_group_cache_tree - * so no one can't find it anymore and even if someone already got this - * block group before we removed it from the rbtree, they have already - * incremented block_group->trimming - if they didn't, they won't find - * any free space entries because we already removed them all when we - * called 
btrfs_remove_free_space_cache(). - * - * And we must not remove the extent map from the fs_info->mapping_tree - * to prevent the same logical address range and physical device space - * ranges from being reused for a new block group. This is because our - * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is - * completely transactionless, so while it is trimming a range the - * currently running transaction might finish and a new one start, - * allowing for new block groups to be created that can reuse the same - * physical device locations unless we take this special care. - * - * There may also be an implicit trim operation if the file system - * is mounted with -odiscard. The same protections must remain - * in place until the extents have been discarded completely when - * the transaction commit has completed. - */ - remove_em = (atomic_read(&block_group->trimming) == 0); - spin_unlock(&block_group->lock); - - mutex_unlock(&fs_info->chunk_mutex); - - ret = remove_block_group_free_space(trans, block_group); - if (ret) - goto out; - - btrfs_put_block_group(block_group); - btrfs_put_block_group(block_group); + btrfs_assert_tree_locked(parent); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -EIO; - if (ret < 0) - goto out; + btrfs_assert_tree_locked(node); + level = btrfs_header_level(node); + path->nodes[level] = node; + path->slots[level] = 0; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); - if (remove_em) { - struct extent_map_tree *em_tree; + while (1) { + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { + ret = wret; + break; + } - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - /* once for the tree */ - free_extent_map(em); + wret = walk_up_tree(trans, root, path, wc, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; } -out: - if (remove_rsv) - btrfs_delayed_refs_rsv_release(fs_info, 1); + + kfree(wc); btrfs_free_path(path); return ret; } -struct btrfs_trans_handle * -btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, - const u64 chunk_offset) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - unsigned int num_items; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - ASSERT(em && em->start == chunk_offset); - - /* - * We need to reserve 3 + N units from the metadata space info in order - * to remove a block group (done at btrfs_remove_chunk() and at - * btrfs_remove_block_group()), which are used for: - * - * 1 unit for adding the free space inode's orphan (located in the tree - * of tree roots). - * 1 unit for deleting the block group item (located in the extent - * tree). - * 1 unit for deleting the free space item (located in tree of tree - * roots). - * N units for deleting N device extent items corresponding to each - * stripe (located in the device tree). 
- * - * In order to remove a block group we also need to reserve units in the - * system space info in order to update the chunk tree (update one or - * more device items and remove one chunk item), but this is done at - * btrfs_remove_chunk() through a call to check_system_chunk(). - */ - map = em->map_lookup; - num_items = 3 + map->num_stripes; - free_extent_map(em); - - return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items, 1); -} - /* - * Process the unused_bgs list and remove any that don't have any allocated - * space inside of them. + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. */ -void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - int ret = 0; - - if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) - return; - - spin_lock(&fs_info->unused_bgs_lock); - while (!list_empty(&fs_info->unused_bgs)) { - u64 start, end; - int trimming; - - block_group = list_first_entry(&fs_info->unused_bgs, - struct btrfs_block_group_cache, - bg_list); - list_del_init(&block_group->bg_list); - - space_info = block_group->space_info; - - if (ret || btrfs_mixed_space_info(space_info)) { - btrfs_put_block_group(block_group); - continue; - } - spin_unlock(&fs_info->unused_bgs_lock); + u64 free_bytes = 0; + int factor; - mutex_lock(&fs_info->delete_unused_bgs_mutex); + /* It's df, we don't care if it's racy */ + if (list_empty(&sinfo->ro_bgs)) + return 0; - /* Don't want to race with allocators so take the groups_sem */ - down_write(&space_info->groups_sem); + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { spin_lock(&block_group->lock); - if (block_group->reserved || block_group->pinned || - btrfs_block_group_used(&block_group->item) || - block_group->ro || - list_is_singular(&block_group->list)) { - /* - * We want to bail if we made new allocations or have - * outstanding allocations in this block group. We do - * the ro check in case balance is currently acting on - * this block group. - */ - trace_btrfs_skip_unused_block_group(block_group); - spin_unlock(&block_group->lock); - up_write(&space_info->groups_sem); - goto next; - } - spin_unlock(&block_group->lock); - - /* We don't want to force the issue, only flip if it's ok. */ - ret = inc_block_group_ro(block_group, 0); - up_write(&space_info->groups_sem); - if (ret < 0) { - ret = 0; - goto next; - } - - /* - * Want to do this before we do anything else so we can recover - * properly if we fail to join the transaction. - */ - trans = btrfs_start_trans_remove_block_group(fs_info, - block_group->key.objectid); - if (IS_ERR(trans)) { - btrfs_dec_block_group_ro(block_group); - ret = PTR_ERR(trans); - goto next; - } - /* - * We could have pending pinned extents for this block group, - * just delete them, we don't care about them anymore. - */ - start = block_group->key.objectid; - end = start + block_group->key.offset - 1; - /* - * Hold the unused_bg_unpin_mutex lock to avoid racing with - * btrfs_finish_extent_commit(). If we are at transaction N, - * another task might be running finish_extent_commit() for the - * previous transaction N - 1, and have seen a range belonging - * to the block group in freed_extents[] before we were able to - * clear the whole block group range from freed_extents[]. 
This - * means that task can lookup for the block group after we - * unpinned it from freed_extents[] and removed it, leading to - * a BUG_ON() at btrfs_unpin_extent_range(). - */ - mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; - } - ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY); - if (ret) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(block_group); - goto end_trans; + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; } - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - - /* Reset pinned so btrfs_put_block_group doesn't complain */ - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); - space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - block_group->pinned = 0; + factor = btrfs_bg_type_to_factor(block_group->flags); + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD); - - /* Implicit trim during transaction commit. */ - if (trimming) - btrfs_get_block_group_trimming(block_group); - - /* - * Btrfs_remove_chunk will abort the transaction if things go - * horribly wrong. - */ - ret = btrfs_remove_chunk(trans, block_group->key.objectid); - - if (ret) { - if (trimming) - btrfs_put_block_group_trimming(block_group); - goto end_trans; - } - - /* - * If we're not mounted with -odiscard, we can just forget - * about this block group. Otherwise we'll need to wait - * until transaction commit to do the actual discard. - */ - if (trimming) { - spin_lock(&fs_info->unused_bgs_lock); - /* - * A concurrent scrub might have added us to the list - * fs_info->unused_bgs, so use a list_move operation - * to add the block group to the deleted_bgs list. 
- */ - list_move(&block_group->bg_list, - &trans->transaction->deleted_bgs); - spin_unlock(&fs_info->unused_bgs_lock); - btrfs_get_block_group(block_group); - } -end_trans: - btrfs_end_transaction(trans); -next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - btrfs_put_block_group(block_group); - spin_lock(&fs_info->unused_bgs_lock); } - spin_unlock(&fs_info->unused_bgs_lock); + spin_unlock(&sinfo->lock); + + return free_bytes; } int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, @@ -8985,7 +5646,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) return -EINVAL; cache = btrfs_lookup_first_block_group(fs_info, range->start); - for (; cache; cache = next_block_group(cache)) { + for (; cache; cache = btrfs_next_block_group(cache)) { if (cache->key.objectid >= range_end) { btrfs_put_block_group(cache); break; @@ -8995,14 +5656,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) end = min(range_end, cache->key.objectid + cache->key.offset); if (end - start >= range->minlen) { - if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, 0); + if (!btrfs_block_group_cache_done(cache)) { + ret = btrfs_cache_block_group(cache, 0); if (ret) { bg_failed++; bg_ret = ret; continue; } - ret = wait_block_group_cache_done(cache); + ret = btrfs_wait_block_group_cache_done(cache); if (ret) { bg_failed++; bg_ret = ret; @@ -9095,16 +5756,3 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) !atomic_read(&root->will_be_snapshotted)); } } - -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) -{ - struct btrfs_fs_info *fs_info = bg->fs_info; - - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - trace_btrfs_add_unused_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->unused_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); -} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eeb75281894ed6a3ad41dfe53d9f1fa7df2d84dc..7b32b6af322d6ca2b9a4bb8e77b1f8c7daf579ef 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1938,9 +1938,9 @@ static int __process_pages_contig(struct address_space *mapping, } void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) + struct page *locked_page, + unsigned clear_bits, + unsigned long page_ops) { clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, NULL); @@ -4339,10 +4339,8 @@ int extent_invalidatepage(struct extent_io_tree *tree, lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state); + clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 401423b169765c49162f8e109ae953c6e96278ae..cf3424d58fec75733ffc8292222946dd4e12204e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -494,9 +494,9 @@ int map_private_extent_buffer(const struct extent_buffer *eb, void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned bits_to_clear, - 
unsigned long page_ops); + struct page *locked_page, + unsigned bits_to_clear, + unsigned long page_ops); struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9558d79faf1e4c3d1d9931e5f600f3732463fed3..9d30acca55e170498a25a8325c65c82ed33d26a9 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -384,6 +384,8 @@ int add_extent_mapping(struct extent_map_tree *tree, { int ret = 0; + lockdep_assert_held_write(&tree->lock); + ret = tree_insert(&tree->map, em); if (ret) goto out; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 58a18ed11546f652e73ee8811e6bc0aca0659043..8fe4eb7e504527914965012b048c0b27a8736b03 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -537,8 +537,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * we can set things up properly */ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached); if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { if (start_pos >= isize && @@ -559,7 +559,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, } err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - extra_bits, cached, 0); + extra_bits, cached); if (err) return err; @@ -1882,10 +1882,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, u64 start_pos; u64 end_pos; ssize_t num_written = 0; - bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + const bool sync = iocb->ki_flags & IOCB_DSYNC; ssize_t err; loff_t pos; - size_t count = iov_iter_count(from); + size_t count; loff_t oldsize; int clean_page = 0; @@ -1906,6 +1906,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, } pos = iocb->ki_pos; + count = iov_iter_count(from); if (iocb->ki_flags & IOCB_NOWAIT) { /* * We will allocate space in case nodatacow is not set, @@ -2439,27 +2440,286 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } +static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_clone_extent_info *clone_info, + const u64 clone_len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int slot; + struct btrfs_ref ref = { 0 }; + u64 ref_offset; + int ret; + + if (clone_len == 0) + return 0; + + if (clone_info->disk_offset == 0 && + btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_info->file_offset; + ret = btrfs_insert_empty_item(trans, root, path, &key, + clone_info->item_size); + if (ret) + return ret; + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, clone_info->extent_buf, + btrfs_item_ptr_offset(leaf, slot), + clone_info->item_size); + extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + /* If it's a hole, nothing more needs to be done. 
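+	 * A hole is a file extent item whose disk_bytenr is zero, which is
+	 * what clone_info->disk_offset == 0 encodes here, so there are no
+	 * bytes to account and no data extent to add a reference against.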
*/ + if (clone_info->disk_offset == 0) + return 0; + + inode_add_bytes(inode, clone_len); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + clone_info->disk_offset, + clone_info->disk_len, 0); + ref_offset = clone_info->file_offset - clone_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + + return ret; +} + +/* + * The respective range must have been previously locked, as well as the inode. + * The end offset is inclusive (last byte of the range). + * @clone_info is NULL for fallocate's hole punching and non-NULL for extent + * cloning. + * When cloning, we don't want to end up in a state where we dropped extents + * without inserting a new one, so we must abort the transaction to avoid a + * corruption. + */ +int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, + const u64 start, const u64 end, + struct btrfs_clone_extent_info *clone_info, + struct btrfs_trans_handle **trans_out) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_block_rsv *rsv; + unsigned int rsv_count; + u64 cur_offset; + u64 drop_end; + u64 len = end - start; + int ret = 0; + + if (end <= start) + return -EINVAL; + + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) { + ret = -ENOMEM; + goto out; + } + rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); + rsv->failfast = 1; + + /* + * 1 - update the inode + * 1 - removing the extents in the range + * 1 - adding the hole extent if no_holes isn't set or if we are cloning + * an extent + */ + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + rsv_count = 3; + else + rsv_count = 2; + + trans = btrfs_start_transaction(root, rsv_count); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out_free; + } + + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + min_size, false); + BUG_ON(ret); + trans->block_rsv = rsv; + + cur_offset = start; + while (cur_offset < end) { + ret = __btrfs_drop_extents(trans, root, inode, path, + cur_offset, end + 1, &drop_end, + 1, 0, 0, NULL); + if (ret != -ENOSPC) { + /* + * When cloning we want to avoid transaction aborts when + * nothing was done and we are attempting to clone parts + * of inline extents, in such cases -EOPNOTSUPP is + * returned by __btrfs_drop_extents() without having + * changed anything in the file. + */ + if (clone_info && ret && ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, ret); + break; + } + + trans->block_rsv = &fs_info->trans_block_rsv; + + if (!clone_info && cur_offset < drop_end && + cur_offset < ino_size) { + ret = fill_holes(trans, BTRFS_I(inode), path, + cur_offset, drop_end); + if (ret) { + /* + * If we failed then we didn't insert our hole + * entries for the area we dropped, so now the + * fs is corrupted, so we must abort the + * transaction. 
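+				 * Without the NO_HOLES feature, every
+				 * hole below i_size must be covered by
+				 * a file extent item, which is what
+				 * fill_holes() inserts.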
+ */
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+		}
+
+		if (clone_info) {
+			u64 clone_len = drop_end - cur_offset;
+
+			ret = btrfs_insert_clone_extent(trans, inode, path,
+							clone_info, clone_len);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+			clone_info->data_len -= clone_len;
+			clone_info->data_offset += clone_len;
+			clone_info->file_offset += clone_len;
+		}
+
+		cur_offset = drop_end;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret)
+			break;
+
+		btrfs_end_transaction(trans);
+		btrfs_btree_balance_dirty(fs_info);
+
+		trans = btrfs_start_transaction(root, rsv_count);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
+					      rsv, min_size, false);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
+
+		if (!clone_info) {
+			ret = find_first_non_hole(inode, &cur_offset, &len);
+			if (unlikely(ret < 0))
+				break;
+			if (ret && !len) {
+				ret = 0;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * If we were cloning, force the next fsync to be a full one since
+	 * we replaced (or just dropped in the case of cloning holes when
+	 * NO_HOLES is enabled) extents and extent maps.
+	 * This is for the sake of simplicity, and cloning into files larger
+	 * than 16Mb would force the full fsync anyway (when
+	 * try_release_extent_mapping() is invoked during page cache
+	 * truncation).
+	 */
+	if (clone_info)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
+	if (ret)
+		goto out_trans;
+
+	trans->block_rsv = &fs_info->trans_block_rsv;
+	/*
+	 * If we are using the NO_HOLES feature we might already have had a
+	 * hole that overlaps a part of the region [lockstart, lockend] and
+	 * ends at (or beyond) lockend. Since we have no file extent items to
+	 * represent holes, drop_end can be less than lockend and so we must
+	 * make sure we have an extent map representing the existing hole (the
+	 * call to __btrfs_drop_extents() might have dropped the existing extent
+	 * map representing the existing hole), otherwise the fast fsync path
+	 * will not record the existence of the hole region
+	 * [existing_hole_start, lockend].
+	 */
+	if (drop_end <= end)
+		drop_end = end + 1;
+	/*
+	 * Don't insert file hole extent item if it's for a range beyond eof
+	 * (because it's useless) or if it represents a 0 byte range (when
+	 * cur_offset == drop_end).
+	 */
+	if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+		ret = fill_holes(trans, BTRFS_I(inode), path,
+				 cur_offset, drop_end);
+		if (ret) {
+			/* Same comment as above. 
*/ + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } + if (clone_info) { + ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, + clone_info->data_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_trans; + } + } + +out_trans: + if (!trans) + goto out_free; + + trans->block_rsv = &fs_info->trans_block_rsv; + if (ret) + btrfs_end_transaction(trans); + else + *trans_out = trans; +out_free: + btrfs_free_block_rsv(fs_info, rsv); +out: + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_block_rsv *rsv; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; u64 lockstart; u64 lockend; u64 tail_start; u64 tail_len; u64 orig_start = offset; - u64 cur_offset; - u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1); - u64 drop_end; int ret = 0; - int err = 0; - unsigned int rsv_count; bool same_block; - bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES); u64 ino_size; bool truncated_block = false; bool updated_inode = false; @@ -2566,145 +2826,24 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - ret = -ENOMEM; - goto out_free; - } - rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1); - rsv->failfast = 1; - - /* - * 1 - update the inode - * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set - */ - rsv_count = no_holes ? 2 : 3; - trans = btrfs_start_transaction(root, rsv_count); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; - } - - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, false); - BUG_ON(ret); - trans->block_rsv = rsv; - - cur_offset = lockstart; - len = lockend - cur_offset; - while (cur_offset < lockend) { - ret = __btrfs_drop_extents(trans, root, inode, path, - cur_offset, lockend + 1, - &drop_end, 1, 0, 0, NULL); - if (ret != -ENOSPC) - break; - - trans->block_rsv = &fs_info->trans_block_rsv; - - if (cur_offset < drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); - if (ret) { - /* - * If we failed then we didn't insert our hole - * entries for the area we dropped, so now the - * fs is corrupted, so we must abort the - * transaction. - */ - btrfs_abort_transaction(trans, ret); - err = ret; - break; - } - } - - cur_offset = drop_end; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - err = ret; - break; - } - - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(fs_info); - - trans = btrfs_start_transaction(root, rsv_count); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - break; - } - - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ - trans->block_rsv = rsv; - - ret = find_first_non_hole(inode, &cur_offset, &len); - if (unlikely(ret < 0)) - break; - if (ret && !len) { - ret = 0; - break; - } - } - - if (ret) { - err = ret; - goto out_trans; - } - - trans->block_rsv = &fs_info->trans_block_rsv; - /* - * If we are using the NO_HOLES feature we might have had already an - * hole that overlaps a part of the region [lockstart, lockend] and - * ends at (or beyond) lockend. 
Since we have no file extent items to - * represent holes, drop_end can be less than lockend and so we must - * make sure we have an extent map representing the existing hole (the - * call to __btrfs_drop_extents() might have dropped the existing extent - * map representing the existing hole), otherwise the fast fsync path - * will not record the existence of the hole region - * [existing_hole_start, lockend]. - */ - if (drop_end <= lockend) - drop_end = lockend + 1; - /* - * Don't insert file hole extent item if it's for a range beyond eof - * (because it's useless) or if it represents a 0 bytes range (when - * cur_offset == drop_end). - */ - if (cur_offset < ino_size && cur_offset < drop_end) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); - if (ret) { - /* Same comment as above. */ - btrfs_abort_transaction(trans, ret); - err = ret; - goto out_trans; - } - } - -out_trans: - if (!trans) - goto out_free; + ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + &trans); + btrfs_free_path(path); + if (ret) + goto out; + ASSERT(trans != NULL); inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); - - trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); -out_free: - btrfs_free_path(path); - btrfs_free_block_rsv(fs_info, rsv); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); out_only_mutex: - if (!updated_inode && truncated_block && !ret && !err) { + if (!updated_inode && truncated_block && !ret) { /* * If we only end up zeroing part of a page, we still need to * update the inode item, so that all the time fields are @@ -2719,16 +2858,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); } else { - err = btrfs_update_inode(trans, root, inode); - ret = btrfs_end_transaction(trans); + int ret2; + + ret = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_end_transaction(trans); + if (!ret) + ret = ret2; } } inode_unlock(inode); - if (ret && !err) - err = ret; - return err; + return ret; } /* Helper structure to record which range is already reserved */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 062be9dde4c62344128c93f556aae30301d7ccde..d54dcd0ab23058cc8fad6ef093ac0a552f00577a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@ #include "volumes.h" #include "space-info.h" #include "delalloc-space.h" +#include "block-group.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K @@ -210,8 +211,8 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, int ret; /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) + - btrfs_calc_trans_metadata_size(fs_info, 1); + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); spin_lock(&rsv->lock); if (rsv->reserved < needed_bytes) @@ -764,7 +765,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } else { ASSERT(num_bitmaps); num_bitmaps--; - e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); + e->bitmap = kmem_cache_zalloc( + btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { kmem_cache_free( 
btrfs_free_space_cachep, e); @@ -1004,7 +1006,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@ -1016,9 +1018,8 @@ update_cache_item(struct btrfs_trans_handle *trans, if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL); + inode->i_size - 1, EXTENT_DELALLOC, 0, + 0, NULL); btrfs_release_path(path); goto fail; } @@ -1114,7 +1115,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); return ret; } @@ -1881,7 +1882,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { unlink_free_space(ctl, bitmap_info); - kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); @@ -2135,7 +2136,8 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, } /* allocate the bitmap */ - info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); + info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, + GFP_NOFS); spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -2146,7 +2148,9 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, out: if (info) { - kfree(info->bitmap); + if (info->bitmap) + kmem_cache_free(btrfs_free_space_bitmap_cachep, + info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } @@ -2376,6 +2380,14 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, return ret; } +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size) +{ + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size); +} + int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { @@ -2802,7 +2814,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, if (entry->bytes == 0) { ctl->free_extents--; if (entry->bitmap) { - kfree(entry->bitmap); + kmem_cache_free(btrfs_free_space_bitmap_cachep, + entry->bitmap); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); } @@ -3606,7 +3619,7 @@ int test_add_free_space_entry(struct btrfs_block_group_cache *cache, } if (!map) { - map = kzalloc(PAGE_SIZE, GFP_NOFS); + map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!map) { kmem_cache_free(btrfs_free_space_cachep, info); return -ENOMEM; @@ -3635,7 +3648,8 @@ int test_add_free_space_entry(struct btrfs_block_group_cache *cache, if (info) kmem_cache_free(btrfs_free_space_cachep, info); - kfree(map); + if (map) + kmem_cache_free(btrfs_free_space_bitmap_cachep, map); return 0; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 8760acb55ffdae92ef8b96a2af96b56684e49ef1..39c32c8fc24f80215ed70a8ebbea695196aebeed 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -36,7 +36,19 @@ struct btrfs_free_space_op { struct btrfs_free_space *info); }; -struct btrfs_io_ctl; +struct btrfs_io_ctl { + 
void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_fs_info *fs_info; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; struct inode *lookup_free_space_inode( struct btrfs_block_group_cache *block_group, @@ -73,14 +85,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, u64 bytenr, u64 size); -static inline int -btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size) -{ - return __btrfs_add_free_space(block_group->fs_info, - block_group->free_space_ctl, - bytenr, size); -} +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index f5dc115ebba069f70b74131b7627daeeee989f9a..48a03f5240f59bccd6a45d40023f904cf948e7ac 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -10,6 +10,7 @@ #include "locking.h" #include "free-space-tree.h" #include "transaction.h" +#include "block-group.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 22b7602bde253f6d5ec6182d765d3d581c3ee520..360d50e1cdeabe3520b261c8b64fba4b1e8a13c5 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_FREE_SPACE_TREE_H #define BTRFS_FREE_SPACE_TREE_H +struct btrfs_caching_control; + /* * The default size for new free space bitmap items. 
The last bitmap in a block * group may be truncated, and none of the free space tree code assumes that diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 30d62ef918b92e1154df44cca39f17992705ffe8..668701832845cb8c4daf42fcec6af8eac3c91d5d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -8,9 +8,9 @@ #include "transaction.h" #include "print-tree.h" -int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const char *name, - int name_len, struct btrfs_inode_ref **ref_ret) +struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, + int slot, const char *name, + int name_len) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -28,19 +28,15 @@ int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, cur_offset += len + sizeof(*ref); if (len != name_len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { - if (ref_ret) - *ref_ret = ref; - return 1; - } + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return ref; } - return 0; + return NULL; } -int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, - u64 ref_objectid, - const char *name, int name_len, - struct btrfs_inode_extref **extref_ret) +struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( + struct extent_buffer *leaf, int slot, u64 ref_objectid, + const char *name, int name_len) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -65,15 +61,12 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, if (ref_name_len == name_len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) { - if (extref_ret) - *extref_ret = extref; - return 1; - } + (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) + return extref; cur_offset += ref_name_len + sizeof(*extref); } - return 0; + return NULL; } /* Returns NULL if no extref found */ @@ -87,7 +80,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct btrfs_inode_extref *extref; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; @@ -98,11 +90,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return ERR_PTR(ret); if (ret > 0) return NULL; - if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len, - &extref)) - return NULL; - return extref; + return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len); + } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, @@ -142,9 +132,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * This should always succeed so error here will make the FS * readonly. 
*/ - if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, name_len, &extref)) { + extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len); + if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; @@ -213,8 +203,10 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, } else if (ret < 0) { goto out; } - if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) { + + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, + name_len); + if (!ref) { ret = -ENOENT; search_ext_refs = 1; goto out; @@ -285,7 +277,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len, NULL)) + name, name_len)) goto out; btrfs_extend_item(path, ins_len); @@ -341,9 +333,9 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ins_len); if (ret == -EEXIST) { u32 old_size; - - if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len); + if (ref) goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -359,7 +351,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len, &ref)) + name, name_len)) ret = -EEXIST; else ret = -EMLINK; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2e8bb402050b9593a4f88b7224e2b73618479b8b..63cad7865d751dd96f2e364fdbd6c92c967a7c6e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -13,6 +13,19 @@ #include "transaction.h" #include "delalloc-space.h" +static void fail_caching_thread(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + btrfs_warn(fs_info, "failed to start inode caching task"); + btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, + "disabling inode map caching"); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_ERROR; + spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); +} + static int caching_kthread(void *data) { struct btrfs_root *root = data; @@ -29,8 +42,10 @@ static int caching_kthread(void *data) return 0; path = btrfs_alloc_path(); - if (!path) + if (!path) { + fail_caching_thread(root); return -ENOMEM; + } /* Since the commit root is read-only, we can safely skip locking. 
*/ path->skip_locking = 1; @@ -146,6 +161,7 @@ static void start_caching(struct btrfs_root *root) spin_lock(&root->ino_cache_lock); root->ino_cache_state = BTRFS_CACHE_FINISHED; spin_unlock(&root->ino_cache_lock); + wake_up(&root->ino_cache_wait); return; } @@ -160,15 +176,13 @@ static void start_caching(struct btrfs_root *root) if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { __btrfs_add_free_space(fs_info, ctl, objectid, BTRFS_LAST_FREE_OBJECTID - objectid + 1); + wake_up(&root->ino_cache_wait); } tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", root->root_key.objectid); - if (IS_ERR(tsk)) { - btrfs_warn(fs_info, "failed to start inode caching task"); - btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, - "disabling inode map caching"); - } + if (IS_ERR(tsk)) + fail_caching_thread(root); } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -186,11 +200,14 @@ int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) wait_event(root->ino_cache_wait, root->ino_cache_state == BTRFS_CACHE_FINISHED || + root->ino_cache_state == BTRFS_CACHE_ERROR || root->free_ino_ctl->free_space > 0); if (root->ino_cache_state == BTRFS_CACHE_FINISHED && root->free_ino_ctl->free_space == 0) return -ENOSPC; + else if (root->ino_cache_state == BTRFS_CACHE_ERROR) + return btrfs_find_free_objectid(root, objectid); else goto again; } @@ -419,7 +436,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10); + trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); @@ -485,6 +502,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, prealloc, prealloc, &alloc_hint); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); + btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); goto out_put; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee582a36653d30ee1f833153f9bc00225f7978ae..a0546401bc0ab849e4bef60e2ad4ccc7441be6c0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -30,6 +30,7 @@ #include #include #include +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -46,8 +47,8 @@ #include "backref.h" #include "props.h" #include "qgroup.h" -#include "dedupe.h" #include "delalloc-space.h" +#include "block-group.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -74,15 +75,15 @@ static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; +struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash); + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -178,6 +179,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = 
size; unsigned long offset; + ASSERT((compressed_size > 0 && compressed_pages) || + (compressed_size == 0 && !compressed_pages)); + if (compressed_size && compressed_pages) cur_size = compressed_size; @@ -462,8 +466,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, * are written in the same order that the flusher thread sent them * down. */ -static noinline void compress_file_range(struct async_chunk *async_chunk, - int *num_added) +static noinline int compress_file_range(struct async_chunk *async_chunk) { struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -479,6 +482,7 @@ static noinline void compress_file_range(struct async_chunk *async_chunk, int i; int will_compress; int compress_type = fs_info->compress_type; + int compressed_extents = 0; int redirty = 0; inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, @@ -615,14 +619,21 @@ static noinline void compress_file_range(struct async_chunk *async_chunk, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, end, - NULL, clear_flags, + extent_clear_unlock_delalloc(inode, start, end, NULL, + clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - goto free_pages_out; + + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); + } + kfree(pages); + + return 0; } } @@ -641,7 +652,7 @@ static noinline void compress_file_range(struct async_chunk *async_chunk, */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - *num_added += 1; + compressed_extents++; /* * The async work queues will take care of doing actual @@ -658,7 +669,7 @@ static noinline void compress_file_range(struct async_chunk *async_chunk, cond_resched(); goto again; } - return; + return compressed_extents; } } if (pages) { @@ -697,16 +708,9 @@ static noinline void compress_file_range(struct async_chunk *async_chunk, extent_range_redirty_for_io(inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - - return; + compressed_extents++; -free_pages_out: - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); + return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -762,10 +766,7 @@ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0, - NULL); + &page_started, &nr_written, 0); /* JDM XXX */ @@ -853,8 +854,6 @@ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) * clear dirty, set writeback and unlock the pages. 
*/ extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, @@ -875,7 +874,7 @@ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) btrfs_writepage_endio_finish_ordered(p, start, end, 0); p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); @@ -891,8 +890,6 @@ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -953,9 +950,8 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash) + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -994,8 +990,7 @@ static noinline int cow_file_range(struct inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, - delalloc_end, NULL, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1078,7 +1073,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - delalloc_end, locked_page, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1122,7 +1117,6 @@ static noinline int cow_file_range(struct inode *inode, */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, - start + cur_alloc_size, start + cur_alloc_size, locked_page, clear_bits, @@ -1131,8 +1125,7 @@ static noinline int cow_file_range(struct inode *inode, if (start >= end) goto out; } - extent_clear_unlock_delalloc(inode, start, end, delalloc_end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); goto out; @@ -1144,12 +1137,12 @@ static noinline int cow_file_range(struct inode *inode, static noinline void async_cow_start(struct btrfs_work *work) { struct async_chunk *async_chunk; - int num_added = 0; + int compressed_extents; async_chunk = container_of(work, struct async_chunk, work); - compress_file_range(async_chunk, &num_added); - if (num_added == 0) { + compressed_extents = compress_file_range(async_chunk); + if (compressed_extents == 0) { btrfs_add_delayed_iput(async_chunk->inode); async_chunk->inode = NULL; } @@ -1235,7 +1228,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR; - extent_clear_unlock_delalloc(inode, start, end, 0, locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); return -ENOMEM; } @@ -1310,36 +1303,25 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, */ static noinline int 
run_delalloc_nocow(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, int force, - unsigned long *nr_written) + const u64 start, const u64 end, + int *page_started, int force, + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *leaf; struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct btrfs_key found_key; - struct extent_map *em; - u64 cow_start; - u64 cur_offset; - u64 extent_end; - u64 extent_offset; - u64 disk_bytenr; - u64 num_bytes; - u64 disk_num_bytes; - u64 ram_bytes; - int extent_type; + u64 cow_start = (u64)-1; + u64 cur_offset = start; int ret; - int type; - int nocow; - int check_prev = 1; - bool nolock; + bool check_prev = true; + const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); u64 ino = btrfs_ino(BTRFS_I(inode)); + bool nocow = false; + u64 disk_bytenr = 0; path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | @@ -1349,15 +1331,29 @@ static noinline int run_delalloc_nocow(struct inode *inode, return -ENOMEM; } - nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); - - cow_start = (u64)-1; - cur_offset = start; while (1) { + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u64 extent_end; + u64 extent_offset; + u64 num_bytes = 0; + u64 disk_num_bytes; + u64 ram_bytes; + int extent_type; + + nocow = false; + ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; + + /* + * If there is no extent for our range when doing the initial + * search, then go back to the previous slot as it will be the + * one containing the search offset + */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1366,8 +1362,9 @@ static noinline int run_delalloc_nocow(struct inode *inode, found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } - check_prev = 0; + check_prev = false; next_slot: + /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -1381,28 +1378,40 @@ static noinline int run_delalloc_nocow(struct inode *inode, leaf = path->nodes[0]; } - nocow = 0; - disk_bytenr = 0; - num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; + /* + * Keep searching until we find an EXTENT_ITEM or there are no + * more extents for this inode + */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } + + /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; + /* + * If the found extent starts after requested offset, then + * adjust extent_end to be right before this extent begins + */ if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; goto out_check; } + /* + * Found extent which begins before our range and potentially + * intersect it + */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); @@ -1416,26 
+1425,36 @@ static noinline int run_delalloc_nocow(struct inode *inode, btrfs_file_extent_num_bytes(leaf, fi); disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + /* + * If extent we got ends before our range starts, skip + * to next extent + */ if (extent_end <= start) { path->slots[0]++; goto next_slot; } + /* Skip holes */ if (disk_bytenr == 0) goto out_check; + /* Skip compressed/encrypted/encoded extents */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; /* - * Do the same check as in btrfs_cross_ref_exist but - * without the unnecessary search. + * If extent is created before the last volume's snapshot + * this implies the extent is shared, hence we can't do + * nocow. This is the same check as in + * btrfs_cross_ref_exist but without calling + * btrfs_search_slot. */ - if (!nolock && + if (!freespace_inode && btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; + /* If extent is RO, we must COW it */ if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; ret = btrfs_cross_ref_exist(root, ino, @@ -1452,17 +1471,17 @@ static noinline int run_delalloc_nocow(struct inode *inode, goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* - * if there are pending snapshots for this root, - * we fall into common COW way. + * If there are pending snapshots for this root, we + * fall into common COW way */ - if (!nolock && atomic_read(&root->snapshot_force_cow)) + if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out_check; /* * force cow if csum exists in the range. @@ -1481,27 +1500,29 @@ static noinline int run_delalloc_nocow(struct inode *inode, cur_offset = cow_start; goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; - nocow = 1; + nocow = true; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + - btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = ALIGN(extent_end, - fs_info->sectorsize); + extent_end = found_key.offset + ram_bytes; + extent_end = ALIGN(extent_end, fs_info->sectorsize); + /* Skip extents outside of our requested range */ + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } } else { + /* If this triggers then we have a memory corruption */ BUG(); } out_check: - if (extent_end <= start) { - path->slots[0]++; - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - goto next_slot; - } + /* + * If nocow is false then record the beginning of the range + * that needs to be COWed + */ if (!nocow) { if (cow_start == (u64)-1) cow_start = cur_offset; @@ -1513,11 +1534,16 @@ static noinline int run_delalloc_nocow(struct inode *inode, } btrfs_release_path(path); + + /* + * COW range from cow_start to found_key.offset - 1. 
As the key + * will contain the beginning of the first extent that can be + * NOCOW, following one which needs to be COW'ed + */ if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - end, page_started, nr_written, 1, - NULL); + page_started, nr_written, 1); if (ret) { if (nocow) btrfs_dec_nocow_writers(fs_info, @@ -1529,6 +1555,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 orig_start = found_key.offset - extent_offset; + struct extent_map *em; em = create_io_em(inode, cur_offset, num_bytes, orig_start, @@ -1545,19 +1572,29 @@ static noinline int run_delalloc_nocow(struct inode *inode, goto error; } free_extent_map(em); - } - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - type = BTRFS_ORDERED_PREALLOC; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_PREALLOC); + if (ret) { + btrfs_drop_extent_cache(BTRFS_I(inode), + cur_offset, + cur_offset + num_bytes - 1, + 0); + goto error; + } } else { - type = BTRFS_ORDERED_NOCOW; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_NOCOW); + if (ret) + goto error; } - ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, - num_bytes, num_bytes, type); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); - BUG_ON(ret); /* -ENOMEM */ + nocow = false; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) @@ -1570,7 +1607,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, num_bytes); extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, end, + cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1595,15 +1632,18 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (cow_start != (u64)-1) { cur_offset = end; - ret = cow_file_range(inode, locked_page, cow_start, end, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); if (ret) goto error; } error: + if (nocow) + btrfs_dec_nocow_writers(fs_info, disk_bytenr); + if (ret && cur_offset < end) - extent_clear_unlock_delalloc(inode, cur_offset, end, end, + extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1654,8 +1694,8 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); @@ -2090,7 +2130,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, - struct extent_state **cached_state, int dedupe) + struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, @@ -2156,7 +2196,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - &cached_state, 0); + &cached_state); if (ret) { mapping_set_error(page->mapping, ret); 
end_extent_writepage(page, ret, page_start, page_end); @@ -3850,7 +3890,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); @@ -4946,12 +4986,11 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, } clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, - &cached_state, 0); + &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -5332,9 +5371,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, &cached_state); + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5347,59 +5386,50 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1); - int failures = 0; - - for (;;) { - struct btrfs_trans_handle *trans; - int ret; - - ret = btrfs_block_rsv_refill(root, rsv, - rsv->size + delayed_refs_extra, - BTRFS_RESERVE_FLUSH_LIMIT); - - if (ret && ++failures > 2) { - btrfs_warn(fs_info, - "could not allocate space for a delete; will truncate on mount"); - return ERR_PTR(-ENOSPC); - } - - /* - * Evict can generate a large amount of delayed refs without - * having a way to add space back since we exhaust our temporary - * block rsv. We aren't allowed to do FLUSH_ALL in this case - * because we could deadlock with so many things in the flushing - * code, so we have to try and hold some extra space to - * compensate for our delayed ref generation. If we can't get - * that space then we need see if we can steal our minimum from - * the global reserve. We will be ratelimited by the amount of - * space we have for the delayed refs rsv, so we'll end up - * committing and trying again. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans) || !ret) { - if (!IS_ERR(trans)) { - trans->block_rsv = &fs_info->trans_block_rsv; - trans->bytes_reserved = delayed_refs_extra; - btrfs_block_rsv_migrate(rsv, trans->block_rsv, - delayed_refs_extra, 1); - } - return trans; - } + struct btrfs_trans_handle *trans; + u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); + int ret; + /* + * Eviction should be taking place at some place safe because of our + * delayed iputs. However the normal flushing code will run delayed + * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. + * + * We reserve the delayed_refs_extra here again because we can't use + * btrfs_start_transaction(root, 0) for the same deadlocky reason as + * above. We reserve our extra bit here because we generate a ton of + * delayed refs activity by truncating. 
+ * + * If we cannot make our reservation we'll attempt to steal from the + * global reserve, because we really want to be able to free up space. + */ + ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { /* * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) - return trans; + if (btrfs_check_space_for_delayed_refs(fs_info) || + btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { + btrfs_warn(fs_info, + "could not allocate space for delete; will truncate on mount"); + return ERR_PTR(-ENOSPC); + } + delayed_refs_extra = 0; + } - /* If not, commit and try again. */ - ret = btrfs_commit_transaction(trans); - if (ret) - return ERR_PTR(ret); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return trans; + + if (delayed_refs_extra) { + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, + delayed_refs_extra, 1); } + return trans; } void btrfs_evict_inode(struct inode *inode) @@ -5446,7 +5476,7 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; - rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1); + rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -7701,12 +7731,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - int unlock_bits = EXTENT_LOCKED; int ret = 0; - if (create) - unlock_bits |= EXTENT_DIRTY; - else + if (!create) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; @@ -7765,9 +7792,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (ret < 0) goto unlock_err; - /* clear and unlock the entire range */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); } else { ret = btrfs_get_blocks_direct_read(em, bh_result, inode, start, len); @@ -7783,9 +7809,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, */ lockstart = start + bh_result->b_size; if (lockstart < lockend) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); } else { free_extent_state(cached_state); } @@ -7796,8 +7821,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, return 0; unlock_err: - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); err: if (dio_data) current->journal_info = dio_data; @@ -8812,8 +8837,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, */ if (!inode_evicting) clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); /* @@ -8868,8 +8892,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (PageDirty(page)) btrfs_qgroup_free_data(inode, NULL, 
page_start, PAGE_SIZE); if (!inode_evicting) { - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | + clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state); @@ -8997,12 +9020,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * reserve data&meta space before lock_page() (see above comments). */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, &cached_state); ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, - &cached_state, 0); + &cached_state); if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); @@ -9060,7 +9082,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; - u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); + u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -9380,6 +9402,7 @@ void __cold btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -9409,6 +9432,12 @@ int __init btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_RED_ZONE, NULL); + if (!btrfs_free_space_bitmap_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 818f7ec8bb0ee327b338a7577be98bf4f6b65ff0..de730e56d3f5106ee97d7fa316ee5e855487d954 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -45,6 +45,7 @@ #include "compression.h" #include "space-info.h" #include "delalloc-space.h" +#include "block-group.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -1332,9 +1333,8 @@ static int cluster_pages_for_defrag(struct inode *inode, lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - &cached_state); + page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, &cached_state); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1840,8 +1840,15 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, goto free_args; } - if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) { + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + btrfs_warn(fs_info, +"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7"); + ptr = &transid; + } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { @@ -3324,61 +3331,6 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, return ret; } -static void clone_update_extent_map(struct btrfs_inode *inode, - const struct btrfs_trans_handle *trans, - const struct btrfs_path *path, - const u64 hole_offset, - 
const u64 hole_len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - em = alloc_extent_map(); - if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); - return; - } - - if (path) { - struct btrfs_file_extent_item *fi; - - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - btrfs_extent_item_to_extent_map(inode, path, fi, false, em); - em->generation = -1; - if (btrfs_file_extent_type(path->nodes[0], fi) == - BTRFS_FILE_EXTENT_INLINE) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); - } else { - em->start = hole_offset; - em->len = hole_len; - em->ram_bytes = em->len; - em->orig_start = hole_offset; - em->block_start = EXTENT_MAP_HOLE; - em->block_len = 0; - em->orig_block_len = 0; - em->compress_type = BTRFS_COMPRESS_NONE; - em->generation = trans->transid; - } - - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); - } - - if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); -} - /* * Make sure we do not end up inserting an inline extent into a file that has * already other (non-inline) extents. If a file has an inline extent it can @@ -3519,6 +3471,7 @@ static int clone_copy_inline_extent(struct inode *dst, path->slots[0]), size); inode_add_bytes(dst, datal); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); return 0; } @@ -3570,6 +3523,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode, while (1) { u64 next_key_min_offset = key.offset + 1; + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + u64 drop_start; /* * note the key will change type as we walk through the @@ -3610,75 +3571,115 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.objectid != btrfs_ino(BTRFS_I(src))) break; - if (key.type == BTRFS_EXTENT_DATA_KEY) { - struct btrfs_file_extent_item *extent; - int type; - u32 size; - struct btrfs_key new_key; - u64 disko = 0, diskl = 0; - u64 datao = 0, datal = 0; - u8 comp; - u64 drop_start; - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - comp = btrfs_file_extent_compression(leaf, extent); - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - disko = btrfs_file_extent_disk_bytenr(leaf, - extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, - extent); - datao = btrfs_file_extent_offset(leaf, extent); - datal = btrfs_file_extent_num_bytes(leaf, - extent); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - /* take upper bound, may be compressed */ - datal = btrfs_file_extent_ram_bytes(leaf, - extent); - } + ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + disko = btrfs_file_extent_disk_bytenr(leaf, extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* Take upper bound, 
may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, extent); + } + + /* + * The first search might have left us at an extent item that + * ends before our target range's start, can happen if we have + * holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { + path->slots[0]++; + goto process_slot; + } else if (key.offset >= off + len) { + break; + } + next_key_min_offset = key.offset + datal; + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), + size); + + btrfs_release_path(path); + path->leave_spinning = 0; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = btrfs_ino(BTRFS_I(inode)); + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; + + /* + * Deal with a hole that doesn't have an extent item that + * represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning range or at + * the beginning (fully overlaps it or partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + struct btrfs_clone_extent_info clone_info; /* - * The first search might have left us at an extent - * item that ends before our target range's start, can - * happen if we have holes and NO_HOLES feature enabled. + * a | --- range to clone ---| b + * | ------------- extent ------------- | */ - if (key.offset + datal <= off) { - path->slots[0]++; - goto process_slot; - } else if (key.offset >= off + len) { - break; + + /* Subtract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* Subtract range a */ + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; } - next_key_min_offset = key.offset + datal; - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - btrfs_release_path(path); - path->leave_spinning = 0; + clone_info.disk_offset = disko; + clone_info.disk_len = diskl; + clone_info.data_offset = datao; + clone_info.data_len = datal; + clone_info.file_offset = new_key.offset; + clone_info.extent_buf = buf; + clone_info.item_size = size; + ret = btrfs_punch_hole_range(inode, path, + drop_start, + new_key.offset + datal - 1, + &clone_info, &trans); + if (ret) + goto out; + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(BTRFS_I(inode)); - if (off <= key.offset) - new_key.offset = key.offset + destoff - off; - else - new_key.offset = destoff; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } - /* - * Deal with a hole that doesn't have an extent item - * that represents it (NO_HOLES feature enabled). - * This hole is either in the middle of the cloning - * range or at the beginning (fully overlaps it or - * partially overlaps it). - */ - if (new_key.offset != last_dest_end) - drop_start = last_dest_end; - else - drop_start = new_key.offset; + if (key.offset + datal > off + len) + trim = key.offset + datal - (off + len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; /* + * If our extent is inline, we know we will drop or + * adjust at most 1 extent item in the destination root. 
+ * * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3689,140 +3690,28 @@ static int btrfs_clone(struct inode *src, struct inode *inode, goto out; } - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - /* - * a | --- range to clone ---| b - * | ------------- extent ------------- | - */ - - /* subtract range b */ - if (key.offset + datal > off + len) - datal = off + len - key.offset; - - /* subtract range a */ - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - new_key.offset + datal, - 1); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { + ret = clone_copy_inline_extent(inode, trans, path, + &new_key, drop_start, + datal, skip, size, buf); + if (ret) { + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - goto out; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - /* disko == 0 means it's a hole */ - if (!disko) - datao = 0; - - btrfs_set_file_extent_offset(leaf, extent, - datao); - btrfs_set_file_extent_num_bytes(leaf, extent, - datal); - - if (disko) { - struct btrfs_ref ref = { 0 }; - inode_add_bytes(inode, datal); - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, disko, - diskl, 0); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), - new_key.offset - datao); - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - - } - } - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 skip = 0; - u64 trim = 0; - - if (off > key.offset) { - skip = off - key.offset; - new_key.offset += skip; - } - - if (key.offset + datal > off + len) - trim = key.offset + datal - (off + len); - - if (comp && (skip || trim)) { - ret = -EINVAL; - btrfs_end_transaction(trans); - goto out; - } - size -= skip + trim; - datal -= skip + trim; - - ret = clone_copy_inline_extent(inode, - trans, path, - &new_key, - drop_start, - datal, - skip, size, buf); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, - ret); - btrfs_end_transaction(trans); - goto out; - } - leaf = path->nodes[0]; - slot = path->slots[0]; + btrfs_end_transaction(trans); + goto out; } + } - /* If we have an implicit hole (NO_HOLES feature). 
*/ - if (drop_start < new_key.offset) - clone_update_extent_map(BTRFS_I(inode), trans, - NULL, drop_start, - new_key.offset - drop_start); - - clone_update_extent_map(BTRFS_I(inode), trans, - path, 0, 0); + btrfs_release_path(path); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + last_dest_end = ALIGN(new_key.offset + datal, + fs_info->sectorsize); + ret = clone_finish_inode_update(trans, inode, last_dest_end, + destoff, olen, no_time_update); + if (ret) + goto out; + if (new_key.offset + datal >= destoff + len) + break; - last_dest_end = ALIGN(new_key.offset + datal, - fs_info->sectorsize); - ret = clone_finish_inode_update(trans, inode, - last_dest_end, - destoff, olen, - no_time_update); - if (ret) - goto out; - if (new_key.offset + datal >= destoff + len) - break; - } btrfs_release_path(path); key.offset = next_key_min_offset; @@ -3834,32 +3723,27 @@ static int btrfs_clone(struct inode *src, struct inode *inode, ret = 0; if (last_dest_end < destoff + len) { + struct btrfs_clone_extent_info clone_info = { 0 }; /* * We have an implicit hole (NO_HOLES feature is enabled) that * fully or partially overlaps our cloning range at its end. */ btrfs_release_path(path); + path->leave_spinning = 0; /* - * 1 - remove extent(s) - * 1 - inode update + * We are dealing with a hole and our clone_info already has a + * disk_offset of 0, we only need to fill the data length and + * file offset. */ - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - ret = btrfs_drop_extents(trans, root, inode, - last_dest_end, destoff + len, 1); - if (ret) { - if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); + clone_info.data_len = destoff + len - last_dest_end; + clone_info.file_offset = last_dest_end; + ret = btrfs_punch_hole_range(inode, path, + last_dest_end, destoff + len - 1, + &clone_info, &trans); + if (ret) goto out; - } - clone_update_extent_map(BTRFS_I(inode), trans, NULL, - last_dest_end, - destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } @@ -4313,6 +4197,9 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; + btrfs_warn(root->fs_info, + "START_SYNC ioctl is deprecated and will be removed in kernel 5.7"); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) @@ -4340,6 +4227,9 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, { u64 transid; + btrfs_warn(fs_info, + "WAIT_SYNC ioctl is deprecated and will be removed in kernel 5.7"); + if (argp) { if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; @@ -5381,7 +5271,7 @@ static int check_feature_bits(struct btrfs_fs_info *fs_info, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) { - const char *type = btrfs_feature_set_names[set]; + const char *type = btrfs_feature_set_name(set); char *names; u64 disallowed, unsupported; u64 set_mask = flags & change_mask; @@ -5562,6 +5452,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); + case FS_IOC_GETFSLABEL: + return btrfs_ioctl_get_fslabel(file, argp); + case FS_IOC_SETFSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); case FITRIM: return btrfs_ioctl_fitrim(file, argp); case BTRFS_IOC_SNAP_CREATE: @@ -5673,10 +5567,6 @@ long btrfs_ioctl(struct file 
*file, unsigned int return btrfs_ioctl_quota_rescan_wait(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); - case BTRFS_IOC_GET_FSLABEL: - return btrfs_ioctl_get_fslabel(file, argp); - case BTRFS_IOC_SET_FSLABEL: - return btrfs_ioctl_set_fslabel(file, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(argp); case BTRFS_IOC_GET_FEATURES: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 393eceda57c8a2d8d31eb3e4b5e45db3fe697c32..7f9a578a1a206baa7b04c657d5807e6b815ae7a0 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> +#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" @@ -119,42 +120,6 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) } } -void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) -{ - trace_btrfs_clear_lock_blocking_read(eb); - /* - * No lock is required. The lock owner may change if we have a read - * lock, but it won't change to or away from us. If we have the write - * lock, we are the owner and it'll never change. - */ - if (eb->lock_nested && current->pid == eb->lock_owner) - return; - BUG_ON(atomic_read(&eb->blocking_readers) == 0); - read_lock(&eb->lock); - btrfs_assert_spinning_readers_get(eb); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_readers)) - cond_wake_up_nomb(&eb->read_lock_wq); -} - -void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) -{ - trace_btrfs_clear_lock_blocking_write(eb); - /* - * no lock is required. The lock owner may change if - * we have a read lock, but it won't change to or away - * from us. If we have the write lock, we are the owner - * and it'll never change. - */ - if (eb->lock_nested && current->pid == eb->lock_owner) - return; - write_lock(&eb->lock); - BUG_ON(eb->blocking_writers != 1); - btrfs_assert_spinning_writers_get(eb); - if (--eb->blocking_writers == 0) - cond_wake_up(&eb->write_lock_wq); -} - /* * take a spinning read lock.
This will wait for any blocking * writers diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 595014f64830f822d6d9135641e528353fdfe4f3..b775a4207ed9126a1d8315a380cee6d847297d18 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -19,8 +19,6 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); void btrfs_set_lock_blocking_read(struct extent_buffer *eb); void btrfs_set_lock_blocking_write(struct extent_buffer *eb); -void btrfs_clear_lock_blocking_read(struct extent_buffer *eb); -void btrfs_clear_lock_blocking_write(struct extent_buffer *eb); void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 579d53ae256f3e63396d415f7efa8f976b7f5229..acad4174f68d314455bca77b59e200daaa679740 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -507,11 +507,6 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, return ret; } -static unsigned int lzo_set_level(unsigned int level) -{ - return 0; -} - const struct btrfs_compress_op btrfs_lzo_compress = { .init_workspace_manager = lzo_init_workspace_manager, .cleanup_workspace_manager = lzo_cleanup_workspace_manager, @@ -522,5 +517,6 @@ const struct btrfs_compress_op btrfs_lzo_compress = { .compress_pages = lzo_compress_pages, .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, - .set_level = lzo_set_level, + .max_level = 1, + .default_level = 1, }; diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h deleted file mode 100644 index 75246f2f56baf32473ab992982923f678f3f229d..0000000000000000000000000000000000000000 --- a/fs/btrfs/math.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2012 Fujitsu. All rights reserved. - * Written by Miao Xie <miaox@cn.fujitsu.com> - */ - -#ifndef BTRFS_MATH_H -#define BTRFS_MATH_H - -#include <linux/math64.h> - -static inline u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); -} - -static inline u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); -} - -#endif diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h new file mode 100644 index 0000000000000000000000000000000000000000..7d564924dfeb9f463e1e067db1a8ccaf447f084e --- /dev/null +++ b/fs/btrfs/misc.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MISC_H +#define BTRFS_MISC_H + +#include <linux/sched.h> +#include <linux/wait.h> +#include <asm/div64.h> + +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) + +static inline void cond_wake_up(struct wait_queue_head *wq) +{ + /* + * This implies a full smp_mb barrier, see comments for + * waitqueue_active why. + */ + if (wq_has_sleeper(wq)) + wake_up(wq); +} + +static inline void cond_wake_up_nomb(struct wait_queue_head *wq) +{ + /* + * Special case for conditional wakeup where the barrier required for + * waitqueue_active is implied by some of the preceding code. Eg. one + * of such atomic operations (atomic_dec_and_return, ...), or a + * unlock/lock sequence, etc.
+ */ + if (waitqueue_active(wq)) + wake_up(wq); +} + +static inline u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + return div_u64(num, 10); +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + return div_u64(num, 100); +} + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ae7f64a8facb8758822f5cbd05f0bd0094de361a..24b6c72b9a59054743d316fe7e5e71f42a89efec 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,6 +7,7 @@ #include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/sched/mm.h> +#include "misc.h" #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index e0469816c678be793d13619d465c54646eef268a..1e664e0b59b80f82796f689c3abce8da0c544284 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -362,7 +362,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, * reservations if we do add more properties in the future. */ if (need_reserve) { - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); ret = btrfs_block_rsv_add(root, trans->block_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); if (ret) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f8a3c1b0a15a81d5cb3a9ea5cdd9c6205fe32d96..8d3bd799ac7dadb06a42dff8799c886b758f048d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -21,7 +21,7 @@ #include "backref.h" #include "extent_io.h" #include "qgroup.h" - +#include "block-group.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -1312,8 +1312,9 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; struct ulist *tmp; + bool found = false; int ret = 0; - int err; + int ret2; tmp = ulist_alloc(GFP_KERNEL); if (!tmp) @@ -1327,28 +1328,39 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); - if (!member || !parent) { - ret = -EINVAL; - goto out; - } + /* + * The parent/member pair doesn't exist, then try to delete the dead + * relation items only.
+ */ + if (!member || !parent) + goto delete_item; /* check if such qgroup relation exist firstly */ list_for_each_entry(list, &member->groups, next_group) { - if (list->group == parent) - goto exist; + if (list->group == parent) { + found = true; + break; + } } - ret = -ENOENT; - goto out; -exist: + +delete_item: ret = del_qgroup_relation_item(trans, src, dst); - err = del_qgroup_relation_item(trans, dst, src); - if (err && !ret) - ret = err; + if (ret < 0 && ret != -ENOENT) + goto out; + ret2 = del_qgroup_relation_item(trans, dst, src); + if (ret2 < 0 && ret2 != -ENOENT) + goto out; - spin_lock(&fs_info->qgroup_lock); - del_relation_rb(fs_info, src, dst); - ret = quick_update_accounting(fs_info, tmp, src, dst, -1); - spin_unlock(&fs_info->qgroup_lock); + /* At least one deletion succeeded, return 0 */ + if (!ret || !ret2) + ret = 0; + + if (found) { + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + spin_unlock(&fs_info->qgroup_lock); + } out: ulist_free(tmp); return ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f3d0576dd327c4850acdb280455e338295d8df63..57a2ac721985062661329ffffc27e623eac5bcc4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -35,6 +35,22 @@ #define RBIO_CACHE_SIZE 1024 +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + spinlock_t lock; +}; + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index bb5bd49573b4cc116274d92d0bef180b0c1ec19f..ee6f60547a8d9eeb67d593bc76ce3f1c197524fa 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -14,6 +14,7 @@ #include "disk-io.h" #include "transaction.h" #include "dev-replace.h" +#include "block-group.h" #undef DEBUG @@ -638,6 +639,35 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } +static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + int ret; + + buf = btrfs_find_create_tree_block(fs_info, bytenr); + if (IS_ERR(buf)) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); + if (ret) { + free_extent_buffer_stale(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer_stale(buf); + return -EIO; + } else if (extent_buffer_uptodate(buf)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + static int reada_start_machine_dev(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f219851fa23630cd5b7a516ec03452d229912a2..2f0e25afa48605f1ff0938271c18f3f49cc95ce2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -21,6 +21,7 @@ #include "qgroup.h" #include "print-tree.h" #include "delalloc-space.h" +#include "block-group.h" /* * backref_node, mapping_node and tree_block start with this @@ -3311,7 +3312,7 @@ static int relocate_file_extent_cluster(struct inode *inode, } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - NULL, 0); + NULL); if (ret) { 
unlock_page(page); put_page(page); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 47733fb55df7f08da167777451ded39ba8eecff9..3b17b647d002f2c5cb747ade373500a0921f08b4 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -533,7 +533,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, return ret; } - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, items); rsv->space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); ret = btrfs_block_rsv_add(root, rsv, num_bytes, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0c99cf9fb595c25e8954223ef27ccc2da33f3777..f7d4e03f4c5d5ba78b5fd8e01b8ed3be3ea274a1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -18,6 +18,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "raid56.h" +#include "block-group.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c3c0c064c25dadd317841e1d2761e2629b6ec9f3..f3215028235c494c16d46b89c16733c7be4399d7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -260,6 +260,21 @@ struct name_cache_entry { char name[]; }; +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, +}; +typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, + struct btrfs_path *right_path, + struct btrfs_key *key, + enum btrfs_compare_tree_result result, + void *ctx); + __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, enum btrfs_compare_tree_result result, @@ -6514,6 +6529,366 @@ static int full_send_tree(struct send_ctx *sctx) return ret; } +static int tree_move_down(struct btrfs_path *path, int *level) +{ + struct extent_buffer *eb; + + BUG_ON(*level == 0); + eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; + path->slots[*level - 1] = 0; + (*level)--; + return 0; +} + +static int tree_move_next_or_upnext(struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. 
+ */ +static int tree_advance(struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(path, level, root_level); + } else { + ret = tree_move_down(path, level); + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. + */ +static int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, + btrfs_changed_cb_t changed_cb, void *ctx) +{ + struct btrfs_fs_info *fs_info = left_root->fs_info; + int ret; + int cmp; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_gen; + u64 right_gen; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + /* + * Strategy: Go to the first items of both trees. 
Then do
+	 *
+	 * If both trees are at level 0
+	 *   Compare keys of current items
+	 *     If left < right treat left item as new, advance left tree
+	 *       and repeat
+	 *     If left > right treat right item as deleted, advance right tree
+	 *       and repeat
+	 *     If left == right do deep compare of items, treat as changed if
+	 *       needed, advance both trees and repeat
+	 * If both trees are at the same level but not at level 0
+	 *   Compare keys of current nodes/leaves
+	 *     If left < right advance left tree and repeat
+	 *     If left > right advance right tree and repeat
+	 *     If left == right compare blockptrs of the next nodes/leaves
+	 *       If they match advance both trees but stay at the same level
+	 *         and repeat
+	 *       If they don't match advance both trees while allowing to go
+	 *         deeper and repeat
+	 * If tree levels are different
+	 *   Advance the tree that needs it and repeat
+	 *
+	 * Advancing a tree means:
+	 *   If we are at level 0, try to go to the next slot. If that's not
+	 *   possible, go one level up and repeat. Stop when we find a level
+	 *   where we can go to the next slot. We may at this point be on a
+	 *   node or a leaf.
+	 *
+	 *   If we are not at level 0 and not on shared tree blocks, go one
+	 *   level deeper.
+	 *
+	 *   If we are not at level 0 and on shared tree blocks, go one slot to
+	 *   the right if possible or go up and right.
+	 */
+
+	down_read(&fs_info->commit_root_sem);
+	left_level = btrfs_header_level(left_root->commit_root);
+	left_root_level = left_level;
+	left_path->nodes[left_level] =
+			btrfs_clone_extent_buffer(left_root->commit_root);
+	if (!left_path->nodes[left_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	right_level = btrfs_header_level(right_root->commit_root);
+	right_root_level = right_level;
+	right_path->nodes[right_level] =
+			btrfs_clone_extent_buffer(right_root->commit_root);
+	if (!right_path->nodes[right_level]) {
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOMEM;
+		goto out;
+	}
+	up_read(&fs_info->commit_root_sem);
+
+	if (left_level == 0)
+		btrfs_item_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	else
+		btrfs_node_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	if (right_level == 0)
+		btrfs_item_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+	else
+		btrfs_node_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+
+	left_end_reached = right_end_reached = 0;
+	advance_left = advance_right = 0;
+
+	while (1) {
+		cond_resched();
+		if (advance_left && !left_end_reached) {
+			ret = tree_advance(left_path, &left_level,
+					left_root_level,
+					advance_left != ADVANCE_ONLY_NEXT,
+					&left_key);
+			if (ret == -1)
+				left_end_reached = ADVANCE;
+			else if (ret < 0)
+				goto out;
+			advance_left = 0;
+		}
+		if (advance_right && !right_end_reached) {
+			ret = tree_advance(right_path, &right_level,
+					right_root_level,
+					advance_right != ADVANCE_ONLY_NEXT,
+					&right_key);
+			if (ret == -1)
+				right_end_reached = ADVANCE;
+			else if (ret < 0)
+				goto out;
+			advance_right = 0;
+		}
+
+		if (left_end_reached && right_end_reached) {
+			ret = 0;
+			goto out;
+		} else if (left_end_reached) {
+			if (right_level == 0) {
+				ret = changed_cb(left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_right = ADVANCE;
+			continue;
+		} else if (right_end_reached) {
+			if (left_level == 0) {
+				ret = changed_cb(left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if
(ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + enum btrfs_compare_tree_result result; + + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_path, right_path, + tmp_buf); + if (ret) + result = BTRFS_COMPARE_TREE_CHANGED; + else + result = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_path, right_path, + &left_key, result, ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + right_path->slots[right_level]); + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kvfree(tmp_buf); + return ret; +} + static int send_subvol(struct send_ctx *sctx) { int ret; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ab7b9ec4c240a6ea0b32d08a6027d28078b85363..98dc092a905e34648664c6d5db4eb2395fb90fca 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "misc.h" #include "ctree.h" #include "space-info.h" #include "sysfs.h" @@ -7,7 +8,7 @@ #include "free-space-cache.h" #include "ordered-data.h" #include "transaction.h" -#include "math.h" +#include "block-group.h" u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) @@ -33,23 +34,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -static const char *alloc_name(u64 flags) -{ - switch (flags) { - case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: - return "mixed"; - case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; - case BTRFS_BLOCK_GROUP_DATA: - return "data"; - case BTRFS_BLOCK_GROUP_SYSTEM: - return "system"; - default: - WARN_ON(1); - return "invalid-combination"; - }; -} - static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -79,13 +63,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); - ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - info->space_info_kobj, "%s", - alloc_name(space_info->flags)); - if (ret) { - 
kobject_put(&space_info->kobj); + ret = btrfs_sysfs_add_space_info_type(info, space_info); + if (ret) return ret; - } list_add_rcu(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -151,9 +131,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; - btrfs_space_info_add_new_bytes(info, found, - total_bytes - bytes_used - - bytes_readonly); + btrfs_try_granting_tickets(info, found); spin_unlock(&found->lock); *space_info = found; } @@ -187,9 +165,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush, bool system_chunk) { - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; - u64 space_size; u64 avail; u64 used; int factor; @@ -203,22 +179,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, else profile = btrfs_metadata_alloc_profile(fs_info); - used = btrfs_space_info_used(space_info, false); - - /* - * We only want to allow over committing if we have lots of actual space - * free, but if we don't have enough space to handle the global reserve - * space then we could end up having a real enospc problem when trying - * to allocate a chunk or some other such important allocation. - */ - spin_lock(&global_rsv->lock); - space_size = calc_global_rsv_need_space(global_rsv); - spin_unlock(&global_rsv->lock); - if (used + space_size >= space_info->total_bytes) - return 0; - - used += space_info->bytes_may_use; - + used = btrfs_space_info_used(space_info, true); avail = atomic64_read(&fs_info->free_chunk_space); /* @@ -249,103 +210,41 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. */ -void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) { - struct reserve_ticket *ticket; struct list_head *head; - u64 used; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; - bool check_overcommit = false; - spin_lock(&space_info->lock); - head = &space_info->priority_tickets; + lockdep_assert_held(&space_info->lock); - /* - * If we are over our limit then we need to check and see if we can - * overcommit, and if we can't then we just need to free up our space - * and not satisfy any requests. - */ - used = btrfs_space_info_used(space_info, true); - if (used - num_bytes >= space_info->total_bytes) - check_overcommit = true; + head = &space_info->priority_tickets; again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - /* - * We use 0 bytes because this space is already reserved, so - * adding the ticket space would be a double count. 
- */
-		if (check_overcommit &&
-		    !can_overcommit(fs_info, space_info, 0, flush, false))
-			break;
-		if (num_bytes >= ticket->bytes) {
-			list_del_init(&ticket->list);
-			num_bytes -= ticket->bytes;
-			ticket->bytes = 0;
-			space_info->tickets_id++;
-			wake_up(&ticket->wait);
-		} else {
-			ticket->bytes -= num_bytes;
-			num_bytes = 0;
-		}
-	}
-
-	if (num_bytes && head == &space_info->priority_tickets) {
-		head = &space_info->tickets;
-		flush = BTRFS_RESERVE_FLUSH_ALL;
-		goto again;
-	}
-	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
-	trace_btrfs_space_reservation(fs_info, "space_info",
-				      space_info->flags, num_bytes, 0);
-	spin_unlock(&space_info->lock);
-}
+	while (!list_empty(head)) {
+		struct reserve_ticket *ticket;
+		u64 used = btrfs_space_info_used(space_info, true);
 
-/*
- * This is for newly allocated space that isn't accounted in
- * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
- * we use this helper.
- */
-void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
-				    struct btrfs_space_info *space_info,
-				    u64 num_bytes)
-{
-	struct reserve_ticket *ticket;
-	struct list_head *head = &space_info->priority_tickets;
+		ticket = list_first_entry(head, struct reserve_ticket, list);
 
-again:
-	while (!list_empty(head) && num_bytes) {
-		ticket = list_first_entry(head, struct reserve_ticket,
-					  list);
-		if (num_bytes >= ticket->bytes) {
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      ticket->bytes, 1);
-			list_del_init(&ticket->list);
-			num_bytes -= ticket->bytes;
+		/* Check and see if our ticket can be satisfied now. */
+		if ((used + ticket->bytes <= space_info->total_bytes) ||
+		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
+				   false)) {
 			btrfs_space_info_update_bytes_may_use(fs_info,
 							      space_info,
 							      ticket->bytes);
+			list_del_init(&ticket->list);
 			ticket->bytes = 0;
 			space_info->tickets_id++;
 			wake_up(&ticket->wait);
 		} else {
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      num_bytes, 1);
-			btrfs_space_info_update_bytes_may_use(fs_info,
-							      space_info,
-							      num_bytes);
-			ticket->bytes -= num_bytes;
-			num_bytes = 0;
+			break;
 		}
 	}
 
-	if (num_bytes && head == &space_info->priority_tickets) {
+	if (head == &space_info->priority_tickets) {
 		head = &space_info->tickets;
+		flush = BTRFS_RESERVE_FLUSH_ALL;
 		goto again;
 	}
 }
@@ -359,14 +258,11 @@ do {								\
 	spin_unlock(&__rsv->lock);				\
 } while (0)
 
-void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
-			   struct btrfs_space_info *info, u64 bytes,
-			   int dump_block_groups)
+static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+				    struct btrfs_space_info *info)
 {
-	struct btrfs_block_group_cache *cache;
-	int index = 0;
+	lockdep_assert_held(&info->lock);
 
-	spin_lock(&info->lock);
 	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
 		   info->flags,
 		   info->total_bytes - btrfs_space_info_used(info, true),
@@ -376,7 +272,6 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
 		info->total_bytes, info->bytes_used, info->bytes_pinned,
 		info->bytes_reserved, info->bytes_may_use,
 		info->bytes_readonly);
-	spin_unlock(&info->lock);
 
 	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
 	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
@@ -384,6 +279,19 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
 	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
 	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
+void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
+			   struct btrfs_space_info *info, u64 bytes,
+			   int dump_block_groups)
+{
+	struct 
btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + __btrfs_dump_space_info(fs_info, info); + spin_unlock(&info->lock); + if (!dump_block_groups) return; @@ -432,7 +340,7 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 bytes; u64 nr; - bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); nr = div64_u64(to_reclaim, bytes); if (!nr) nr = 1; @@ -557,12 +465,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; u64 bytes_needed; u64 reclaim_bytes = 0; + u64 cur_free_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; spin_lock(&space_info->lock); + cur_free_bytes = btrfs_space_info_used(space_info, true); + if (cur_free_bytes < space_info->total_bytes) + cur_free_bytes = space_info->total_bytes - cur_free_bytes; + else + cur_free_bytes = 0; + if (!list_empty(&space_info->priority_tickets)) ticket = list_first_entry(&space_info->priority_tickets, struct reserve_ticket, list); @@ -570,6 +485,11 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); bytes_needed = (ticket) ? ticket->bytes : 0; + + if (bytes_needed > cur_free_bytes) + bytes_needed -= cur_free_bytes; + else + bytes_needed = 0; spin_unlock(&space_info->lock); if (!bytes_needed) @@ -684,7 +604,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, if (ret > 0 || ret == -ENOSPC) ret = 0; break; - case COMMIT_TRANS: + case RUN_DELAYED_IPUTS: /* * If we have pending delayed iputs then we could free up a * bunch of pinned space, so make sure we run the iputs before @@ -692,7 +612,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, */ btrfs_run_delayed_iputs(fs_info); btrfs_wait_on_delayed_iputs(fs_info); - + break; + case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info); break; default: @@ -762,19 +683,70 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static bool wake_all_tickets(struct list_head *head) +/* + * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * @fs_info - fs_info for this fs + * @space_info - the space info we were flushing + * + * We call this when we've exhausted our flushing ability and haven't made + * progress in satisfying tickets. The reservation code handles tickets in + * order, so if there is a large ticket first and then smaller ones we could + * very well satisfy the smaller tickets. This will attempt to wake up any + * tickets in the list to catch this case. + * + * This function returns true if it was able to make progress by clearing out + * other tickets, or if it stumbles across a ticket that was smaller than the + * first ticket. 
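+ *
+ * The caller must hold @space_info->lock; both the ticket list and
+ * btrfs_try_granting_tickets() rely on it (the latter asserts it via lockdep).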
+ */
+static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
+				   struct btrfs_space_info *space_info)
 {
 	struct reserve_ticket *ticket;
+	u64 tickets_id = space_info->tickets_id;
+	u64 first_ticket_bytes = 0;
+
+	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
+		__btrfs_dump_space_info(fs_info, space_info);
+	}
+
+	while (!list_empty(&space_info->tickets) &&
+	       tickets_id == space_info->tickets_id) {
+		ticket = list_first_entry(&space_info->tickets,
+					  struct reserve_ticket, list);
+
+		/*
+		 * may_commit_transaction will avoid committing the transaction
+		 * if it doesn't feel like the space reclaimed by the commit
+		 * would result in the ticket succeeding. However if we have a
+		 * smaller ticket in the queue it may be small enough to be
+		 * satisfied by committing the transaction, so if any
+		 * subsequent ticket is smaller than the first ticket go ahead
+		 * and send us back for another loop through the enospc flushing
+		 * code.
+		 */
+		if (first_ticket_bytes == 0)
+			first_ticket_bytes = ticket->bytes;
+		else if (first_ticket_bytes > ticket->bytes)
+			return true;
+
+		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+			btrfs_info(fs_info, "failing ticket with %llu bytes",
+				   ticket->bytes);
 
-	while (!list_empty(head)) {
-		ticket = list_first_entry(head, struct reserve_ticket, list);
 		list_del_init(&ticket->list);
 		ticket->error = -ENOSPC;
 		wake_up(&ticket->wait);
-		if (ticket->bytes != ticket->orig_bytes)
-			return true;
+
+		/*
+		 * We're just throwing tickets away, so more flushing may not
+		 * trip over btrfs_try_granting_tickets, so we need to call it
+		 * here to see if we can make progress with the next ticket in
+		 * the list.
+		 */
+		btrfs_try_granting_tickets(fs_info, space_info);
 	}
-	return false;
+	return (tickets_id != space_info->tickets_id);
 }
 
 /*
@@ -842,7 +814,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		if (flush_state > COMMIT_TRANS) {
 			commit_cycles++;
 			if (commit_cycles > 2) {
-				if (wake_all_tickets(&space_info->tickets)) {
+				if (maybe_fail_all_tickets(fs_info, space_info)) {
 					flush_state = FLUSH_DELAYED_ITEMS_NR;
 					commit_cycles--;
 				} else {
@@ -867,9 +839,22 @@ static const enum btrfs_flush_state priority_flush_states[] = {
 	ALLOC_CHUNK,
 };
 
+static const enum btrfs_flush_state evict_flush_states[] = {
+	FLUSH_DELAYED_ITEMS_NR,
+	FLUSH_DELAYED_ITEMS,
+	FLUSH_DELAYED_REFS_NR,
+	FLUSH_DELAYED_REFS,
+	FLUSH_DELALLOC,
+	FLUSH_DELALLOC_WAIT,
+	ALLOC_CHUNK,
+	COMMIT_TRANS,
+};
+
 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
-				struct btrfs_space_info *space_info,
-				struct reserve_ticket *ticket)
+				struct btrfs_space_info *space_info,
+				struct reserve_ticket *ticket,
+				const enum btrfs_flush_state *states,
+				int states_nr)
 {
 	u64 to_reclaim;
 	int flush_state;
@@ -885,8 +870,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 	flush_state = 0;
 	do {
-		flush_space(fs_info, space_info, to_reclaim,
-			    priority_flush_states[flush_state]);
+		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
 		flush_state++;
 		spin_lock(&space_info->lock);
 		if (ticket->bytes == 0) {
@@ -894,23 +878,22 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 			return;
 		}
 		spin_unlock(&space_info->lock);
-	} while (flush_state < ARRAY_SIZE(priority_flush_states));
+	} while (flush_state < states_nr);
 }
 
-static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
-			       struct btrfs_space_info *space_info,
-			       struct reserve_ticket *ticket)
+static void 
wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) { DEFINE_WAIT(wait); - u64 reclaim_bytes = 0; int ret = 0; spin_lock(&space_info->lock); while (ticket->bytes > 0 && ticket->error == 0) { ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); if (ret) { - ret = -EINTR; + ticket->error = -EINTR; break; } spin_unlock(&space_info->lock); @@ -920,17 +903,54 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, finish_wait(&ticket->wait, &wait); spin_lock(&space_info->lock); } - if (!ret) - ret = ticket->error; - if (!list_empty(&ticket->list)) - list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < ticket->orig_bytes) - reclaim_bytes = ticket->orig_bytes - ticket->bytes; spin_unlock(&space_info->lock); +} + +/** + * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket + * @fs_info - the fs + * @space_info - the space_info for the reservation + * @ticket - the ticket for the reservation + * @flush - how much we can flush + * + * This does the work of figuring out how to flush for the ticket, waiting for + * the reservation, and returning the appropriate error if there is one. + */ +static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + switch (flush) { + case BTRFS_RESERVE_FLUSH_ALL: + wait_reserve_ticket(fs_info, space_info, ticket); + break; + case BTRFS_RESERVE_FLUSH_LIMIT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_flush_states, + ARRAY_SIZE(priority_flush_states)); + break; + case BTRFS_RESERVE_FLUSH_EVICT: + priority_reclaim_metadata_space(fs_info, space_info, ticket, + evict_flush_states, + ARRAY_SIZE(evict_flush_states)); + break; + default: + ASSERT(0); + break; + } - if (reclaim_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - reclaim_bytes); + spin_lock(&space_info->lock); + ret = ticket->error; + if (ticket->bytes || ticket->error) { + list_del_init(&ticket->list); + if (!ret) + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket->list)); return ret; } @@ -956,8 +976,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, { struct reserve_ticket ticket; u64 used; - u64 reclaim_bytes = 0; int ret = 0; + bool pending_tickets; ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); @@ -965,18 +985,19 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); + pending_tickets = !list_empty(&space_info->tickets) || + !list_empty(&space_info->priority_tickets); /* * Carry on if we have enough space (short-circuit) OR call * can_overcommit() to ensure we can overcommit to continue. */ - if ((used + orig_bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { + if (!pending_tickets && + ((used + orig_bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); ret = 0; } @@ -988,7 +1009,6 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the list and we will do our own flushing further down. 
*/ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - ticket.orig_bytes = orig_bytes; ticket.bytes = orig_bytes; ticket.error = 0; init_waitqueue_head(&ticket.wait); @@ -1028,25 +1048,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) return ret; - if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket); - - ret = 0; - priority_reclaim_metadata_space(fs_info, space_info, &ticket); - spin_lock(&space_info->lock); - if (ticket.bytes) { - if (ticket.bytes < orig_bytes) - reclaim_bytes = orig_bytes - ticket.bytes; - list_del_init(&ticket.list); - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - reclaim_bytes); - ASSERT(list_empty(&ticket.list)); - return ret; + return handle_reserve_ticket(fs_info, space_info, &ticket, flush); } /** diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c2b54b8e1a147e4af3356d94f776ef3e27dde130..8867e84aa33dab3ea7355d0baac70461c606c9bd 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -70,7 +70,6 @@ struct btrfs_space_info { }; struct reserve_ticket { - u64 orig_bytes; u64 bytes; int error; struct list_head list; @@ -87,14 +86,18 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) * * Declare a helper function to detect underflow of various space info members */ -#define DECLARE_SPACE_INFO_UPDATE(name) \ +#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ + trace_btrfs_space_reservation(fs_info, trace_name, \ + sinfo->flags, abs_bytes, \ + bytes > 0); \ if (bytes < 0 && sinfo->name < -bytes) { \ WARN_ON(1); \ sinfo->name = 0; \ @@ -103,15 +106,9 @@ btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ sinfo->name += bytes; \ } -DECLARE_SPACE_INFO_UPDATE(bytes_may_use); -DECLARE_SPACE_INFO_UPDATE(bytes_pinned); +DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); -void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); -void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, @@ -129,5 +126,18 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); +void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); + +static inline void btrfs_space_info_free_bytes_may_use( + struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + spin_lock(&space_info->lock); + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); +} #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 4c13b737f568db9c2917ba4b610edd188e0528cb..73f7987143dff881050747bfefee26e5cc3c6a36 100644 --- 
a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -33,6 +33,8 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The extent buffer api is used to do the page spanning work required to * have a metadata blocksize different from the page size. + * + * There are 2 variants defined, one with a token pointer and one without. */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ @@ -50,8 +52,10 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ int size = sizeof(u##bits); \ u##bits res; \ \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ + ASSERT(token); \ + ASSERT(token->eb == eb); \ + \ + if (token->kaddr && token->offset <= offset && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ @@ -68,11 +72,33 @@ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ } \ p = kaddr + part_offset - map_start; \ res = get_unaligned_le##bits(p + off); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ + token->kaddr = kaddr; \ + token->offset = map_start; \ + return res; \ +} \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + u##bits res; \ + \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + \ + read_extent_buffer(eb, &leres, offset, size); \ + return le##bits##_to_cpu(leres); \ } \ + p = kaddr + part_offset - map_start; \ + res = get_unaligned_le##bits(p + off); \ return res; \ } \ void btrfs_set_token_##bits(struct extent_buffer *eb, \ @@ -89,8 +115,10 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ unsigned long map_len; \ int size = sizeof(u##bits); \ \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ + ASSERT(token); \ + ASSERT(token->eb == eb); \ + \ + if (token->kaddr && token->offset <= offset && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ @@ -108,11 +136,32 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ } \ p = kaddr + part_offset - map_start; \ put_unaligned_le##bits(val, p + off); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ + token->kaddr = kaddr; \ + token->offset = map_start; \ +} \ +void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)ptr; \ + unsigned long offset = part_offset + off; \ + void *p; \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + int size = sizeof(u##bits); \ + \ + err = map_private_extent_buffer(eb, offset, size, \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + \ + val2 = cpu_to_le##bits(val); \ + write_extent_buffer(eb, &val2, offset, size); \ + return; \ } \ + p = kaddr + part_offset - map_start; \ + put_unaligned_le##bits(val, p + off); \ } DEFINE_BTRFS_SETGET_BITS(8) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 78de9d5d80c6f0b63f19094c40b01f8569f5d447..1b151af2577288055d1743b23732294bf2864d13 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -43,7 +43,9 @@ #include 
"free-space-cache.h" #include "backref.h" #include "space-info.h" +#include "sysfs.h" #include "tests/btrfs-tests.h" +#include "block-group.h" #include "qgroup.h" #define CREATE_TRACE_POINTS @@ -1899,11 +1901,10 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 skip_space; u64 type; u64 avail_space; u64 min_stripe_size; - int min_stripes, num_stripes = 1; + int num_stripes = 1; int i = 0, nr_devices; const struct btrfs_raid_attr *rattr; @@ -1930,7 +1931,6 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; - min_stripes = rattr->devs_min; if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; @@ -1956,28 +1956,21 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); - avail_space *= BTRFS_STRIPE_LEN; + avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. + * + * This ensures we have at least min_stripe_size free space + * after excluding 1MB. */ - skip_space = SZ_1M; - - /* - * we can use the free space in [0, skip_space - 1], subtract - * it from the total. - */ - if (avail_space && avail_space >= skip_space) - avail_space -= skip_space; - else - avail_space = 0; - - if (avail_space < min_stripe_size) + if (avail_space <= SZ_1M + min_stripe_size) continue; + avail_space -= SZ_1M; + devices_info[i].dev = device; devices_info[i].max_avail = avail_space; @@ -1991,9 +1984,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, i = nr_devices - 1; avail_space = 0; - while (nr_devices >= min_stripes) { - if (num_stripes > nr_devices) - num_stripes = nr_devices; + while (nr_devices >= rattr->devs_min) { + num_stripes = min(num_stripes, nr_devices); if (devices_info[i].max_avail >= min_stripe_size) { int j; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9539f8143b7a5c2d433184536a049948fc8bcdb4..f6d3c80f2e289535fd7e32f47e03f33fdd4d1344 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -4,12 +4,11 @@ */ #include +#include #include #include #include -#include #include -#include #include "ctree.h" #include "disk-io.h" @@ -17,10 +16,75 @@ #include "sysfs.h" #include "volumes.h" #include "space-info.h" +#include "block-group.h" + +struct btrfs_feature_attr { + struct kobj_attribute kobj_attr; + enum btrfs_feature_set feature_set; + u64 feature_bit; +}; + +/* For raid type sysfs entries */ +struct raid_kobject { + u64 flags; + struct kobject kobj; +}; + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#define BTRFS_ATTR_PTR(_prefix, _name) \ + 
(&btrfs_attr_##_prefix##_##_name.attr) + +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ + .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ + btrfs_feature_attr_show, \ + btrfs_feature_attr_store), \ + .feature_set = _feature_set, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ +} +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) + +#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) +#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) +#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ + BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static struct btrfs_feature_attr *attr_to_btrfs_feature_attr( + struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) { @@ -247,6 +311,25 @@ static const struct attribute_group btrfs_static_feature_attr_group = { .attrs = btrfs_supported_static_feature_attrs, }; +#ifdef CONFIG_BTRFS_DEBUG + +/* + * Runtime debugging exported via sysfs + * + * /sys/fs/btrfs/debug - applies to module or all filesystems + * /sys/fs/btrfs/UUID - applies only to the given filesystem + */ +static struct attribute *btrfs_debug_feature_attrs[] = { + NULL +}; + +static const struct attribute_group btrfs_debug_feature_attr_group = { + .name = "debug", + .attrs = btrfs_debug_feature_attrs, +}; + +#endif + static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) { u64 val; @@ -316,7 +399,7 @@ static void release_raid_kobj(struct kobject *kobj) kfree(to_raid_kobj(kobj)); } -struct kobj_type btrfs_raid_ktype = { +static struct kobj_type btrfs_raid_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = release_raid_kobj, .default_groups = raid_groups, @@ -375,7 +458,7 @@ static void space_info_release(struct kobject *kobj) kfree(sinfo); } -struct kobj_type space_info_ktype = { +static struct kobj_type space_info_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = space_info_release, .default_groups = space_info_groups, @@ -655,12 +738,17 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } -const char * const btrfs_feature_set_names[FEAT_MAX] = { +static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_COMPAT] = "compat", [FEAT_COMPAT_RO] = "compat_ro", [FEAT_INCOMPAT] = "incompat", }; +const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +{ + return btrfs_feature_set_names[set]; +} + char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags) { size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */ @@ -730,6 +818,110 @@ static void init_feature_attrs(void) } } +/* + * Create a sysfs entry for a given block group type at path + * /sys/fs/btrfs/UUID/allocation/data/TYPE + */ +void btrfs_sysfs_add_block_group_type(struct 
btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; + struct raid_kobject *rkobj; + const int index = btrfs_bg_flags_to_raid_index(cache->flags); + unsigned int nofs_flag; + int ret; + + /* + * Setup a NOFS context because kobject_add(), deep in its call chain, + * does GFP_KERNEL allocations, and we are often called in a context + * where if reclaim is triggered we can deadlock (we are either holding + * a transaction handle or some lock required for a transaction + * commit). + */ + nofs_flag = memalloc_nofs_save(); + + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) { + memalloc_nofs_restore(nofs_flag); + btrfs_warn(cache->fs_info, + "couldn't alloc memory for raid level kobject"); + return; + } + + rkobj->flags = cache->flags; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", + btrfs_bg_type_to_raid_name(rkobj->flags)); + memalloc_nofs_restore(nofs_flag); + if (ret) { + kobject_put(&rkobj->kobj); + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); + return; + } + + space_info->block_group_kobjs[index] = &rkobj->kobj; +} + +/* + * Remove sysfs directories for all block group types of a given space info and + * the space info as well + */ +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) +{ + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + +/* + * Create a sysfs entry for a space info type at path + * /sys/fs/btrfs/UUID/allocation/TYPE + */ +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + int ret; + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + fs_info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + kobject_put(&space_info->kobj); + return ret; + } + + return 0; +} + /* when one_device is NULL, it removes all device links */ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, @@ -806,14 +998,34 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, return error; } -/* /sys/fs/btrfs/ entry */ -static struct kset *btrfs_kset; +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) +{ + int ret; -/* /sys/kernel/debug/btrfs */ -static struct dentry *btrfs_debugfs_root_dentry; + ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); + if (ret) + pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), + &disk_to_dev(bdev->bd_disk)->kobj); +} -/* Debugging tunables and exported data */ -u64 btrfs_debugfs_test; +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, + const u8 *fsid) +{ + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + + /* + * Sprouting changes fsid of the mounted filesystem, rename the fsid + * directory + 
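+	 * so that its name matches the new filesystem UUID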
*/ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) + btrfs_warn(fs_devices->fs_info, + "sysfs: failed to create fsid for sprout"); +} + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; /* * Can be called by the device discovery thread. @@ -859,6 +1071,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) if (error) goto failure; +#ifdef CONFIG_BTRFS_DEBUG + error = sysfs_create_group(fsid_kobj, + &btrfs_debug_feature_attr_group); + if (error) + goto failure; +#endif + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; @@ -913,25 +1132,6 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); } -static void btrfs_init_debugfs(void) -{ -#ifdef CONFIG_DEBUG_FS - btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); - - /* - * Example code, how to export data through debugfs. - * - * file: /sys/kernel/debug/btrfs/test - * contents of: btrfs_debugfs_test - */ -#ifdef CONFIG_BTRFS_DEBUG - debugfs_create_u64("test", S_IRUGO | S_IWUSR, btrfs_debugfs_root_dentry, - &btrfs_debugfs_test); -#endif - -#endif -} - int __init btrfs_init_sysfs(void) { int ret; @@ -940,8 +1140,6 @@ int __init btrfs_init_sysfs(void) if (!btrfs_kset) return -ENOMEM; - btrfs_init_debugfs(); - init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); if (ret) @@ -951,12 +1149,17 @@ int __init btrfs_init_sysfs(void) if (ret) goto out_remove_group; +#ifdef CONFIG_BTRFS_DEBUG + ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); + if (ret) + goto out2; +#endif + return 0; out_remove_group: sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); out2: - debugfs_remove_recursive(btrfs_debugfs_root_dentry); kset_unregister(btrfs_kset); return ret; @@ -968,6 +1171,5 @@ void __cold btrfs_exit_sysfs(void) &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); - debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 40716b357c1d516193958479ea28702b0014003b..610e9c36a94cd17423b388598b7be1dbe1e0ce53 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -3,10 +3,7 @@ #ifndef BTRFS_SYSFS_H #define BTRFS_SYSFS_H -/* - * Data exported through sysfs - */ -extern u64 btrfs_debugfs_test; +#include enum btrfs_feature_set { FEAT_COMPAT, @@ -15,71 +12,8 @@ enum btrfs_feature_set { FEAT_MAX }; -#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ -{ \ - .attr = { .name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ -} - -#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ - static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ - __INIT_KOBJ_ATTR(_name, 0644, _show, _store) - -#define BTRFS_ATTR(_prefix, _name, _show) \ - static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) - -#define BTRFS_ATTR_PTR(_prefix, _name) \ - (&btrfs_attr_##_prefix##_##_name.attr) - - -struct btrfs_feature_attr { - struct kobj_attribute kobj_attr; - enum btrfs_feature_set feature_set; - u64 feature_bit; -}; - -#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ -static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ - .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ - btrfs_feature_attr_show, \ - btrfs_feature_attr_store), \ - 
.feature_set = _feature_set, \ - .feature_bit = _feature_prefix ##_## _feature_bit, \ -} -#define BTRFS_FEAT_ATTR_PTR(_name) \ - (&btrfs_attr_features_##_name.kobj_attr.attr) - -#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) -#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) -#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) - -/* convert from attribute */ -static inline struct btrfs_feature_attr * -to_btrfs_feature_attr(struct kobj_attribute *a) -{ - return container_of(a, struct btrfs_feature_attr, kobj_attr); -} - -static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) -{ - return container_of(attr, struct kobj_attribute, attr); -} - -static inline struct btrfs_feature_attr * -attr_to_btrfs_feature_attr(struct attribute *attr) -{ - return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); -} - char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -extern const char * const btrfs_feature_set_names[FEAT_MAX]; -extern struct kobj_type space_info_ktype; -extern struct kobj_type btrfs_raid_ktype; +const char * const btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, @@ -88,7 +22,19 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, + const u8 *fsid); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); +void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); + +int __init btrfs_init_sysfs(void); +void __cold btrfs_exit_sysfs(void); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache); +int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info); +void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 1e3ba4949399536929b2d6a951d04aec69357fb5..b5e80563efaa4fd616e6a1927dd35592e1bac118 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -15,6 +15,7 @@ #include "../volumes.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../block-group.h" static struct vfsmount *test_mnt = NULL; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 1bf6b5a79191bc3339853a5e38694ffc8bff2fb7..123d9a614357278d4f35c20890f85bad3fb8c690 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -438,6 +438,7 @@ static int test_find_first_clear_extent_bit(void) { struct extent_io_tree tree; u64 start, end; + int ret = -EINVAL; test_msg("running find_first_clear_extent_bit test"); extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); @@ -452,9 +453,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); - if 
(start != 0 || end != SZ_1M -1) + if (start != 0 || end != SZ_1M - 1) { test_err("error finding beginning range: start %llu end %llu", start, end); + goto out; + } /* Now add 32M-64M so that we have a hole between 4M-32M */ set_extent_bits(&tree, SZ_32M, SZ_64M - 1, @@ -466,9 +469,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); - if (start != SZ_4M || end != SZ_32M - 1) + if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding trimmed range: start %llu end %llu", start, end); + goto out; + } /* * Search in the middle of allocated range, should get the next one @@ -477,9 +482,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); - if (start != SZ_4M || end != SZ_32M -1) + if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding next unalloc range: start %llu end %llu", start, end); + goto out; + } /* * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag @@ -489,9 +496,11 @@ static int test_find_first_clear_extent_bit(void) find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, CHUNK_TRIMMED); - if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding exact range: start %llu end %llu", start, end); + goto out; + } find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, CHUNK_TRIMMED); @@ -500,21 +509,29 @@ static int test_find_first_clear_extent_bit(void) * Search in the middle of set range whose immediate neighbour doesn't * have the bits set so it must be returned */ - if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding next alloc range: start %llu end %llu", start, end); + goto out; + } /* * Search beyond any known range, shall return after last known range * and end should be -1 */ find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); - if (start != SZ_64M + SZ_8M || end != -1) + if (start != SZ_64M + SZ_8M || end != -1) { test_err( "error handling beyond end of range search: start %llu end %llu", start, end); + goto out; + } - return 0; + ret = 0; +out: + clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + + return ret; } int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index af89f66f9e6365663b1c2a8453241e6010f8630f..43ec7060fcd2982f7a49aec2ae97306fa1d6e663 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../disk-io.h" #include "../free-space-cache.h" +#include "../block-group.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index a90dad166971bdcfa619e2c39308bf10963abaf0..bc92df9776308d0e309f61747b0343ac9b5c3985 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -9,6 +9,7 @@ #include "../disk-io.h" #include "../free-space-tree.h" #include "../transaction.h" +#include "../block-group.h" struct free_space_extent { u64 start; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index bc6dbd1b42fd0e6275897217ce36011e6f3b1558..09ecf7dc7b0829b535b869f1350d76c75938bca0 100644 --- a/fs/btrfs/tests/inode-tests.c +++ 
b/fs/btrfs/tests/inode-tests.c @@ -957,7 +957,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, - NULL, 0); + NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -972,7 +972,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -988,8 +988,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1005,7 +1004,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1023,7 +1022,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - 0, NULL, 0); + 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1040,7 +1039,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1056,8 +1055,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1075,7 +1073,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; @@ -1089,8 +1087,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1105,8 +1102,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 
e3adb714c04b364869e9da5c36e2f9fa0157b7fb..8624bdee8c5b62d3190605f56b0d85dabbc6230a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include #include #include +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -19,6 +20,7 @@ #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" +#include "block-group.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -484,7 +486,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * worth of delayed refs updates in this trans handle, and * refill that amount for whatever is missing in the reserve. */ - num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (delayed_refs_rsv->full == 0) { delayed_refs_bytes = num_bytes; num_bytes <<= 1; @@ -635,7 +637,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( if (IS_ERR(trans)) return trans; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv, num_bytes, min_factor); if (ret) { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ccd5706199d76da7d0f1db7e900bb69f8fb9bd16..43e488f5d06314645440318519e4021c7a4d1657 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -821,6 +821,417 @@ static int check_inode_item(struct extent_buffer *leaf, return 0; } +static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_root_item ri; + const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | + BTRFS_ROOT_SUBVOL_DEAD; + + /* No such tree id */ + if (key->objectid == 0) { + generic_err(leaf, slot, "invalid root id 0"); + return -EUCLEAN; + } + + /* + * Some older kernel may create ROOT_ITEM with non-zero offset, so here + * we only check offset for reloc tree whose key->offset must be a + * valid tree. 
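+	 *
+	 * For a reloc tree, key->offset is the objectid of the root being
+	 * relocated, which must never be 0.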
+ */ + if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + generic_err(leaf, slot, "invalid root id 0 for reloc tree"); + return -EUCLEAN; + } + + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + generic_err(leaf, slot, + "invalid root item size, have %u expect %zu", + btrfs_item_size_nr(leaf, slot), sizeof(ri)); + return -EUCLEAN; + } + + read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), + sizeof(ri)); + + /* Generation related */ + if (btrfs_root_generation(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root generation, have %llu expect (0, %llu]", + btrfs_root_generation(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (btrfs_root_generation_v2(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root v2 generation, have %llu expect (0, %llu]", + btrfs_root_generation_v2(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (btrfs_root_last_snapshot(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1) { + generic_err(leaf, slot, + "invalid root last_snapshot, have %llu expect (0, %llu]", + btrfs_root_last_snapshot(&ri), + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + + /* Alignment and level check */ + if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) { + generic_err(leaf, slot, + "invalid root bytenr, have %llu expect to be aligned to %u", + btrfs_root_bytenr(&ri), fs_info->sectorsize); + return -EUCLEAN; + } + if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + if (ri.drop_level >= BTRFS_MAX_LEVEL) { + generic_err(leaf, slot, + "invalid root level, have %u expect [0, %u]", + ri.drop_level, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* Flags check */ + if (btrfs_root_flags(&ri) & ~valid_root_flags) { + generic_err(leaf, slot, + "invalid root flags, have 0x%llx expect mask 0x%llx", + btrfs_root_flags(&ri), valid_root_flags); + return -EUCLEAN; + } + return 0; +} + +__printf(3,4) +__cold +static void extent_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + u64 bytenr; + u64 len; + + btrfs_item_key_to_cpu(eb, &key, slot); + bytenr = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_TREE_BLOCK_REF_KEY || + key.type == BTRFS_SHARED_BLOCK_REF_KEY) + len = eb->fs_info->nodesize; + else + len = key.offset; + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(eb->fs_info, + "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV", + btrfs_header_level(eb) == 0 ?
"leaf" : "node", + eb->start, slot, bytenr, len, &vaf); + va_end(args); +} + +static int check_extent_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_extent_item *ei; + bool is_tree_block = false; + unsigned long ptr; /* Current pointer inside inline refs */ + unsigned long end; /* Extent item end */ + const u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 flags; + u64 generation; + u64 total_refs; /* Total refs in btrfs_extent_item */ + u64 inline_refs = 0; /* found total inline refs */ + + if (key->type == BTRFS_METADATA_ITEM_KEY && + !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { + generic_err(leaf, slot, +"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled"); + return -EUCLEAN; + } + /* key->objectid is the bytenr for both key types */ + if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) { + generic_err(leaf, slot, + "invalid key objectid, have %llu expect to be aligned to %u", + key->objectid, fs_info->sectorsize); + return -EUCLEAN; + } + + /* key->offset is tree level for METADATA_ITEM_KEY */ + if (key->type == BTRFS_METADATA_ITEM_KEY && + key->offset >= BTRFS_MAX_LEVEL) { + extent_err(leaf, slot, + "invalid tree level, have %llu expect [0, %u]", + key->offset, BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + + /* + * EXTENT/METADATA_ITEM consists of: + * 1) One btrfs_extent_item + * Records the total refs, type and generation of the extent. + * + * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only) + * Records the first key and level of the tree block. + * + * 2) Zero or more btrfs_extent_inline_ref(s) + * Each inline ref has one btrfs_extent_inline_ref shows: + * 2.1) The ref type, one of the 4 + * TREE_BLOCK_REF Tree block only + * SHARED_BLOCK_REF Tree block only + * EXTENT_DATA_REF Data only + * SHARED_DATA_REF Data only + * 2.2) Ref type specific data + * Either using btrfs_extent_inline_ref::offset, or specific + * data structure. 
+ */ + if (item_size < sizeof(*ei)) { + extent_err(leaf, slot, + "invalid item size, have %u expect [%zu, %u)", + item_size, sizeof(*ei), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } + end = item_size + btrfs_item_ptr_offset(leaf, slot); + + /* Checks against extent_item */ + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + total_refs = btrfs_extent_refs(leaf, ei); + generation = btrfs_extent_generation(leaf, ei); + if (generation > btrfs_super_generation(fs_info->super_copy) + 1) { + extent_err(leaf, slot, + "invalid generation, have %llu expect (0, %llu]", + generation, + btrfs_super_generation(fs_info->super_copy) + 1); + return -EUCLEAN; + } + if (!is_power_of_2(flags & (BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK))) { + extent_err(leaf, slot, + "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", + flags, BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK); + return -EUCLEAN; + } + is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK); + if (is_tree_block) { + if (key->type == BTRFS_EXTENT_ITEM_KEY && + key->offset != fs_info->nodesize) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect %u", + key->offset, fs_info->nodesize); + return -EUCLEAN; + } + } else { + if (key->type != BTRFS_EXTENT_ITEM_KEY) { + extent_err(leaf, slot, + "invalid key type, have %u expect %u for data backref", + key->type, BTRFS_EXTENT_ITEM_KEY); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid extent length, have %llu expect aligned to %u", + key->offset, fs_info->sectorsize); + return -EUCLEAN; + } + } + ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); + + /* Check the special case of btrfs_tree_block_info */ + if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)ptr; + if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) { + extent_err(leaf, slot, + "invalid tree block info level, have %u expect [0, %u]", + btrfs_tree_block_level(leaf, info), + BTRFS_MAX_LEVEL - 1); + return -EUCLEAN; + } + ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1); + } + + /* Check inline refs */ + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u64 dref_offset; + u64 inline_offset; + u8 inline_type; + + if (ptr + sizeof(*iref) > end) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %zu end %lu", + ptr, sizeof(*iref), end); + return -EUCLEAN; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + inline_type = btrfs_extent_inline_ref_type(leaf, iref); + inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) { + extent_err(leaf, slot, +"inline ref item overflows extent item, ptr %lu iref size %u end %lu", + ptr, btrfs_extent_inline_ref_size(inline_type), end); + return -EUCLEAN; + } + + switch (inline_type) { + /* inline_offset is subvolid of the owner, no need to check */ + case BTRFS_TREE_BLOCK_REF_KEY: + inline_refs++; + break; + /* Contains parent bytenr */ + case BTRFS_SHARED_BLOCK_REF_KEY: + if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs++; + break; + /* + * Contains owner subvolid, owner key objectid,
adjusted offset. + * The only obvious corruption can happen in that offset. + */ + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + dref_offset = btrfs_extent_data_ref_offset(leaf, dref); + if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid data ref offset, have %llu expect aligned to %u", + dref_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_extent_data_ref_count(leaf, dref); + break; + /* Contains parent bytenr and ref count */ + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid data parent bytenr, have %llu expect aligned to %u", + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } + inline_refs += btrfs_shared_data_ref_count(leaf, sref); + break; + default: + extent_err(leaf, slot, "unknown inline ref type: %u", + inline_type); + return -EUCLEAN; + } + ptr += btrfs_extent_inline_ref_size(inline_type); + } + /* No padding is allowed */ + if (ptr != end) { + extent_err(leaf, slot, + "invalid extent item size, padding bytes found"); + return -EUCLEAN; + } + + /* Finally, check the inline refs against total refs */ + if (inline_refs > total_refs) { + extent_err(leaf, slot, + "invalid extent refs, have %llu expect >= inline %llu", + total_refs, inline_refs); + return -EUCLEAN; + } + return 0; +} + +static int check_simple_keyed_refs(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 expect_item_size = 0; + + if (key->type == BTRFS_SHARED_DATA_REF_KEY) + expect_item_size = sizeof(struct btrfs_shared_data_ref); + + if (btrfs_item_size_nr(leaf, slot) != expect_item_size) { + generic_err(leaf, slot, + "invalid item size, have %u expect %u for key type %u", + btrfs_item_size_nr(leaf, slot), + expect_item_size, key->type); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + generic_err(leaf, slot, +"invalid key objectid for shared block ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + if (key->type != BTRFS_TREE_BLOCK_REF_KEY && + !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid tree parent bytenr, have %llu expect aligned to %u", + key->offset, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + return 0; +} + +static int check_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_extent_data_ref *dref; + unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); + const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); + + if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) { + generic_err(leaf, slot, + "invalid item size, have %u expect aligned to %zu for key type %u", + btrfs_item_size_nr(leaf, slot), + sizeof(*dref), key->type); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + generic_err(leaf, slot, +"invalid key objectid for extent data ref, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + for (; ptr < end; ptr += sizeof(*dref)) { + u64 root_objectid; + u64 owner; + u64 offset; + u64 hash; + + dref = (struct btrfs_extent_data_ref *)ptr; + root_objectid = btrfs_extent_data_ref_root(leaf, dref); + owner = btrfs_extent_data_ref_objectid(leaf, dref); + offset = btrfs_extent_data_ref_offset(leaf, dref); + hash = hash_extent_data_ref(root_objectid, owner, offset); + if
(hash != key->offset) { + extent_err(leaf, slot, + "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx", + hash, key->offset); + return -EUCLEAN; + } + if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) { + extent_err(leaf, slot, + "invalid extent data backref offset, have %llu expect aligned to %u", + offset, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + } + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -856,6 +1267,21 @@ static int check_leaf_item(struct extent_buffer *leaf, case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(leaf, key, slot); break; + case BTRFS_ROOT_ITEM_KEY: + ret = check_root_item(leaf, key, slot); + break; + case BTRFS_EXTENT_ITEM_KEY: + case BTRFS_METADATA_ITEM_KEY: + ret = check_extent_item(leaf, key, slot); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = check_simple_keyed_refs(leaf, key, slot); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + ret = check_extent_data_ref(leaf, key, slot); + break; } return ret; } @@ -899,6 +1325,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) owner); return -EUCLEAN; } + /* Unknown tree */ + if (owner == 0) { + generic_err(leaf, 0, + "invalid owner, root 0 is not defined"); + return -EUCLEAN; + } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1bfd7e34f31e33709f0e937f83c6d978d74564c6..29b82a7955227bc68daa7628ceeceff95e2204ee 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -8,6 +8,7 @@ #include <linux/blkdev.h> #include <linux/list_sort.h> #include <linux/iversion.h> +#include "misc.h" #include "ctree.h" #include "tree-log.h" #include "disk-io.h" @@ -24,10 +25,12 @@ * LOG_INODE_EXISTS means to log just enough to recreate the inode * during log replay */ -#define LOG_INODE_ALL 0 -#define LOG_INODE_EXISTS 1 -#define LOG_OTHER_INODE 2 -#define LOG_OTHER_INODE_ALL 3 +enum { + LOG_INODE_ALL, + LOG_INODE_EXISTS, + LOG_OTHER_INODE, + LOG_OTHER_INODE_ALL, +}; /* * directory trouble cases @@ -81,10 +84,12 @@ * The last stage is to deal with directories and links and extents * and all the other fun semantics */ -#define LOG_WALK_PIN_ONLY 0 -#define LOG_WALK_REPLAY_INODES 1 -#define LOG_WALK_REPLAY_DIR_INDEX 2 -#define LOG_WALK_REPLAY_ALL 3 +enum { + LOG_WALK_PIN_ONLY, + LOG_WALK_REPLAY_INODES, + LOG_WALK_REPLAY_DIR_INDEX, + LOG_WALK_REPLAY_ALL, +}; static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -188,10 +193,6 @@ static int join_running_log_trans(struct btrfs_root *root) { int ret = -ENOENT; - smp_mb(); - if (!root->log_root) - return -ENOENT; - mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; @@ -505,7 +506,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, ino_size != 0) { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, dst_eb); btrfs_set_token_inode_size(dst_eb, dst_item, ino_size, &token); } @@ -967,7 +968,7 @@ static noinline int backref_in_log(struct btrfs_root *log, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, namelen, NULL)) + name, namelen)) match = 1; goto out; @@ -1266,12 +1267,12 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, name, + namelen); else - ret = btrfs_find_name_in_backref(log_eb,
log_slot, name, - namelen, NULL); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, + name, namelen); if (!ret) { struct inode *dir; @@ -1333,12 +1334,11 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, goto out; } if (key.type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], parent_id, - name, namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], parent_id, name, namelen); else - ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, namelen, NULL); + ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, namelen); out: btrfs_free_path(path); @@ -3842,7 +3842,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); if (log_inode_only) { /* set the generation to zero so the recover code @@ -4302,8 +4302,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - btrfs_init_map_token(&token); - ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, em->start + em->len, NULL, 0, 1, sizeof(*fi), &extent_inserted); @@ -4321,6 +4319,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, return ret; } leaf = path->nodes[0]; + btrfs_init_map_token(&token, leaf); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -6233,7 +6232,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { .process_func = process_one_buffer, - .stage = 0, + .stage = LOG_WALK_PIN_ONLY, }; path = btrfs_alloc_path(); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a447d3ec48d506b848e1ab228fc0ce55fb0f87c7..a324480bc88b9827f13f72d94beb76a9d7b3fb61 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,6 +14,7 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include "misc.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -24,11 +25,11 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" -#include "math.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" #include "space-info.h" +#include "block-group.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -190,7 +191,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -358,19 +358,6 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) kfree(fs_devices); } -static void btrfs_kobject_uevent(struct block_device *bdev, - enum kobject_action action) -{ - int ret; - - ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); - if (ret) - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", - action, - kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), - &disk_to_dev(bdev->bd_disk)->kobj); -} - void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; @@ -1128,6 +1115,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_fs_devices *fs_devices; struct btrfs_device
*device; struct btrfs_device *orig_dev; + int ret = 0; fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) @@ -1141,8 +1129,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid); - if (IS_ERR(device)) + if (IS_ERR(device)) { + ret = PTR_ERR(device); goto error; + } /* * This is ok to do without rcu read locked because we hold the @@ -1153,6 +1143,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) GFP_KERNEL); if (!name) { btrfs_free_device(device); + ret = -ENOMEM; goto error; } rcu_assign_pointer(device->name, name); @@ -1167,7 +1158,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) error: mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } /* @@ -1551,9 +1542,16 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, * @len is used to store the size of the free space that we find. * But if we don't find suitable free space, it is used to store the size of * the max free space. + * + * NOTE: This function will search the *commit* root of the device tree, and + * does an extra check to ensure dev extents are not double allocated. + * This makes the function safe to allocate dev extents but may not report + * correct usable device space, as device extents freed in the current + * transaction are not reported as available. */ -int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *len) +static int find_free_dev_extent_start(struct btrfs_device *device, + u64 num_bytes, u64 search_start, u64 *start, + u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -1855,7 +1853,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* Corruption */ + btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); + ret = -EUCLEAN; + goto error; + } ret = btrfs_previous_item(fs_info->chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID, @@ -2686,22 +2689,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } if (seeding_dev) { - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; - ret = btrfs_finish_sprout(trans); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } - /* Sprouting would change fsid of the mounted root, - * so rename the fsid on the sysfs - */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fs_devices->fsid); - if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) - btrfs_warn(fs_info, - "sysfs: failed to create fsid for sprout"); + btrfs_sysfs_update_sprout_fsid(fs_devices, + fs_info->fs_devices->fsid); } ret = btrfs_commit_transaction(trans); @@ -3076,10 +3071,6 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) */ lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); - ret = btrfs_can_relocate(fs_info, chunk_offset); - if (ret) - return -ENOSPC; - /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); ret = btrfs_relocate_block_group(fs_info, chunk_offset); @@ -6011,7 +6002,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - u64 offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; @@ -6042,11 +6032,10 @@ static int __btrfs_map_block(struct
btrfs_fs_info *fs_info, return ret; em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(em); + ASSERT(!IS_ERR(em)); map = em->map_lookup; *length = geom.len; - offset = geom.offset; stripe_len = geom.stripe_len; stripe_nr = geom.stripe_nr; stripe_offset = geom.stripe_offset; @@ -7296,18 +7285,32 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) } } -static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, + const struct btrfs_dev_stats_item *ptr, + int index) { - int i; + u64 val; - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_reset(dev, i); + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { struct btrfs_key key; - struct btrfs_key found_key; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct extent_buffer *eb; @@ -7318,10 +7321,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) int i; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { @@ -7333,14 +7334,14 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { - __btrfs_reset_dev_stats(device); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); device->dev_stats_valid = 1; btrfs_release_path(path); continue; } slot = path->slots[0]; eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); item_size = btrfs_item_size_nr(eb, slot); ptr = btrfs_item_ptr(eb, slot, @@ -7351,7 +7352,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) btrfs_dev_stat_set(device, i, btrfs_dev_stats_value(eb, ptr, i)); else - btrfs_dev_stat_reset(device, i); + btrfs_dev_stat_set(device, i, 0); } device->dev_stats_valid = 1; @@ -7360,7 +7361,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); -out: btrfs_free_path(path); return ret < 0 ? 
ret : 0; } @@ -7534,7 +7534,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, stats->values[i] = btrfs_dev_stat_read_and_reset(dev, i); else - btrfs_dev_stat_reset(dev, i); + btrfs_dev_stat_set(dev, i, 0); } } else { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7f6aa1816409dca00c4024fd6e488af29b26f29c..a7da1f3e362758d44c6500785b2150a4e29d3388 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -82,7 +82,6 @@ struct btrfs_device { unsigned long dev_state; blk_status_t last_flush_error; - int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -475,8 +474,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *max_avail); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); @@ -550,12 +547,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } -static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, - int index) -{ - btrfs_dev_stat_set(dev, index, 0); -} - /* * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which * can be used as index to access btrfs_raid_array[]. diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b86b7ad6b900a6d2785eb77283cf148cc3d3258f..df1aace5df5010969b9fb001a68444d46680cbe3 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -418,14 +418,6 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, return ret; } -static unsigned int zlib_set_level(unsigned int level) -{ - if (!level) - return BTRFS_ZLIB_DEFAULT_LEVEL; - - return min_t(unsigned int, level, 9); -} - const struct btrfs_compress_op btrfs_zlib_compress = { .init_workspace_manager = zlib_init_workspace_manager, .cleanup_workspace_manager = zlib_cleanup_workspace_manager, @@ -436,5 +428,6 @@ const struct btrfs_compress_op btrfs_zlib_compress = { .compress_pages = zlib_compress_pages, .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, - .set_level = zlib_set_level, + .max_level = 9, + .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 3837ca180d52d67ec1c59459ed3a5c0e0b40d378..764d47b107e569e35be29665c04869a66971b96b 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/zstd.h> +#include "misc.h" #include "compression.h" #include "ctree.h" @@ -710,14 +711,6 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, return ret; } -static unsigned int zstd_set_level(unsigned int level) -{ - if (!level) - return ZSTD_BTRFS_DEFAULT_LEVEL; - - return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL); -} - const struct btrfs_compress_op btrfs_zstd_compress = { .init_workspace_manager = zstd_init_workspace_manager, .cleanup_workspace_manager = zstd_cleanup_workspace_manager, @@ -728,5 +721,6 @@ const struct btrfs_compress_op btrfs_zstd_compress = { .compress_pages = zstd_compress_pages, .decompress_bio = zstd_decompress_bio, .decompress = zstd_decompress, - .set_level = zstd_set_level, + .max_level = ZSTD_BTRFS_MAX_LEVEL, + .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, };
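The zlib and zstd hunks above drop the per-algorithm set_level() callbacks and instead expose max_level/default_level fields on btrfs_compress_op, turning the level policy into data so one shared helper can do the clamping for every backend. Below is a minimal user-space sketch of the kind of helper this enables; the helper name compress_set_level(), the compress_ops[] lookup table and the literal limits (zlib default 3/max 9, zstd default 3/max 15, intended to mirror BTRFS_ZLIB_DEFAULT_LEVEL, ZSTD_BTRFS_DEFAULT_LEVEL and ZSTD_BTRFS_MAX_LEVEL) are illustrative assumptions, not taken from this patch:

#include <stdio.h>

/* Sketch of the two fields this patch adds to btrfs_compress_op. */
struct compress_op {
	unsigned int max_level;
	unsigned int default_level;
};

/* Assumed limits, mirroring the zlib/zstd macros used above. */
static const struct compress_op zlib_ops = { .max_level = 9,  .default_level = 3 };
static const struct compress_op zstd_ops = { .max_level = 15, .default_level = 3 };

/* Hypothetical lookup table indexed by a compression type id. */
enum { TYPE_ZLIB, TYPE_ZSTD };
static const struct compress_op *compress_ops[] = {
	[TYPE_ZLIB] = &zlib_ops,
	[TYPE_ZSTD] = &zstd_ops,
};

/*
 * One shared helper replaces zlib_set_level() and zstd_set_level():
 * level 0 selects the backend's default, anything larger is capped
 * at the backend's maximum.
 */
static unsigned int compress_set_level(int type, unsigned int level)
{
	const struct compress_op *ops = compress_ops[type];

	if (level == 0)
		return ops->default_level;
	return level < ops->max_level ? level : ops->max_level;
}

int main(void)
{
	printf("zlib  0 -> %u\n", compress_set_level(TYPE_ZLIB, 0));  /* default: 3 */
	printf("zlib 42 -> %u\n", compress_set_level(TYPE_ZLIB, 42)); /* capped: 9 */
	printf("zstd  7 -> %u\n", compress_set_level(TYPE_ZSTD, 7));  /* unchanged: 7 */
	return 0;
}

Keeping the limits as two integers means a new compression backend only has to fill in two struct fields, while the 0-means-default and clamp-to-max rules live in a single place instead of being duplicated per algorithm.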
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 2f6a669408bbd85d2bdc31787f0d59b896f9c03f..5df604de4f11937679c628c967de885cbbf7634f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1088,6 +1088,7 @@ TRACE_EVENT(btrfs_trigger_flush, { FLUSH_DELAYED_REFS, "FLUSH_DELAYED_REFS"}, \ { ALLOC_CHUNK, "ALLOC_CHUNK"}, \ { ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE"}, \ + { RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS"}, \ { COMMIT_TRANS, "COMMIT_TRANS"}) TRACE_EVENT(btrfs_flush_space, @@ -2086,8 +2087,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write); -DEFINE_BTRFS_LOCK_EVENT(btrfs_clear_lock_blocking_read); -DEFINE_BTRFS_LOCK_EVENT(btrfs_clear_lock_blocking_write); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index c195896d478f295ee80b667597a6c400696ac0dc..3ee0678c0a8355ca8aa8495733e231a013042695 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -665,7 +665,12 @@ struct btrfs_ioctl_get_dev_stats { /* out values: */ __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; - __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ + /* + * This pads the struct to 1032 bytes. It was originally meant to pad to + * 1024 bytes, but when adding the flags field, the padding calculation + * was not adjusted. + */ + __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; }; #define BTRFS_QUOTA_CTL_ENABLE 1 @@ -917,10 +922,8 @@ enum btrfs_err_code { #define BTRFS_IOC_QUOTA_RESCAN_STATUS _IOR(BTRFS_IOCTL_MAGIC, 45, \ struct btrfs_ioctl_quota_rescan_args) #define BTRFS_IOC_QUOTA_RESCAN_WAIT _IO(BTRFS_IOCTL_MAGIC, 46) -#define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \ - char[BTRFS_LABEL_SIZE]) -#define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \ - char[BTRFS_LABEL_SIZE]) +#define BTRFS_IOC_GET_FSLABEL FS_IOC_GETFSLABEL +#define BTRFS_IOC_SET_FSLABEL FS_IOC_SETFSLABEL #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 34d5b34286facd92b92d896f4a87905d4f86b0ff..b65c7ee75bc7e996c644b58f836438f6791ab94b 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -300,7 +300,9 @@ #define BTRFS_CSUM_SIZE 32 /* csum types */ -#define BTRFS_CSUM_TYPE_CRC32 0 +enum btrfs_csum_type { + BTRFS_CSUM_TYPE_CRC32 = 0, +}; /* * flags definitions for directory entry item type @@ -806,11 +808,6 @@ struct btrfs_dev_stats_item { #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 -#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 struct btrfs_dev_replace_item { /*