Commit 6dec9f40 authored by Linus Torvalds

Merge tag 'for-5.9-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "We don't have any big feature updates this time, there are lots of
  small enhacements or fixes. A highlight perhaps is the parallel fsync
  performance improvements, numbers below.

  Regarding the dio/iomap that was reverted last time, the required API
  changes are likely to land in the upcoming cycle; the btrfs part will
  be updated afterwards.

  User visible changes:

   - new mount option rescue= to group all recovery-related mount
     options so we don't have many specific options; currently this
     only introduces aliases for existing options, and future
     extensions are in development to allow read-only mount with
     partially damaged structures (a short mount(2) sketch follows
     this list):
      - usebackuproot is an alias for rescue=usebackuproot
      - nologreplay is an alias for rescue=nologreplay

   - start deprecation of mount option inode_cache, removal scheduled
     for v5.11

   - removed deprecated mount options alloc_start and subvolrootid

   - device stats corruption counter gets incremented when a checksum
     mismatch is found

   - qgroup information exported in /sys/fs/btrfs/<UUID>/qgroups/<id>
     using sysfs

   - add link /sys/fs/btrfs/<UUID>/bdi pointing to the associated
     backing dev info

   - FS_INFO ioctl enhancements:
      - add flags to request/describe newly added items
      - new item: numeric checksum type and checksum size
      - new item: generation
      - new item: metadata_uuid

   - seed device: with one new read-write device added, print the new
     device information in /proc/mounts

   - balance: detect cancellation by Ctrl-C in existing cancellation
     points
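
   A short usage sketch of the rescue= aliases mentioned above, calling
   mount(2) directly from user space; the device and mount point paths
   are placeholders, and it assumes both aliases can be combined in one
   option string:

      /* Sketch: mount a btrfs filesystem with the new rescue= aliases. */
      #include <stdio.h>
      #include <sys/mount.h>

      int main(void)
      {
              /* rescue=nologreplay requires a read-only mount */
              if (mount("/dev/sdb", "/mnt/recovery", "btrfs", MS_RDONLY,
                        "rescue=usebackuproot,rescue=nologreplay")) {
                      perror("mount");
                      return 1;
              }
              return 0;
      }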

  Performance improvements:

   - optimized versions of various helpers on little-endian
     architectures, where we don't have to do LE/BE conversion from
     on-disk format

   - tree-log/fsync optimizations leading to lower max latency reported
     by dbench, reduced by about 12%

   - all chunk tree leaves are prefetched at mount time, which can
     improve mount time on large (terabyte-sized) filesystems

   - speed up parallel fsync of files with reflinked/deduped extents:
     with 16 to 1024 jobs the throughput improves by roughly 50% on
     average and runtime decreases by roughly 30% on average; a notable
     outlier is 128 jobs with +121.2% throughput and -54.6% runtime

   - another speedup of parallel fsync, reducing the number of checksum
     tree lookups and contention; the improvements start to show up at
     2 tasks with +20% throughput and -16% runtime, up to 64 tasks with
     +200% throughput and -66% runtime (an illustrative workload sketch
     follows this list)
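
   The two fsync items above target workloads that fsync many reflinked
   files concurrently. A rough sketch of such a workload (not the tool
   used for the quoted numbers); the paths, file count and thread count
   are placeholders, FICLONE creates the reflinks, build with -pthread:

      #define _GNU_SOURCE
      #include <fcntl.h>
      #include <linux/fs.h>          /* FICLONE */
      #include <pthread.h>
      #include <stdio.h>
      #include <sys/ioctl.h>
      #include <unistd.h>

      #define NFILES 16

      static void *fsync_worker(void *arg)
      {
              int fd = open(arg, O_WRONLY);

              if (fd >= 0) {
                      /* dirty the file, then force it to stable storage */
                      if (write(fd, "x", 1) == 1)
                              fsync(fd);
                      close(fd);
              }
              return NULL;
      }

      int main(void)
      {
              pthread_t threads[NFILES];
              char *paths[NFILES];
              int src = open("/mnt/btrfs/src", O_RDONLY);
              int i;

              if (src < 0) {
                      perror("open src");
                      return 1;
              }
              for (i = 0; i < NFILES; i++) {
                      int fd;

                      if (asprintf(&paths[i], "/mnt/btrfs/copy.%d", i) < 0)
                              return 1;
                      /* reflink the source into a fresh file */
                      fd = open(paths[i], O_WRONLY | O_CREAT | O_TRUNC, 0644);
                      if (fd < 0 || ioctl(fd, FICLONE, src)) {
                              perror("reflink");
                              return 1;
                      }
                      close(fd);
              }
              for (i = 0; i < NFILES; i++)
                      pthread_create(&threads[i], NULL, fsync_worker, paths[i]);
              for (i = 0; i < NFILES; i++)
                      pthread_join(threads[i], NULL);
              close(src);
              return 0;
      }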

  Core:

   - umount-time qgroup leak checker

   - qgroups
      - add a way to unreserve partial range after failure, avoiding
        some EDQUOT errors
      - improved flushing logic when EDQUOT is hit

   - possible EINTR interruption caused by failed reservations after
     transaction start is better handled and documented

   - transaction abort errors are unified to EROFS in case it's not the
     original reason for the abort or there is no other way to
     determine the reason

  Fixes:

   - make truncate succeed on a NOCOW file even if data space is
     exhausted

   - fix cancelling balance on filesystem with exhausted metadata space

   - anon block device:
      - preallocate anon bdev when subvolume is created to report
        failure early
      - shorten time the anon bdev id is allocated
      - don't allocate anon bdev for internal roots

   - minor memory leak in ref-verify

   - refuse invalid combinations of compression and NOCOW file flags
     (a small ioctl sketch follows this list)

   - lockdep fixes, updating the device locks

   - remove obsolete fallback logic for block group profile adjustments
     when switching from 1 to more devices, which caused allocation of
     unwanted block groups
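
   A small sketch of the rejected flag combination: on a kernel with
   this fix, setting both NOCOW and compression on a file through
   FS_IOC_SETFLAGS is expected to fail (the path is a placeholder and
   the exact errno is not spelled out here):

      #include <fcntl.h>
      #include <linux/fs.h>     /* FS_IOC_*FLAGS, FS_NOCOW_FL, FS_COMPR_FL */
      #include <stdio.h>
      #include <sys/ioctl.h>
      #include <unistd.h>

      int main(void)
      {
              int flags = 0;
              int fd = open("/mnt/btrfs/file", O_RDONLY);

              if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
                      perror("open/getflags");
                      return 1;
              }
              flags |= FS_NOCOW_FL | FS_COMPR_FL;     /* invalid combination */
              if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
                      perror("FS_IOC_SETFLAGS");      /* refused by the fix */
              close(fd);
              return 0;
      }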

  Other cleanups, refactoring, simplifications:

   - conversions from struct inode to struct btrfs_inode in internal
     functions

   - removal of unused struct members"

* tag 'for-5.9-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (151 commits)
  btrfs: do not set the full sync flag on the inode during page release
  btrfs: release old extent maps during page release
  btrfs: fix race between page release and a fast fsync
  btrfs: open-code remount flag setting in btrfs_remount
  btrfs: if we're restriping, use the target restripe profile
  btrfs: don't adjust bg flags and use default allocation profiles
  btrfs: fix lockdep splat from btrfs_dump_space_info
  btrfs: move the chunk_mutex in btrfs_read_chunk_tree
  btrfs: open device without device_list_mutex
  btrfs: sysfs: use NOFS for device creation
  btrfs: return EROFS for BTRFS_FS_STATE_ERROR cases
  btrfs: document special case error codes for fs errors
  btrfs: don't WARN if we abort a transaction with EROFS
  btrfs: reduce contention on log trees when logging checksums
  btrfs: remove done label in writepage_delalloc
  btrfs: add comments for btrfs_reserve_flush_enum
  btrfs: relocation: review the call sites which can be interrupted by signal
  btrfs: avoid possible signal interruption of btrfs_drop_snapshot() on relocation tree
  btrfs: relocation: allow signal to cancel balance
  btrfs: raid56: remove out label in __raid56_parity_recover
  ...
......@@ -65,11 +65,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags);
if (target) {
/* Pick target profile only if it's already available */
if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
spin_unlock(&fs_info->balance_lock);
return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
......@@ -118,12 +115,12 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
atomic_inc(&cache->count);
refcount_inc(&cache->refs);
}
void btrfs_put_block_group(struct btrfs_block_group *cache)
{
if (atomic_dec_and_test(&cache->count)) {
if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
......@@ -1111,7 +1108,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&block_group->lock);
block_group->removed = 1;
/*
......@@ -1143,8 +1139,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
remove_em = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
mutex_unlock(&fs_info->chunk_mutex);
if (remove_em) {
struct extent_map_tree *em_tree;
......@@ -1532,21 +1526,70 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
u64 flags;
int ret = 0;
slot = path->slots[0];
leaf = path->nodes[0];
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}
if (em->start != key->objectid || em->len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, em->start, em->len);
ret = -EUCLEAN;
goto out_free_em;
}
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
sizeof(bg));
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
(BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
ret = -EUCLEAN;
}
out_free_em:
free_extent_map(em);
return ret;
}
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
struct btrfs_root *root = fs_info->extent_root;
int ret = 0;
int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
struct btrfs_block_group_item bg;
u64 flags;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
goto out;
return ret;
while (1) {
slot = path->slots[0];
......@@ -1563,49 +1606,10 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
struct extent_map_tree *em_tree;
struct extent_map *em;
em_tree = &root->fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, found_key.objectid,
found_key.offset);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
found_key.objectid, found_key.offset);
ret = -ENOENT;
} else if (em->start != found_key.objectid ||
em->len != found_key.offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
found_key.objectid, found_key.offset,
em->start, em->len);
ret = -EUCLEAN;
} else {
read_extent_buffer(leaf, &bg,
btrfs_item_ptr_offset(leaf, slot),
sizeof(bg));
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
if (flags != (em->map_lookup->type &
BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
found_key.objectid,
found_key.offset, flags,
(BTRFS_BLOCK_GROUP_TYPE_MASK &
em->map_lookup->type));
ret = -EUCLEAN;
} else {
ret = 0;
}
}
free_extent_map(em);
goto out;
ret = read_bg_from_eb(fs_info, &found_key, path);
break;
}
path->slots[0]++;
}
out:
......@@ -1657,19 +1661,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
return -EIO;
map = em->map_lookup;
data_stripe_length = em->len;
data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
data_stripe_length = div_u64(data_stripe_length,
map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
data_stripe_length = div_u64(data_stripe_length,
nr_data_stripes(map));
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
io_stripe_size = map->stripe_len * nr_data_stripes(map);
}
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
......@@ -1748,25 +1745,12 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return ret;
while (nr--) {
u64 start, len;
if (logical[nr] > cache->start + cache->length)
continue;
if (logical[nr] + stripe_len <= cache->start)
continue;
start = logical[nr];
if (start < cache->start) {
start = cache->start;
len = (logical[nr] + stripe_len) - start;
} else {
len = min_t(u64, stripe_len,
cache->start + cache->length - start);
}
u64 len = min_t(u64, stripe_len,
cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
ret = btrfs_add_excluded_extent(fs_info, start, len);
ret = btrfs_add_excluded_extent(fs_info, logical[nr],
len);
if (ret) {
kfree(logical);
return ret;
......@@ -1818,7 +1802,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
atomic_set(&cache->count, 1);
refcount_set(&cache->refs, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
......@@ -2207,54 +2191,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return 0;
}
static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 num_devices;
u64 stripped;
/*
* if restripe for this chunk_type is on pick target profile and
* return, otherwise do the usual balance
*/
stripped = get_restripe_target(fs_info, flags);
if (stripped)
return extended_to_chunk(stripped);
num_devices = fs_info->fs_devices->rw_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
if (num_devices == 1) {
stripped |= BTRFS_BLOCK_GROUP_DUP;
stripped = flags & ~stripped;
/* turn raid0 into single device chunks */
if (flags & BTRFS_BLOCK_GROUP_RAID0)
return stripped;
/* turn mirroring into duplication */
if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID10))
return stripped | BTRFS_BLOCK_GROUP_DUP;
} else {
/* they already had raid on here, just return */
if (flags & stripped)
return flags;
stripped |= BTRFS_BLOCK_GROUP_DUP;
stripped = flags & ~stripped;
/* switch duplicated blocks with raid1 */
if (flags & BTRFS_BLOCK_GROUP_DUP)
return stripped | BTRFS_BLOCK_GROUP_RAID1;
/* this is drive concat, leave it alone */
}
return flags;
}
/*
* Mark one block group RO, can be called several times for the same block
* group.
......@@ -2300,7 +2236,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
* If we are changing raid levels, try to allocate a
* corresponding block group with the new raid level.
*/
alloc_flags = update_block_group_flags(fs_info, cache->flags);
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
ret = btrfs_chunk_alloc(trans, alloc_flags,
CHUNK_ALLOC_FORCE);
......@@ -2327,7 +2263,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = update_block_group_flags(fs_info, cache->flags);
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
......@@ -2521,7 +2457,8 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
num_pages *= 16;
num_pages *= PAGE_SIZE;
ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
num_pages);
if (ret)
goto out_put;
......@@ -3392,7 +3329,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->dirty_list));
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
ASSERT(atomic_read(&block_group->count) == 1);
ASSERT(refcount_read(&block_group->refs) == 1);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
......@@ -3447,7 +3384,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
mutex_lock(&fs_info->chunk_mutex);
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, block_group->start,
......@@ -3455,7 +3391,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
BUG_ON(!em); /* logic error, can't happen */
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
mutex_unlock(&fs_info->chunk_mutex);
/* once for us and once for the tree */
free_extent_map(em);
......
......@@ -114,8 +114,7 @@ struct btrfs_block_group {
/* For block groups in the same raid type */
struct list_head list;
/* Usage count */
atomic_t count;
refcount_t refs;
/*
* List of struct btrfs_free_clusters for this block group.
......
......@@ -151,6 +151,17 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
/*
* The id/generation of the last transaction where this inode was
* either the source or the destination of a clone/dedupe operation.
* Used when logging an inode to know if there are shared extents that
* need special care when logging checksum items, to avoid duplicate
* checksum items in a log (which can lead to a corruption where we end
* up with missing checksum ranges after log replay).
* Protected by the vfs inode lock.
*/
u64 last_reflink_trans;
/*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
......
......@@ -631,10 +631,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int pass;
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
pr_info("btrfsic: error, kmalloc failed!\n");
if (!selected_super)
return -ENOMEM;
}
list_for_each_entry(device, dev_head, dev_list) {
int i;
......@@ -795,7 +793,6 @@ static int btrfsic_process_superblock_dev_mirror(
if (NULL == superblock_tmp) {
superblock_tmp = btrfsic_block_alloc();
if (NULL == superblock_tmp) {
pr_info("btrfsic: error, kmalloc failed!\n");
ret = -1;
goto out;
}
......@@ -921,9 +918,7 @@ static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
struct btrfsic_stack_frame *sf;
sf = kzalloc(sizeof(*sf), GFP_NOFS);
if (NULL == sf)
pr_info("btrfsic: alloc memory failed!\n");
else
if (sf)
sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
return sf;
}
......@@ -1313,7 +1308,6 @@ static int btrfsic_create_link_to_next_block(
if (NULL == l) {
l = btrfsic_block_link_alloc();
if (NULL == l) {
pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(next_block_ctx);
*next_blockp = NULL;
return -1;
......@@ -1470,7 +1464,6 @@ static int btrfsic_handle_extent_data(
mirror_num,
&block_was_created);
if (NULL == next_block) {
pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&next_block_ctx);
return -1;
}
......@@ -2013,7 +2006,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
block = btrfsic_block_alloc();
if (NULL == block) {
pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&block_ctx);
goto continue_loop;
}
......@@ -2234,7 +2226,6 @@ static int btrfsic_process_written_superblock(
mirror_num,
&was_created);
if (NULL == next_block) {
pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&tmp_next_block_ctx);
return -1;
}
......@@ -2542,10 +2533,8 @@ static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
&state->block_link_hashtable);
if (NULL == l) {
l = btrfsic_block_link_alloc();
if (NULL == l) {
pr_info("btrfsic: error, kmalloc failed!\n");
if (!l)
return NULL;
}
l->block_ref_to = next_block;
l->block_ref_from = from_block;
......@@ -2589,10 +2578,9 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
struct btrfsic_dev_state *dev_state;
block = btrfsic_block_alloc();
if (NULL == block) {
pr_info("btrfsic: error, kmalloc failed!\n");
if (!block)
return NULL;
}
dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
if (NULL == dev_state) {
pr_info("btrfsic: error, lookup dev_state failed!\n");
......@@ -2797,10 +2785,8 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
return -1;
}
state = kvzalloc(sizeof(*state), GFP_KERNEL);
if (!state) {
pr_info("btrfs check-integrity: allocation failed!\n");
if (!state)
return -ENOMEM;
}
if (!btrfsic_is_initialized) {
mutex_init(&btrfsic_mutex);
......@@ -2829,7 +2815,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
ds = btrfsic_dev_state_alloc();
if (NULL == ds) {
pr_info("btrfs check-integrity: kmalloc() failed!\n");
mutex_unlock(&btrfsic_mutex);
return -ENOMEM;
}
......
......@@ -172,18 +172,17 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}
static int check_compressed_csum(struct btrfs_inode *inode,
struct compressed_bio *cb,
static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
u64 disk_start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int ret;
struct page *page;
unsigned long i;
char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
if (inode->flags & BTRFS_INODE_NODATASUM)
......@@ -201,15 +200,15 @@ static int check_compressed_csum(struct btrfs_inode *inode,
if (memcmp(&csum, cb_sum, csum_size)) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
ret = -EIO;
goto fail;
if (btrfs_io_bio(bio)->device)
btrfs_dev_stat_inc_and_print(
btrfs_io_bio(bio)->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
return -EIO;
}
cb_sum += csum_size;
}
ret = 0;
fail:
return ret;
return 0;
}
/* when we finish reading compressed pages from the disk, we
......@@ -244,7 +243,6 @@ static void end_compressed_bio_read(struct bio *bio)
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
ASSERT(btrfs_io_bio(cb->orig_bio));
btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
......@@ -256,7 +254,7 @@ static void end_compressed_bio_read(struct bio *bio)
goto csum_failed;
inode = cb->inode;
ret = check_compressed_csum(BTRFS_I(inode), cb,
ret = check_compressed_csum(BTRFS_I(inode), bio,
(u64)bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
......@@ -405,7 +403,7 @@ static void end_compressed_bio_write(struct bio *bio)
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
......@@ -413,7 +411,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
unsigned long bytes_left;
......@@ -421,7 +419,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct page *page;
u64 first_byte = disk_start;
blk_status_t ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
......@@ -429,7 +427,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
......@@ -455,7 +453,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
int submit = 0;
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
page->mapping = inode->vfs_inode.i_mapping;
if (bio->bi_iter.bi_size)
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
......
......@@ -8,6 +8,8 @@
#include <linux/sizes.h>
struct btrfs_inode;
/*
* We want to make sure that amount of RAM required to uncompress an extent is
* reasonable, so we limit the total size in ram of a compressed extent to
......@@ -88,7 +90,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio *bio);
blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
struct page **compressed_pages,
......
......@@ -1501,6 +1501,22 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
return 0;
}
#ifdef __LITTLE_ENDIAN
/*
* Compare two keys, on little-endian the disk order is same as CPU order and
* we can avoid the conversion.
*/
static int comp_keys(const struct btrfs_disk_key *disk_key,
const struct btrfs_key *k2)
{
const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
return btrfs_comp_cpu_keys(k1, k2);
}
#else
/*
* compare two keys in a memcmp fashion
*/
......@@ -1513,6 +1529,7 @@ static int comp_keys(const struct btrfs_disk_key *disk,
return btrfs_comp_cpu_keys(&k1, k2);
}
#endif
/*
* same as comp_keys only with two btrfs_key's
......
......@@ -545,11 +545,6 @@ enum {
* (device replace, resize, device add/delete, balance)
*/
BTRFS_FS_EXCL_OP,
/*
* To info transaction_kthread we need an immediate commit so it
* doesn't need to wait for commit_interval
*/
BTRFS_FS_NEED_ASYNC_COMMIT,
/*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
......@@ -779,6 +774,7 @@ struct btrfs_fs_info {
u32 thread_pool_size;
struct kobject *space_info_kobj;
struct kobject *qgroups_kobj;
u64 total_pinned;
......@@ -1011,6 +1007,8 @@ enum {
BTRFS_ROOT_DEAD_TREE,
/* The root has a log tree. Used only for subvolume roots. */
BTRFS_ROOT_HAS_LOG_TREE,
/* Qgroup flushing is in progress */
BTRFS_ROOT_QGROUP_FLUSHING,
};
/*
......@@ -1059,8 +1057,10 @@ struct btrfs_root {
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
struct list_head log_ctxs[2];
/* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_writers;
atomic_t log_commit[2];
/* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
int log_transid;
/* No matter the commit succeeds or not*/
......@@ -1075,7 +1075,6 @@ struct btrfs_root {
u64 highest_objectid;
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
......@@ -1162,6 +1161,7 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */
atomic_t nr_swapfiles;
......@@ -1277,18 +1277,18 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
BTRFS_MOUNT_##opt)
#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
{ \
do { \
if (!btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_set_opt(fs_info->mount_opt, opt); \
}
} while (0)
#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
{ \
do { \
if (btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_clear_opt(fs_info->mount_opt, opt); \
}
} while (0)
/*
* Requests for changes that need to be done during transaction commit.
......@@ -1895,6 +1895,52 @@ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
#ifdef __LITTLE_ENDIAN
/*
* Optimized helpers for little-endian architectures where CPU and on-disk
* structures have the same endianness and we can skip conversions.
*/
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
const struct btrfs_disk_key *disk_key)
{
memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
}
static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
const struct btrfs_key *cpu_key)
{
memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
}
static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
struct btrfs_key *cpu_key, int nr)
{
struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
btrfs_node_key(eb, disk_key, nr);
}
static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
struct btrfs_key *cpu_key, int nr)
{
struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
btrfs_item_key(eb, disk_key, nr);
}
static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
const struct btrfs_dir_item *item,
struct btrfs_key *cpu_key)
{
struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
btrfs_dir_item_key(eb, item, disk_key);
}
#else
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
const struct btrfs_disk_key *disk)
{
......@@ -1936,6 +1982,8 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
btrfs_disk_key_to_cpu(key, &disk_key);
}
#endif
/* struct btrfs_header */
BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
......@@ -2232,7 +2280,8 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
}
/* struct btrfs_file_extent_item */
BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
struct btrfs_file_extent_item, disk_bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
......@@ -2241,6 +2290,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
struct btrfs_file_extent_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
struct btrfs_file_extent_item, num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
struct btrfs_file_extent_item, ram_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
struct btrfs_file_extent_item, disk_num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
......@@ -2257,6 +2308,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
......@@ -2508,16 +2560,46 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
/*
* Different levels for to flush space when doing space reservations.
*
* The higher the level, the more methods we try to reclaim space.
*/
enum btrfs_reserve_flush_enum {
/* If we are in the transaction, we can't flush anything.*/
BTRFS_RESERVE_NO_FLUSH,
/*
* Flushing delalloc may cause deadlock somewhere, in this
* case, use FLUSH LIMIT
* Flush space by:
* - Running delayed inode items
* - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_LIMIT,
/*
* Flush space by:
* - Running delayed inode items
* - Running delayed refs
* - Running delalloc and waiting for ordered extents
* - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_EVICT,
/*
* Flush space by above mentioned methods and by:
* - Running delayed iputs
* - Committing transaction
*
* Can be interrupted by fatal signal.
*/
BTRFS_RESERVE_FLUSH_ALL,
/*
* Pretty much the same as FLUSH_ALL, but can also steal space from
* global rsv.
*
* Can be interrupted by fatal signal.
*/
BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
......@@ -2831,8 +2913,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
u64 file_start, int contig);
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
......@@ -2875,7 +2957,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
......@@ -2928,7 +3010,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
......@@ -2962,7 +3044,7 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
int skip_pinned);
extern const struct file_operations btrfs_file_operations;
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
......@@ -2978,10 +3060,13 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct inode *inode, struct page **pages,
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
......@@ -3194,7 +3279,7 @@ do { \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
if ((errno) != -EIO) { \
if ((errno) != -EIO && (errno) != -EROFS) { \
WARN(1, KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
(errno)); \
......@@ -3378,7 +3463,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
......
......@@ -237,10 +237,10 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
return 0;
}
int btrfs_check_data_free_space(struct inode *inode,
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
/* align the range */
......@@ -248,14 +248,14 @@ int btrfs_check_data_free_space(struct inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
ret = btrfs_alloc_data_chunk_ondemand(inode, len);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
btrfs_free_reserved_data_space_noquota(fs_info, len);
else
ret = 0;
return ret;
......@@ -269,16 +269,12 @@ int btrfs_check_data_free_space(struct inode *inode,
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_space_info *data_sinfo;
/* Make sure the range is aligned to sectorsize */
len = round_up(start + len, fs_info->sectorsize) -
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
......@@ -293,17 +289,17 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
void btrfs_free_reserved_data_space(struct inode *inode,
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
/* Make sure the range is aligned to sectorsize */
len = round_up(start + len, root->fs_info->sectorsize) -
round_down(start, root->fs_info->sectorsize);
start = round_down(start, root->fs_info->sectorsize);
len = round_up(start + len, fs_info->sectorsize) -
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
btrfs_free_reserved_data_space_noquota(inode, start, len);
btrfs_free_reserved_data_space_noquota(fs_info, len);
btrfs_qgroup_free_data(inode, reserved, start, len);
}
......@@ -557,7 +553,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
* Return 0 for success
* Return <0 for error(-ENOSPC or -EQUOT)
*/
int btrfs_delalloc_reserve_space(struct inode *inode,
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
......@@ -565,7 +561,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
ret = btrfs_delalloc_reserve_metadata(inode, len);
if (ret < 0)
btrfs_free_reserved_data_space(inode, *reserved, start, len);
return ret;
......@@ -583,10 +579,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
void btrfs_delalloc_release_space(struct inode *inode,
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free)
{
btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
btrfs_delalloc_release_metadata(inode, len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
......@@ -6,18 +6,18 @@
struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct inode *inode,
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_free_reserved_data_space(struct inode *inode,
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode,
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct inode *inode,
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
#endif /* BTRFS_DELALLOC_SPACE_H */
......@@ -1116,6 +1116,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
mutex_init(&root->delalloc_mutex);
init_waitqueue_head(&root->qgroup_flush_wait);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
......@@ -1141,10 +1142,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
if (!dummy)
root->defrag_trans_start = fs_info->generation;
else
root->defrag_trans_start = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
......@@ -1395,7 +1392,12 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
goto out;
}
static int btrfs_init_fs_root(struct btrfs_root *root)
/*
* Initialize subvolume root in-memory structure
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
int ret;
unsigned int nofs_flag;
......@@ -1428,9 +1430,20 @@ static int btrfs_init_fs_root(struct btrfs_root *root)
spin_lock_init(&root->ino_cache_lock);
init_waitqueue_head(&root->ino_cache_wait);
ret = get_anon_bdev(&root->anon_dev);
if (ret)
goto fail;
/*
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
if (is_fstree(root->root_key.objectid) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
if (ret)
goto fail;
} else {
root->anon_dev = anon_dev;
}
}
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
......@@ -1534,8 +1547,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
}
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
/*
* Get an in-memory reference of a root structure.
*
* For essential trees like root/extent tree, we grab it from fs_info directly.
* For subvolume trees, we check the cached filesystem roots first. If not
* found, then read it from disk and add it to cached fs roots.
*
* Caller should release the root by calling btrfs_put_root() after the usage.
*
* NOTE: Reloc and log trees can't be read by this function as they share the
* same root objectid.
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
* pass 0 for new allocation.
* @check_ref: whether to check root item references, If true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev,
bool check_ref)
{
struct btrfs_root *root;
struct btrfs_path *path;
......@@ -1564,6 +1596,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
/* Shouldn't get preallocated anon_dev for cached roots */
ASSERT(!anon_dev);
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
......@@ -1583,7 +1617,7 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
goto fail;
}
ret = btrfs_init_fs_root(root);
ret = btrfs_init_fs_root(root, anon_dev);
if (ret)
goto fail;
......@@ -1616,6 +1650,33 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
return ERR_PTR(ret);
}
/*
* Get in-memory reference of a root structure
*
* @objectid: tree objectid
* @check_ref: if set, verify that the tree exists and the item has at least
* one reference
*/
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
}
/*
* Get in-memory reference of a root structure, created as new, optionally pass
* the anonymous block device id
*
* @objectid: tree objectid
* @anon_dev: if zero, allocate a new anonymous block device or use the
* parameter value
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev)
{
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
......@@ -1749,7 +1810,6 @@ static int transaction_kthread(void *arg)
now = ktime_get_seconds();
if (cur->state < TRANS_STATE_COMMIT_START &&
!test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
spin_unlock(&fs_info->trans_lock);
......@@ -2001,8 +2061,7 @@ void btrfs_put_root(struct btrfs_root *root)
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
free_root_extent_buffers(root);
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
#ifdef CONFIG_BTRFS_DEBUG
......@@ -4058,6 +4117,11 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
if (btrfs_check_quota_leak(fs_info)) {
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_err(fs_info, "qgroup reserved space leaked");
}
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
......
......@@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
......
......@@ -233,14 +233,11 @@ bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
struct extent_state **cached_state);
/* This should be reworked in the future and put elsewhere. */
int get_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record **failrec);
struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
int set_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record *failrec);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
struct io_failure_record **failrec_ret);
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
......
......@@ -5298,7 +5298,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out;
}
trans = btrfs_start_transaction(tree_root, 0);
/*
* Use join to avoid potential EINTR from transaction start. See
* wait_reserve_ticket and the whole reservation callchain.
*/
if (for_reloc)
trans = btrfs_join_transaction(tree_root);
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
......@@ -5466,6 +5473,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
}
/*
* This subvolume is going to be completely dropped, and won't be
* recorded as dirty roots, thus pertrans meta rsv will not be freed at
* commit transaction time. So free it here manually.
*/
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
else
......
......@@ -2018,15 +2018,14 @@ static int __process_pages_contig(struct address_space *mapping,
return err;
}
void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
{
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
NULL);
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
__process_pages_contig(inode->i_mapping, locked_page,
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
page_ops, NULL);
}
......@@ -2123,12 +2122,11 @@ int set_state_failrec(struct extent_io_tree *tree, u64 start,
return ret;
}
int get_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record **failrec)
struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
{
struct rb_node *node;
struct extent_state *state;
int ret = 0;
struct io_failure_record *failrec;
spin_lock(&tree->lock);
/*
......@@ -2137,18 +2135,19 @@ int get_state_failrec(struct extent_io_tree *tree, u64 start,
*/
node = tree_search(tree, start);
if (!node) {
ret = -ENOENT;
failrec = ERR_PTR(-ENOENT);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
if (state->start != start) {
ret = -ENOENT;
failrec = ERR_PTR(-ENOENT);
goto out;
}
*failrec = state->failrec;
failrec = state->failrec;
out:
spin_unlock(&tree->lock);
return ret;
return failrec;
}
/*
......@@ -2378,8 +2377,8 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
ret = get_state_failrec(failure_tree, start, &failrec);
if (ret)
failrec = get_state_failrec(failure_tree, start);
if (IS_ERR(failrec))
return 0;
BUG_ON(!failrec->this_mirror);
......@@ -2451,8 +2450,8 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
spin_unlock(&failure_tree->lock);
}
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
struct io_failure_record **failrec_ret)
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
......@@ -2463,65 +2462,8 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
int ret;
u64 logical;
ret = get_state_failrec(failure_tree, start, &failrec);
if (ret) {
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
return -ENOMEM;
failrec->start = start;
failrec->len = end - start + 1;
failrec->this_mirror = 0;
failrec->bio_flags = 0;
failrec->in_validation = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (!em) {
read_unlock(&em_tree->lock);
kfree(failrec);
return -EIO;
}
if (em->start > start || em->start + em->len <= start) {
free_extent_map(em);
em = NULL;
}
read_unlock(&em_tree->lock);
if (!em) {
kfree(failrec);
return -EIO;
}
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&failrec->bio_flags,
em->compress_type);
}
btrfs_debug(fs_info,
"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
logical, start, failrec->len);
failrec->logical = logical;
free_extent_map(em);
/* set the bits in the private failure tree */
ret = set_extent_bits(failure_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY);
if (ret >= 0)
ret = set_state_failrec(failure_tree, start, failrec);
/* set the bits in the inode's tree */
if (ret >= 0)
ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
if (ret < 0) {
kfree(failrec);
return ret;
}
} else {
failrec = get_state_failrec(failure_tree, start);
if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
failrec->logical, failrec->start, failrec->len,
......@@ -2531,11 +2473,66 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
return failrec;
}
*failrec_ret = failrec;
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
return ERR_PTR(-ENOMEM);
return 0;
failrec->start = start;
failrec->len = end - start + 1;
failrec->this_mirror = 0;
failrec->bio_flags = 0;
failrec->in_validation = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (!em) {
read_unlock(&em_tree->lock);
kfree(failrec);
return ERR_PTR(-EIO);
}
if (em->start > start || em->start + em->len <= start) {
free_extent_map(em);
em = NULL;
}
read_unlock(&em_tree->lock);
if (!em) {
kfree(failrec);
return ERR_PTR(-EIO);
}
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&failrec->bio_flags, em->compress_type);
}
btrfs_debug(fs_info,
"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
logical, start, failrec->len);
failrec->logical = logical;
free_extent_map(em);
/* Set the bits in the private failure tree */
ret = set_extent_bits(failure_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY);
if (ret >= 0) {
ret = set_state_failrec(failure_tree, start, failrec);
/* Set the bits in the inode's tree */
ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
} else if (ret < 0) {
kfree(failrec);
return ERR_PTR(ret);
}
return failrec;
}
static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
......@@ -2660,16 +2657,15 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
int ret;
btrfs_debug(fs_info,
"repair read error: read error at %llu", start);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
if (ret)
return errno_to_blk_status(ret);
failrec = btrfs_get_io_failure_record(inode, start, end);
if (IS_ERR(failrec))
return errno_to_blk_status(PTR_ERR(failrec));
need_validation = btrfs_io_needs_validation(inode, failed_bio);
......@@ -3420,7 +3416,7 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns 0 if all went well (page still locked)
* This returns < 0 if there were errors (page still locked)
*/
static noinline_for_stack int writepage_delalloc(struct inode *inode,
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
......@@ -3433,7 +3429,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
while (delalloc_end < page_end) {
found = find_lock_delalloc_range(inode, page,
found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
......@@ -3450,8 +3446,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
* started, so we don't want to return > 0 unless
* things are going well.
*/
ret = ret < 0 ? ret : -EIO;
goto done;
return ret < 0 ? ret : -EIO;
}
/*
* delalloc_end is already one less than the total length, so
......@@ -3483,10 +3478,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
return 1;
}
ret = 0;
done:
return ret;
return 0;
}
/*
......@@ -3497,7 +3489,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
static noinline_for_stack int __extent_writepage_io(struct inode *inode,
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd,
......@@ -3505,7 +3497,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
unsigned long nr_written,
int *nr_ret)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
......@@ -3537,7 +3529,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
update_nr_written(wbc, nr_written + 1);
end = page_end;
blocksize = inode->i_sb->s_blocksize;
blocksize = inode->vfs_inode.i_sb->s_blocksize;
while (cur <= end) {
u64 em_end;
......@@ -3548,8 +3540,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, 1);
break;
}
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
end - cur + 1);
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
ret = PTR_ERR_OR_ZERO(em);
......@@ -3586,7 +3577,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
......@@ -3659,15 +3650,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_extent_mapped(page);
if (!epd->extent_locked) {
ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
&nr_written);
if (ret == 1)
return 0;
if (ret)
goto done;
}
ret = __extent_writepage_io(inode, page, wbc, epd,
i_size, nr_written, &nr);
ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
nr_written, &nr);
if (ret == 1)
return 0;
......@@ -4127,7 +4119,7 @@ int btree_write_cache_pages(struct address_space *mapping,
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = flush_write_bio(&epd);
} else {
ret = -EUCLEAN;
ret = -EROFS;
end_write_bio(&epd, ret);
}
return ret;
......@@ -4489,6 +4481,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
struct btrfs_fs_info *fs_info;
u64 cur_gen;
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
......@@ -4502,15 +4497,45 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
if (!test_range_bit(tree, em->start,
extent_map_end(em) - 1,
EXTENT_LOCKED, 0, NULL)) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&btrfs_inode->runtime_flags);
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
}
if (test_range_bit(tree, em->start,
extent_map_end(em) - 1,
EXTENT_LOCKED, 0, NULL))
goto next;
/*
* If it's not in the list of modified extents, used
* by a fast fsync, we can remove it. If it's being
* logged we can safely remove it since fsync took an
* extra reference on the em.
*/
if (list_empty(&em->list) ||
test_bit(EXTENT_FLAG_LOGGING, &em->flags))
goto remove_em;
/*
* If it's in the list of modified extents, remove it
* only if its generation is older than the current one,
* in which case we don't need it for a fast fsync.
* Otherwise don't remove it, we could be racing with an
* ongoing fast fsync that could miss the new extent.
*/
fs_info = btrfs_inode->root->fs_info;
spin_lock(&fs_info->trans_lock);
cur_gen = fs_info->generation;
spin_unlock(&fs_info->trans_lock);
if (em->generation >= cur_gen)
goto next;
remove_em:
/*
* We only remove extent maps that are not in the list of
* modified extents or that are in the list but with a
* generation lower than the current generation, so there
* is no need to set the full fsync flag on the inode (it
* hurts the fsync performance for workloads with a data
* size that exceeds or is close to the system's memory).
*/
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
next:
start = extent_map_end(em);
write_unlock(&map->lock);
......@@ -4670,7 +4695,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
}
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
u64 start, u64 len)
{
int ret = 0;
u64 off = start;
......
......@@ -204,7 +204,7 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
......@@ -277,7 +277,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
......
......@@ -522,10 +522,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
* means this bio can contain potentially discontiguous bio vecs
* so the logical offset of each should be calculated separately.
*/
blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 file_start, int contig)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
......
......@@ -500,18 +500,18 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
int btrfs_dirty_pages(struct inode *inode, struct page **pages,
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
loff_t isize = i_size_read(inode);
loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
start_pos = pos & ~((u64) fs_info->sectorsize - 1);
......@@ -524,13 +524,13 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, cached);
if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
if (!btrfs_is_free_space_inode(inode)) {
if (start_pos >= isize &&
!(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
!(inode->flags & BTRFS_INODE_PREALLOC)) {
/*
* There can't be any extents following eof in this case
* so just set the delalloc new bit for the range
......@@ -538,8 +538,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
*/
extra_bits |= EXTENT_DELALLOC_NEW;
} else {
err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
start_pos,
err = btrfs_find_new_delalloc_bytes(inode, start_pos,
num_bytes, cached);
if (err)
return err;
......@@ -564,7 +563,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* at this time.
*/
if (end_pos > isize)
i_size_write(inode, end_pos);
i_size_write(&inode->vfs_inode, end_pos);
return 0;
}
......@@ -731,7 +730,7 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
* is deleted from the tree.
*/
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
......@@ -744,7 +743,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(BTRFS_I(inode));
struct inode *vfs_inode = &inode->vfs_inode;
u64 ino = btrfs_ino(inode);
u64 search_start = start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
......@@ -762,9 +762,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int leafs_visited = 0;
if (drop_cache)
btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
btrfs_drop_extent_cache(inode, start, end - 1, 0);
if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
if (start >= inode->disk_i_size && !replace_extent)
modify_tree = 0;
update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
......@@ -935,7 +935,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, end - key.offset);
inode_sub_bytes(vfs_inode, end - key.offset);
break;
}
......@@ -955,7 +955,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, extent_end - start);
inode_sub_bytes(vfs_inode, extent_end - start);
if (end == extent_end)
break;
......@@ -979,7 +979,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
inode_sub_bytes(inode,
inode_sub_bytes(vfs_inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
......@@ -993,7 +993,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
key.offset - extent_offset);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
inode_sub_bytes(vfs_inode,
extent_end - key.offset);
}
......@@ -1082,8 +1082,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
drop_cache, 0, 0, NULL);
ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
end, NULL, drop_cache, 0, 0, NULL);
btrfs_free_path(path);
return ret;
}
......@@ -1532,8 +1532,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait)
static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
......@@ -1541,6 +1541,9 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
......@@ -1583,6 +1586,42 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
return ret;
}
static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
{
return check_can_nocow(inode, pos, write_bytes, true);
}
/*
* Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
*
* @pos: File offset
* @write_bytes: The length to write, will be updated to the nocow writeable
* range
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
*
* Return:
* >0 and update @write_bytes if we can do nocow write
* 0 if we can't do nocow write
* -EAGAIN if we can't get the needed lock or there are ordered extents
* for the (nowait == true) case
* <0 if other error happened
*
* NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
*/
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
{
return check_can_nocow(inode, pos, write_bytes, false);
}
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
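/*
 * Illustrative caller sketch only, not part of this change: the contract
 * documented above is that >0 means a nocow write is possible and the lock
 * is held. The helper name below is hypothetical; the buffered write path
 * later in this diff follows the same pattern.
 */
static inline bool can_do_nocow_write(struct btrfs_inode *inode, loff_t pos,
				      size_t *write_bytes)
{
	if (btrfs_check_nocow_lock(inode, pos, write_bytes) > 0)
		return true;	/* release later with btrfs_check_nocow_unlock() */

	return false;		/* 0 or <0: reserve data space and COW instead */
}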
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
......@@ -1590,7 +1629,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
......@@ -1643,13 +1681,12 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
fs_info->sectorsize);
extent_changeset_release(data_reserved);
ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&data_reserved, pos,
write_bytes);
if (ret < 0) {
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) &&
check_can_nocow(BTRFS_I(inode), pos,
&write_bytes, false) > 0) {
if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
&write_bytes) > 0) {
/*
* For nodata cow case, no need to reserve
* data space.
......@@ -1674,11 +1711,11 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
reserve_bytes);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode,
btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, pos,
write_bytes);
else
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_check_nocow_unlock(BTRFS_I(inode));
break;
}
......@@ -1748,7 +1785,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(inode,
btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos,
release_bytes, true);
}
......@@ -1758,8 +1795,9 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
fs_info->sectorsize);
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, &cached_state);
ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
dirty_pages, pos, copied,
&cached_state);
/*
* If we have not locked the extent range, because the range's
......@@ -1782,7 +1820,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
release_bytes = 0;
if (only_release_metadata)
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_check_nocow_unlock(BTRFS_I(inode));
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
......@@ -1800,8 +1838,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
btrfs_btree_balance_dirty(fs_info);
pos += copied;
num_written += copied;
......@@ -1811,11 +1847,12 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if (release_bytes) {
if (only_release_metadata) {
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
btrfs_delalloc_release_space(inode, data_reserved,
btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved,
round_down(pos, fs_info->sectorsize),
release_bytes, true);
}
......@@ -1926,10 +1963,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* We will allocate space in case nodatacow is not set,
* so bail
*/
if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) ||
check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes,
true) <= 0) {
if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
<= 0) {
inode_unlock(inode);
return -EAGAIN;
}
......@@ -2598,7 +2633,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
cur_offset = start;
while (cur_offset < end) {
ret = __btrfs_drop_extents(trans, root, inode, path,
ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
cur_offset, end + 1, &drop_end,
1, 0, 0, NULL);
if (ret != -ENOSPC) {
......@@ -3176,14 +3211,14 @@ static int btrfs_zero_range(struct inode *inode,
if (ret < 0)
goto out;
space_reserved = true;
ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
alloc_start, bytes_to_reserve);
if (ret)
goto out;
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out;
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret)
goto out;
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
......@@ -3199,7 +3234,7 @@ static int btrfs_zero_range(struct inode *inode,
ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
if (ret && space_reserved)
btrfs_free_reserved_data_space(inode, data_reserved,
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
alloc_start, bytes_to_reserve);
extent_changeset_free(data_reserved);
......@@ -3350,8 +3385,9 @@ static long btrfs_fallocate(struct file *file, int mode,
free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
cur_offset, last_byte - cur_offset);
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
&data_reserved, cur_offset,
last_byte - cur_offset);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
......@@ -3363,8 +3399,9 @@ static long btrfs_fallocate(struct file *file, int mode,
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
btrfs_free_reserved_data_space(inode, data_reserved,
cur_offset, last_byte - cur_offset);
btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, cur_offset,
last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
......@@ -3381,7 +3418,7 @@ static long btrfs_fallocate(struct file *file, int mode,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
btrfs_free_reserved_data_space(inode,
btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, range->start,
range->len);
list_del(&range->list);
......@@ -3402,7 +3439,7 @@ static long btrfs_fallocate(struct file *file, int mode,
inode_unlock(inode);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(inode, data_reserved,
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
......
......@@ -1334,8 +1334,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
i_size_read(inode), &cached_state);
ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
io_ctl->num_pages, 0, i_size_read(inode),
&cached_state);
if (ret)
goto out_nospc;
......@@ -2703,8 +2704,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
* pointed to by the cluster, someone else raced in and freed the
* cluster already. In that case, we just return without changing anything
*/
static int
__btrfs_return_cluster_to_free_space(
static void __btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
......@@ -2756,7 +2756,6 @@ __btrfs_return_cluster_to_free_space(
out:
spin_unlock(&cluster->lock);
btrfs_put_block_group(block_group);
return 0;
}
static void __btrfs_remove_free_space_cache_locked(
......@@ -2907,12 +2906,11 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
* Otherwise, it'll get a reference on the block group pointed to by the
* cluster and remove the cluster from it.
*/
int btrfs_return_cluster_to_free_space(
void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
int ret;
/* first, get a safe pointer to the block group */
spin_lock(&cluster->lock);
......@@ -2920,28 +2918,27 @@ int btrfs_return_cluster_to_free_space(
block_group = cluster->block_group;
if (!block_group) {
spin_unlock(&cluster->lock);
return 0;
return;
}
} else if (cluster->block_group != block_group) {
/* someone else has already freed it don't redo their work */
spin_unlock(&cluster->lock);
return 0;
return;
}
atomic_inc(&block_group->count);
btrfs_get_block_group(block_group);
spin_unlock(&cluster->lock);
ctl = block_group->free_space_ctl;
/* now return any extents the cluster had on it */
spin_lock(&ctl->tree_lock);
ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
__btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&ctl->tree_lock);
btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
/* finally drop our ref */
btrfs_put_block_group(block_group);
return ret;
}
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
......@@ -3358,7 +3355,7 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
list_del_init(&entry->list);
if (!ret) {
atomic_inc(&block_group->count);
btrfs_get_block_group(block_group);
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
......
......@@ -136,7 +136,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size);
int btrfs_return_cluster_to_free_space(
void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
......
......@@ -495,7 +495,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_SIZE;
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0,
prealloc);
if (ret)
goto out_put;
......
This diff has been collapsed.
......@@ -164,8 +164,11 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
return 0;
}
/* Check if @flags are a supported and valid set of FS_*_FL flags */
static int check_fsflags(unsigned int flags)
/*
* Check if @flags are a supported and valid set of FS_*_FL flags and that
* the old and new flags are not conflicting
*/
static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
......@@ -174,9 +177,19 @@ static int check_fsflags(unsigned int flags)
FS_NOCOW_FL))
return -EOPNOTSUPP;
/* COMPR and NOCOMP on new/old are valid */
if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
return -EINVAL;
if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
return -EINVAL;
/* NOCOW and compression options are mutually exclusive */
if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
return -EINVAL;
if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
return -EINVAL;
return 0;
}
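/*
 * Minimal restatement of the new conflict rule above (sketch, the helper
 * name is illustrative only): a file that currently has NOCOW set cannot
 * gain compression flags, and a file with compression flags cannot gain
 * NOCOW.
 */
static inline bool fsflags_nocow_compression_conflict(unsigned int old_flags,
						      unsigned int flags)
{
	const unsigned int comp = FS_COMPR_FL | FS_NOCOMP_FL;

	return ((old_flags & FS_NOCOW_FL) && (flags & comp)) ||
	       ((flags & FS_NOCOW_FL) && (old_flags & comp));
}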
......@@ -190,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
u32 binode_flags = binode->flags;
u32 binode_flags;
if (!inode_owner_or_capable(inode))
return -EPERM;
......@@ -201,22 +214,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
return -EFAULT;
ret = check_fsflags(fsflags);
if (ret)
return ret;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(inode);
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
if (ret)
goto out_unlock;
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
goto out_unlock;
binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
else
......@@ -566,6 +580,7 @@ static noinline int create_subvol(struct inode *dir,
struct inode *inode;
int ret;
int err;
dev_t anon_dev = 0;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
......@@ -578,6 +593,10 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail_free;
ret = get_anon_bdev(&anon_dev);
if (ret < 0)
goto fail_free;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assumes subvolume qgroup's level to be 0.
......@@ -660,12 +679,15 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
key.offset = (u64)-1;
new_root = btrfs_get_fs_root(fs_info, objectid, true);
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
}
/* Freeing will be done in btrfs_put_root() of new_root */
anon_dev = 0;
btrfs_record_root_in_trans(trans, new_root);
......@@ -735,6 +757,8 @@ static noinline int create_subvol(struct inode *dir,
return ret;
fail_free:
if (anon_dev)
free_anon_bdev(anon_dev);
kfree(root_item);
return ret;
}
......@@ -762,6 +786,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!pending_snapshot)
return -ENOMEM;
ret = get_anon_bdev(&pending_snapshot->anon_dev);
if (ret < 0)
goto free_pending;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path();
......@@ -823,10 +850,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
d_instantiate(dentry, inode);
ret = 0;
pending_snapshot->anon_dev = 0;
fail:
/* Prevent double freeing of anon_dev */
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot);
......@@ -1243,7 +1276,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
if (ret)
......@@ -1265,7 +1298,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
while (1) {
lock_extent_bits(tree, page_start, page_end,
&cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
page_start);
unlock_extent_cached(tree, page_start, page_end,
&cached_state);
......@@ -1333,7 +1366,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
(page_cnt - i_done) << PAGE_SHIFT, true);
}
......@@ -1361,7 +1394,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
unlock_page(pages[i]);
put_page(pages[i]);
}
btrfs_delalloc_release_space(inode, data_reserved,
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
......@@ -3198,11 +3231,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 flags_in;
int ret = 0;
fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
if (!fi_args)
return -ENOMEM;
fi_args = memdup_user(arg, sizeof(*fi_args));
if (IS_ERR(fi_args))
return PTR_ERR(fi_args);
flags_in = fi_args->flags;
memset(fi_args, 0, sizeof(*fi_args));
rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
......@@ -3218,6 +3255,23 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
}
if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
fi_args->generation = fs_info->generation;
fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
}
if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
sizeof(fi_args->metadata_uuid));
fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
}
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
......
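The extended FS_INFO ioctl above is opt-in from user space: the caller passes the request flags in, and the kernel sets a flag back only for the items it actually filled. A minimal user-space sketch, assuming the uapi definitions from <linux/btrfs.h> (the flag and field names are the ones visible in the hunk above):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int print_fs_info(const char *mnt)
{
	struct btrfs_ioctl_fs_info_args args;
	int fd = open(mnt, O_RDONLY);

	if (fd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	/* Request the newly added items; flags we don't set are left alone. */
	args.flags = BTRFS_FS_INFO_FLAG_CSUM_INFO |
		     BTRFS_FS_INFO_FLAG_GENERATION |
		     BTRFS_FS_INFO_FLAG_METADATA_UUID;

	if (ioctl(fd, BTRFS_IOC_FS_INFO, &args) < 0) {
		close(fd);
		return -1;
	}
	close(fd);

	/* Only trust fields whose flag the kernel echoed back. */
	if (args.flags & BTRFS_FS_INFO_FLAG_CSUM_INFO)
		printf("csum type %u, csum size %u\n",
		       (unsigned int)args.csum_type, (unsigned int)args.csum_size);
	if (args.flags & BTRFS_FS_INFO_FLAG_GENERATION)
		printf("generation %llu\n", (unsigned long long)args.generation);

	return 0;
}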
......@@ -15,6 +15,7 @@
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
......@@ -152,23 +153,39 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
/* allocate and add a new ordered_extent into the per-inode tree.
/*
* Allocate and add a new ordered_extent into the per-inode tree.
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type, int dio,
int compress_type)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
return ret;
ret = 0;
} else {
/*
* The ordered extent has reserved qgroup space, release now
* and pass the reserved number for qgroup_record to free.
*/
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
if (ret < 0)
return ret;
}
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
......@@ -178,9 +195,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->num_bytes = num_bytes;
entry->disk_num_bytes = disk_num_bytes;
entry->bytes_left = num_bytes;
entry->inode = igrab(inode);
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
......@@ -197,10 +215,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->trans_list);
trace_btrfs_ordered_extent_add(inode, entry);
trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
......@@ -228,14 +244,14 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
* that work has been done at higher layers, so this is truly the
* smallest the extent is going to get.
*/
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);
return 0;
}
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type)
{
......@@ -244,7 +260,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type)
{
......@@ -253,7 +269,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type)
......@@ -291,12 +307,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
* file_offset is updated to one byte past the range that is recorded as
* complete. This allows you to walk forward in the file.
*/
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
......@@ -305,7 +321,6 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
u64 dec_start;
u64 to_dec;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
......@@ -429,8 +444,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->log_list));
ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
......@@ -698,14 +711,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* find an ordered extent corresponding to file_offset. return NULL if
* nothing is found, otherwise take a reference on the extent and return it
*/
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
......@@ -803,7 +816,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
ordered = btrfs_lookup_ordered_extent(inode, offset);
ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
if (!ordered)
return 0;
......
......@@ -92,6 +92,9 @@ struct btrfs_ordered_extent {
/* compression algorithm */
int compress_type;
/* Qgroup reserved space */
int qgroup_rsv;
/* reference count */
refcount_t refs;
......@@ -101,12 +104,6 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
/* If we need to wait on this to be done */
struct list_head log_list;
/* If the transaction needs to wait on this ordered extent */
struct list_head trans_list;
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
......@@ -150,23 +147,23 @@ void btrfs_remove_ordered_extent(struct inode *inode,
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size,
int uptodate);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type);
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait);
......
......@@ -11,7 +11,6 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sizes.h>
#include "ctree.h"
#include "transaction.h"
......@@ -22,6 +21,7 @@
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
......@@ -220,10 +220,12 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup;
}
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
list_del(&qgroup->dirty);
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
......@@ -252,7 +254,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
__del_qgroup_rb(qgroup);
__del_qgroup_rb(fs_info, qgroup);
return 0;
}
......@@ -351,6 +353,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
/* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
......@@ -412,6 +417,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
goto out;
switch (found_key.type) {
case BTRFS_QGROUP_INFO_KEY: {
struct btrfs_qgroup_info_item *ptr;
......@@ -500,11 +509,50 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
return ret < 0 ? ret : 0;
}
/*
* Called in close_ctree() when quota is still enabled. This verifies we don't
* leak any reserved space.
*
* Return false if no reserved space is left.
* Return true if some reserved space is leaked.
*/
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
bool ret = false;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return ret;
/*
* Since we're unmounting, there is no race and no need to grab qgroup
* lock. And here we don't go post-order to provide a more user
* friendly sorted result.
*/
for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
struct btrfs_qgroup *qgroup;
int i;
qgroup = rb_entry(node, struct btrfs_qgroup, node);
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
if (qgroup->rsv.values[i]) {
ret = true;
btrfs_warn(fs_info,
"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
btrfs_qgroup_level(qgroup->qgroupid),
btrfs_qgroup_subvolid(qgroup->qgroupid),
i, qgroup->rsv.values[i]);
}
}
}
return ret;
}
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths. And for the third one, we have set
......@@ -519,7 +567,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(qgroup);
__del_qgroup_rb(fs_info, qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
......@@ -528,6 +576,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
*/
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
btrfs_sysfs_del_qgroups(fs_info);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
......@@ -900,6 +949,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out;
}
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
......@@ -987,6 +1039,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
......@@ -1011,6 +1068,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_commit_transaction(trans);
trans = NULL;
......@@ -1046,6 +1108,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
fs_info->qgroup_ulist = NULL;
if (trans)
btrfs_end_transaction(trans);
btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
......@@ -1398,8 +1461,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
qgroup = add_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
if (IS_ERR(qgroup))
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
goto out;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
......@@ -2818,6 +2884,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
unlock:
spin_unlock(&fs_info->qgroup_lock);
if (!ret)
ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
......@@ -2826,20 +2894,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
return ret;
}
/*
* Two limits to commit transaction in advance.
*
* For RATIO, it will be 1/RATIO of the remaining limit as threshold.
* For SIZE, it will be in byte unit as threshold.
*/
#define QGROUP_FREE_RATIO 32
#define QGROUP_FREE_SIZE SZ_32M
static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup *qg, u64 num_bytes)
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
u64 free;
u64 threshold;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
......@@ -2848,32 +2904,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
/*
* Even if we passed the check, it's better to check if reservation
* for meta_pertrans is pushing us near limit.
* If there is too much pertrans reservation or it's near the limit,
* let's try commit transaction to free some, using transaction_kthread
*/
if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
QGROUP_FREE_SIZE);
} else {
free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
QGROUP_FREE_SIZE);
}
/*
* Use transaction_kthread to commit transaction, so we no
* longer need to bother nested transaction nor lock context.
*/
if (free < threshold)
btrfs_commit_transaction_locksafe(fs_info);
}
return true;
}
......@@ -2921,7 +2951,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
......@@ -3378,28 +3408,132 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
}
}
#define rbtree_iterate_from_safe(node, next, start) \
for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
static int qgroup_unreserve_range(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
u64 len)
{
struct rb_node *node;
struct rb_node *next;
struct ulist_node *entry = NULL;
int ret = 0;
node = reserved->range_changed.root.rb_node;
while (node) {
entry = rb_entry(node, struct ulist_node, rb_node);
if (entry->val < start)
node = node->rb_right;
else if (entry->val > start)
node = node->rb_left;
else
break;
}
/* Empty changeset */
if (!entry)
return 0;
if (entry->val > start && rb_prev(&entry->rb_node))
entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
rb_node);
rbtree_iterate_from_safe(node, next, &entry->rb_node) {
u64 entry_start;
u64 entry_end;
u64 entry_len;
int clear_ret;
entry = rb_entry(node, struct ulist_node, rb_node);
entry_start = entry->val;
entry_end = entry->aux;
entry_len = entry_end - entry_start + 1;
if (entry_start >= start + len)
break;
if (entry_start + entry_len <= start)
continue;
/*
* Now the entry is in [start, start + len), revert the
* EXTENT_QGROUP_RESERVED bit.
*/
clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
entry_end, EXTENT_QGROUP_RESERVED);
if (!ret && clear_ret < 0)
ret = clear_ret;
ulist_del(&reserved->range_changed, entry->val, entry->aux);
if (likely(reserved->bytes_changed >= entry_len)) {
reserved->bytes_changed -= entry_len;
} else {
WARN_ON(1);
reserved->bytes_changed = 0;
}
}
return ret;
}
/*
* Reserve qgroup space for range [start, start + len).
* Try to free some space for qgroup.
*
* This function will either reserve space from related qgroups or do
* nothing if the range is already reserved.
* For qgroup, there are only 3 ways to free qgroup space:
* - Flush nodatacow write
* Any nodatacow write will free its reserved data space at run_delalloc_range().
* In theory, we should only flush nodatacow inodes, but it's not yet
* possible, so we need to flush the whole root.
*
* Return 0 for successful reserve
* Return <0 for error (including -EDQUOT)
* - Wait for ordered extents
* When ordered extents are finished, their reserved metadata is finally
* converted to per_trans status, which can be freed by later commit
* transaction.
*
* NOTE: this function may sleep for memory allocation.
* if btrfs_qgroup_reserve_data() is called multiple times with
* same @reserved, caller must ensure when error happens it's OK
* to free *ALL* reserved space.
* - Commit transaction
* This would free the meta_per_trans space.
* In theory this shouldn't provide much space, but any extra qgroup
* space helps.
*/
int btrfs_qgroup_reserve_data(struct inode *inode,
static int try_flush_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
/*
* We don't want to run flush again and again, so if there is a running
* one, we won't try to start a new flush, but exit directly.
*/
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
wait_event(root->qgroup_flush_wait,
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
return 0;
}
ret = btrfs_start_delalloc_snapshot(root);
if (ret < 0)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
ret = btrfs_commit_transaction(trans);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
return ret;
}
static int qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_root *root = inode->root;
struct extent_changeset *reserved;
bool new_reserved = false;
u64 orig_reserved;
u64 to_reserve;
int ret;
......@@ -3412,6 +3546,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
if (WARN_ON(!reserved_ret))
return -EINVAL;
if (!*reserved_ret) {
new_reserved = true;
*reserved_ret = extent_changeset_alloc();
if (!*reserved_ret)
return -ENOMEM;
......@@ -3419,15 +3554,15 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
ret = set_record_extent_bits(&inode->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
trace_btrfs_qgroup_reserve_data(inode, start, len,
trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
goto cleanup;
goto out;
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
......@@ -3435,23 +3570,49 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
return ret;
cleanup:
/* cleanup *ALL* already reserved ranges */
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(&reserved->range_changed, &uiter)))
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
/* Also free data bytes of already reserved one */
btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
orig_reserved, BTRFS_QGROUP_RSV_DATA);
extent_changeset_release(reserved);
qgroup_unreserve_range(inode, reserved, start, len);
out:
if (new_reserved) {
extent_changeset_release(reserved);
kfree(reserved);
*reserved_ret = NULL;
}
return ret;
}
/*
* Reserve qgroup space for range [start, start + len).
*
* This function will either reserve space from related qgroups or do nothing
* if the range is already reserved.
*
* Return 0 for successful reservation
* Return <0 for error (including -EDQUOT)
*
* NOTE: This function may sleep for memory allocation, dirty page flushing and
* commit transaction. So caller should not hold any dirty page locked.
*/
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
int ret;
ret = qgroup_reserve_data(inode, reserved_ret, start, len);
if (ret <= 0 && ret != -EDQUOT)
return ret;
ret = try_flush_qgroup(inode->root);
if (ret < 0)
return ret;
return qgroup_reserve_data(inode, reserved_ret, start, len);
}
/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct inode *inode,
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
......@@ -3487,8 +3648,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So no need to rush.
*/
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
free_start, free_start + free_len - 1,
ret = clear_record_extent_bits(&inode->io_tree, free_start,
free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
......@@ -3502,7 +3663,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
return ret;
}
static int __btrfs_qgroup_release_data(struct inode *inode,
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
int free)
{
......@@ -3510,8 +3671,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
&BTRFS_I(inode)->root->fs_info->flags))
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
return 0;
/* In release case, we shouldn't have @reserved */
......@@ -3519,18 +3679,18 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
if (free)
trace_op = QGROUP_FREE;
trace_btrfs_qgroup_release_data(inode, start, len,
trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->root_key.objectid,
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
......@@ -3550,7 +3710,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
int btrfs_qgroup_free_data(struct inode *inode,
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
......@@ -3571,7 +3731,7 @@ int btrfs_qgroup_free_data(struct inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
......@@ -3616,7 +3776,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
return num_bytes;
}
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
......@@ -3643,6 +3803,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return ret;
}
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
{
int ret;
ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
if (ret <= 0 && ret != -EDQUOT)
return ret;
ret = try_flush_qgroup(root);
if (ret < 0)
return ret;
return qgroup_reserve_meta(root, num_bytes, type, enforce);
}
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
......@@ -3742,7 +3917,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
* Check qgroup reserved space leaking, normally at destroy inode
* time
*/
void btrfs_qgroup_check_reserved_leak(struct inode *inode)
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
struct extent_changeset changeset;
struct ulist_node *unode;
......@@ -3750,19 +3925,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
int ret;
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
ULIST_ITER_INIT(&iter);
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
btrfs_warn(BTRFS_I(inode)->root->fs_info,
"leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
inode->i_ino, unode->val, unode->aux);
btrfs_warn(inode->root->fs_info,
"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
btrfs_ino(inode), unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->root_key.objectid,
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
......
......@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
......@@ -223,8 +224,18 @@ struct btrfs_qgroup {
*/
u64 old_refcnt;
u64 new_refcnt;
/*
* Sysfs kobjectid
*/
struct kobject kobj;
};
static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
{
return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
}
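/*
 * Worked example for the helper above (illustrative values): qgroup "1/5"
 * is stored as qgroupid = (1ULL << BTRFS_QGROUP_LEVEL_SHIFT) | 5, so
 * btrfs_qgroup_subvolid() masks off the level bits and returns 5, and the
 * matching btrfs_qgroup_level() helper, used by the quota leak warning
 * earlier in this diff, recovers the level, 1, from the high bits.
 */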
/*
* For qgroup event trace points only
*/
......@@ -344,12 +355,12 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct inode *inode,
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
u64 len);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
......@@ -399,7 +410,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
*/
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
......@@ -415,5 +426,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
......@@ -1083,7 +1083,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
u64 last_end = 0;
int ret;
struct bio *bio;
struct btrfs_bio_stripe *stripe;
......@@ -1098,15 +1097,14 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
last_end = (u64)last->bi_iter.bi_sector << 9;
u64 last_end = (u64)last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
* devices or if they are not contiguous
*/
if (last_end == disk_start && stripe->dev->bdev &&
!last->bi_status &&
if (last_end == disk_start && !last->bi_status &&
last->bi_disk == stripe->dev->bdev->bd_disk &&
last->bi_partno == stripe->dev->bdev->bd_partno) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
......@@ -1117,6 +1115,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* put a new bio on the list */
bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
btrfs_io_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
......@@ -1325,11 +1324,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
while (1) {
bio = bio_list_pop(&bio_list);
if (!bio)
break;
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
......@@ -1354,7 +1349,6 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
u64 physical = bio->bi_iter.bi_sector;
u64 stripe_start;
int i;
struct btrfs_bio_stripe *stripe;
......@@ -1362,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
stripe_start = stripe->physical;
if (physical >= stripe_start &&
physical < stripe_start + rbio->stripe_len &&
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev &&
bio->bi_disk == stripe->dev->bdev->bd_disk &&
bio->bi_partno == stripe->dev->bdev->bd_partno) {
......@@ -1382,18 +1374,14 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
u64 logical = bio->bi_iter.bi_sector;
u64 stripe_start;
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
int i;
logical <<= 9;
for (i = 0; i < rbio->nr_data; i++) {
stripe_start = rbio->bbio->raid_map[i];
if (logical >= stripe_start &&
logical < stripe_start + rbio->stripe_len) {
u64 stripe_start = rbio->bbio->raid_map[i];
if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
}
}
return -1;
}
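Both stripe-lookup helpers above now use btrfs' in_range() macro instead of the open-coded pair of comparisons; in the btrfs headers it expands to a plain half-open check, (b) >= (first) && (b) < (first) + (len), so the rewrite is behaviour-preserving. A minimal standalone sketch of that equivalence (values are illustrative, and the macro is copied here only for the demo):

#include <assert.h>
#include <stdint.h>

/* Same shape as the btrfs helper: true when b falls in [first, first + len). */
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))

int main(void)
{
	uint64_t stripe_start = 1ULL << 20; /* hypothetical stripe->physical */
	uint64_t stripe_len = 64 * 1024;    /* hypothetical rbio->stripe_len */

	assert(in_range(stripe_start, stripe_start, stripe_len));
	assert(in_range(stripe_start + stripe_len - 1, stripe_start, stripe_len));
	assert(!in_range(stripe_start + stripe_len, stripe_start, stripe_len));
	return 0;
}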
......@@ -1567,11 +1555,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while (1) {
bio = bio_list_pop(&bio_list);
if (!bio)
break;
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
......@@ -1878,11 +1862,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* make sure our ps and qs are in order */
if (faila > failb) {
int tmp = failb;
failb = faila;
faila = tmp;
}
if (faila > failb)
swap(faila, failb);
/* if the q stripe is failed, do a pstripe reconstruction
* from the xors.
......@@ -2102,7 +2083,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
*/
if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
__raid_recover_end_io(rbio);
goto out;
return 0;
} else {
goto cleanup;
}
......@@ -2113,11 +2094,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while (1) {
bio = bio_list_pop(&bio_list);
if (!bio)
break;
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
......@@ -2126,7 +2103,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
submit_bio(bio);
}
out:
return 0;
cleanup:
......@@ -2482,11 +2459,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
atomic_set(&rbio->stripes_pending, nr_data);
while (1) {
bio = bio_list_pop(&bio_list);
if (!bio)
break;
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
......@@ -2664,11 +2637,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while (1) {
bio = bio_list_pop(&bio_list);
if (!bio)
break;
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
......
......@@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
exist_re = insert_root_entry(&exist->roots, re);
if (exist_re)
kfree(re);
} else {
kfree(re);
}
kfree(be);
return exist;
......
......@@ -68,8 +68,8 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
block_size);
ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
file_offset, block_size);
if (ret)
goto out;
......@@ -84,7 +84,8 @@ static int copy_inline_to_page(struct inode *inode,
clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
0, NULL);
if (ret)
goto out_unlock;
......@@ -133,8 +134,8 @@ static int copy_inline_to_page(struct inode *inode,
put_page(page);
}
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
block_size, true);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
file_offset, block_size, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
out:
extent_changeset_free(data_reserved);
......@@ -336,6 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
while (1) {
u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
u64 extent_gen;
int type;
u32 size;
struct btrfs_key new_key;
......@@ -384,6 +386,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
extent_gen = btrfs_file_extent_generation(leaf, extent);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG ||
......@@ -488,6 +491,19 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
btrfs_release_path(path);
/*
* If this is a new extent update the last_reflink_trans of both
* inodes. This is used by fsync to make sure it does not log
* multiple checksum items with overlapping ranges. For older
* extents we don't need to do it since inode logging skips the
* checksums for older extents. Also ignore holes and inline
* extents because they don't have checksums in the csum tree.
*/
if (extent_gen == trans->transid && disko > 0) {
BTRFS_I(src)->last_reflink_trans = trans->transid;
BTRFS_I(inode)->last_reflink_trans = trans->transid;
}
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
ret = clone_finish_inode_update(trans, inode, last_dest_end,
......
......@@ -1686,12 +1686,20 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
/*
* In merge_reloc_root(), we modify the upper level pointer to swap the
* tree blocks between reloc tree and subvolume tree. Thus for tree
* block COW, we COW at most from level 1 to root level for each tree.
*
* Thus the needed metadata size is at most root_level * nodesize,
* and * 2 since we have two trees to COW.
*/
min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
BTRFS_RESERVE_FLUSH_ALL);
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
err = ret;
goto out;
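The new comment ties the reservation to the actual root level of the trees being merged instead of the BTRFS_MAX_LEVEL (8) worst case. A quick sanity check on the numbers, assuming the default 16 KiB nodesize and a tree whose root sits at level 3 (an illustrative value):

	old: min_reserved = 16384 * (8 - 1) * 2 = 229376 bytes (224 KiB)
	new: min_reserved = 16384 * 3 * 2       =  98304 bytes ( 96 KiB)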
......@@ -2571,58 +2579,50 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
return err;
}
static noinline_for_stack
int prealloc_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
static noinline_for_stack int prealloc_file_extent_cluster(
struct btrfs_inode *inode,
struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
u64 end;
u64 offset = BTRFS_I(inode)->index_cnt;
u64 offset = inode->index_cnt;
u64 num_bytes;
int nr = 0;
int nr;
int ret = 0;
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset;
struct extent_changeset *data_reserved = NULL;
u64 cur_offset = prealloc_start;
BUG_ON(cluster->start != cluster->boundary[0]);
inode_lock(inode);
ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
prealloc_end + 1 - prealloc_start);
ret = btrfs_alloc_data_chunk_ondemand(inode,
prealloc_end + 1 - prealloc_start);
if (ret)
goto out;
return ret;
cur_offset = prealloc_start;
while (nr < cluster->nr) {
inode_lock(&inode->vfs_inode);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
lock_extent(&inode->io_tree, start, end);
num_bytes = end + 1 - start;
if (cur_offset < start)
btrfs_free_reserved_data_space(inode, data_reserved,
cur_offset, start - cur_offset);
ret = btrfs_prealloc_file_range(inode, 0, start,
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
unlock_extent(&inode->io_tree, start, end);
if (ret)
break;
nr++;
}
inode_unlock(&inode->vfs_inode);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space(inode, data_reserved,
cur_offset, prealloc_end + 1 - cur_offset);
out:
inode_unlock(inode);
extent_changeset_free(data_reserved);
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
prealloc_end + 1 - cur_offset);
return ret;
}
......@@ -2664,7 +2664,8 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
*/
int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req);
return atomic_read(&fs_info->balance_cancel_req) ||
fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
......@@ -2690,7 +2691,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
ret = prealloc_file_extent_cluster(inode, cluster);
ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
if (ret)
goto out;
......@@ -2762,8 +2763,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
nr++;
}
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
NULL);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
page_end, 0, NULL);
if (ret) {
unlock_page(page);
put_page(page);
......@@ -3872,9 +3873,9 @@ int btrfs_recover_relocation(struct btrfs_root *root)
* cloning checksum properly handles the nodatasum extents.
* it also saves CPU time to re-calculate the checksum.
*/
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
......@@ -3885,7 +3886,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
disk_bytenr = file_pos + inode->index_cnt;
ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
......
This diff has been collapsed.
......@@ -468,8 +468,8 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
cache->start, cache->length, cache->used, cache->pinned,
cache->reserved, cache->ro ? "[readonly]" : "");
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
btrfs_dump_free_space(cache, bytes);
}
if (++index < BTRFS_NR_RAID_TYPES)
goto again;
......
This diff has been collapsed.
This diff has been collapsed.
......@@ -36,4 +36,11 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
void btrfs_sysfs_update_devid(struct btrfs_device *device);
int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup);
void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup);
#endif
......@@ -60,8 +60,6 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
if (i >= num_extents)
goto invalid;
if (i >= num_extents ||
extent_start != extents[i].start ||
offset - extent_start != extents[i].length)
......
......@@ -954,8 +954,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
NULL);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
......@@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
0, NULL);
if (ret) {
......@@ -999,7 +999,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
0, NULL);
......@@ -1017,7 +1017,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
ret = btrfs_set_extent_delalloc(inode,
ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
0, NULL);
......@@ -1035,7 +1035,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
ret = btrfs_set_extent_delalloc(inode,
ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
......@@ -1069,7 +1069,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
* Refill the hole again just for good measure, because I thought it
* might fail and I'd rather satisfy my paranoia at this point.
*/
ret = btrfs_set_extent_delalloc(inode,
ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
......
......@@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
err = -EIO;
if (TRANS_ABORTED(trans))
err = trans->aborted;
else
err = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
......@@ -1630,7 +1633,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
pending->snap = btrfs_get_fs_root(fs_info, objectid, true);
pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
btrfs_abort_transaction(trans, ret);
......@@ -2351,7 +2354,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
......
......@@ -151,18 +151,20 @@ struct btrfs_pending_snapshot {
struct btrfs_block_rsv block_rsv;
/* extra metadata reservation for relocation */
int error;
/* Preallocated anonymous block device number */
dev_t anon_dev;
bool readonly;
struct list_head list;
};
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
struct btrfs_inode *inode)
{
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
spin_unlock(&BTRFS_I(inode)->lock);
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
inode->last_sub_trans = inode->root->log_transid;
inode->last_log_commit = inode->root->last_log_commit;
spin_unlock(&inode->lock);
}
/*
......@@ -208,20 +210,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
/*
* Try to commit transaction asynchronously, so this is safe to call
* even holding a spinlock.
*
* It's done by informing transaction_kthread to commit transaction without
* waiting for commit interval.
*/
static inline void btrfs_commit_transaction_locksafe(
struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
wake_up_process(fs_info->transaction_kthread);
}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
......
......@@ -133,10 +133,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = 0;
}
done:
if (ret != -EAGAIN) {
if (ret != -EAGAIN)
memset(&root->defrag_progress, 0,
sizeof(root->defrag_progress));
root->defrag_trans_start = trans->transid;
}
return ret;
}
This diff has been collapsed.
This diff has been collapsed.
......@@ -288,7 +288,7 @@ struct btrfs_fs_devices {
*/
struct btrfs_io_bio {
unsigned int mirror_num;
unsigned int stripe_index;
struct btrfs_device *device;
u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
......
This diff has been collapsed.
......@@ -243,6 +243,18 @@ struct btrfs_ioctl_dev_info_args {
__u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
};
/*
* Retrieve information about the filesystem
*/
/* Request information about checksum type and size */
#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0)
/* Request information about filesystem generation */
#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1)
/* Request information about filesystem metadata UUID */
#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2)
struct btrfs_ioctl_fs_info_args {
__u64 max_id; /* out */
__u64 num_devices; /* out */
......@@ -250,8 +262,13 @@ struct btrfs_ioctl_fs_info_args {
__u32 nodesize; /* out */
__u32 sectorsize; /* out */
__u32 clone_alignment; /* out */
__u32 reserved32;
__u64 reserved[122]; /* pad to 1k */
/* See BTRFS_FS_INFO_FLAG_* */
__u16 csum_type; /* out */
__u16 csum_size; /* out */
__u64 flags; /* in/out */
__u64 generation; /* out */
__u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */
__u8 reserved[944]; /* pad to 1k */
};
/*
......
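These additions implement the FS_INFO ioctl enhancements from the changelog: userspace sets the new in/out flags field to say which extra items it wants, and the kernel fills the corresponding out fields. A minimal userspace sketch, assuming a uapi <linux/btrfs.h> that already carries these definitions (the path argument and error handling are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>  /* struct btrfs_ioctl_fs_info_args, BTRFS_IOC_FS_INFO */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_fs_info_args args;
	int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	/* Request the newly added checksum and generation items. */
	args.flags = BTRFS_FS_INFO_FLAG_CSUM_INFO | BTRFS_FS_INFO_FLAG_GENERATION;

	if (ioctl(fd, BTRFS_IOC_FS_INFO, &args) < 0) {
		perror("BTRFS_IOC_FS_INFO");
		close(fd);
		return 1;
	}

	printf("csum type %u, csum size %u, generation %llu\n",
	       args.csum_type, args.csum_size,
	       (unsigned long long)args.generation);
	close(fd);
	return 0;
}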
......@@ -913,9 +913,9 @@ struct btrfs_free_space_info {
#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
#define BTRFS_QGROUP_LEVEL_SHIFT 48
static inline __u64 btrfs_qgroup_level(__u64 qgroupid)
static inline __u16 btrfs_qgroup_level(__u64 qgroupid)
{
return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
return (__u16)(qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT);
}
/*
......
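The narrowing to __u16 reflects the qgroupid layout: the top 16 bits of a qgroupid carry the level and the low 48 bits the subvolume id (hence the shift of 48 above); the new btrfs_qgroup_subvolid() helper in qgroup.h masks out the latter. A standalone userspace sketch of the split, mirroring the two helpers shown in this diff (qgroup "1/257" is an illustrative id):

#include <stdint.h>
#include <stdio.h>

#define BTRFS_QGROUP_LEVEL_SHIFT 48  /* as in btrfs_tree.h above */

static uint16_t qgroup_level(uint64_t qgroupid)
{
	return (uint16_t)(qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT);
}

static uint64_t qgroup_subvolid(uint64_t qgroupid)
{
	return qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1);
}

int main(void)
{
	/* qgroup "1/257": level 1, subvolume id 257 */
	uint64_t qgroupid = (1ULL << BTRFS_QGROUP_LEVEL_SHIFT) | 257;

	printf("level %u, subvolid %llu\n", qgroup_level(qgroupid),
	       (unsigned long long)qgroup_subvolid(qgroupid));
	return 0;
}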