提交 839a3f76 编写于 作者: L Linus Torvalds

Merge branch 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs fixes from Chris Mason:
 "These are bug fixes, including a really old fsync bug, and a few trace
  points to help us track down problems in the quota code"

* 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  Btrfs: fix file/data loss caused by fsync after rename and new inode
  btrfs: Reset IO error counters before start of device replacing
  btrfs: Add qgroup tracing
  Btrfs: don't use src fd for printk
  btrfs: fallback to vmalloc in btrfs_compare_tree
  btrfs: handle non-fatal errors in btrfs_qgroup_inherit()
  btrfs: Output more info for enospc_debug mount option
  Btrfs: fix invalid reference in replace_path
  Btrfs: Improve FL_KEEP_SIZE handling in fallocate
......@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
......@@ -5361,10 +5362,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL);
tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
if (!tmp_buf) {
ret = -ENOMEM;
goto out;
tmp_buf = vmalloc(left_root->nodesize);
if (!tmp_buf) {
ret = -ENOMEM;
goto out;
}
}
left_path->search_commit_root = 1;
......@@ -5565,7 +5569,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
kfree(tmp_buf);
kvfree(tmp_buf);
return ret;
}
......
......@@ -394,6 +394,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
dev_replace->cursor_right = 0;
dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace, 1);
......
......@@ -9386,15 +9386,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
u64 dev_min = 1;
u64 dev_nr = 0;
u64 target;
int debug;
int index;
int full = 0;
int ret = 0;
debug = btrfs_test_opt(root, ENOSPC_DEBUG);
block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
/* odd, couldn't find the block group, leave it alone */
if (!block_group)
if (!block_group) {
if (debug)
btrfs_warn(root->fs_info,
"can't find block group for bytenr %llu",
bytenr);
return -1;
}
min_free = btrfs_block_group_used(&block_group->item);
......@@ -9448,8 +9456,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* this is just a balance, so if we were marked as full
* we know there is no space for a new chunk
*/
if (full)
if (full) {
if (debug)
btrfs_warn(root->fs_info,
"no space to alloc new chunk for block group %llu",
block_group->key.objectid);
goto out;
}
index = get_block_group_index(block_group);
}
......@@ -9496,6 +9509,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
ret = -1;
}
}
if (debug && ret == -1)
btrfs_warn(root->fs_info,
"no space to allocate a new chunk for block group %llu",
block_group->key.objectid);
mutex_unlock(&root->fs_info->chunk_mutex);
btrfs_end_transaction(trans, root);
out:
......
......@@ -2682,9 +2682,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret;
inode_lock(inode);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
goto out;
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
if (ret)
goto out;
}
/*
* TODO: Move these two operations after we have checked
......
......@@ -1654,7 +1654,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
src_inode = file_inode(src.file);
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(src_inode)->root->fs_info,
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
......
......@@ -1463,6 +1463,7 @@ struct btrfs_qgroup_extent_record
u64 bytenr = record->bytenr;
assert_spin_locked(&delayed_refs->lock);
trace_btrfs_qgroup_insert_dirty_extent(record);
while (*p) {
parent_node = *p;
......@@ -1594,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
trace_qgroup_update_counters(qg->qgroupid, cur_old_count,
cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
......@@ -1683,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
goto out_free;
BUG_ON(!fs_info->quota_root);
trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots,
nr_new_roots);
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
ret = -ENOMEM;
......@@ -1752,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
trace_btrfs_qgroup_account_extents(record);
if (!ret) {
/*
* Use (u64)-1 as time_seq to do special search, which
......@@ -1842,8 +1851,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
}
/*
* copy the acounting information between qgroups. This is necessary when a
* snapshot or a subvolume is created
* Copy the acounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
* cause a transaction abort so we take extra care here to only error
* when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
......@@ -1873,15 +1884,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2 * inherit->num_excl_copies;
for (i = 0; i < nums; ++i) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
if (!srcgroup) {
ret = -EINVAL;
goto out;
}
if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
ret = -EINVAL;
goto out;
}
/*
* Zero out invalid groups so we can ignore
* them later.
*/
if (!srcgroup ||
((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
*i_qgroups = 0ULL;
++i_qgroups;
}
}
......@@ -1916,17 +1927,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
if (*i_qgroups == 0)
continue;
ret = add_qgroup_relation_item(trans, quota_root,
objectid, *i_qgroups);
if (ret)
if (ret && ret != -EEXIST)
goto out;
ret = add_qgroup_relation_item(trans, quota_root,
*i_qgroups, objectid);
if (ret)
if (ret && ret != -EEXIST)
goto out;
++i_qgroups;
}
ret = 0;
}
......@@ -1987,17 +2000,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
ret = add_relation_rb(quota_root->fs_info, objectid,
*i_qgroups);
if (ret)
goto unlock;
if (*i_qgroups) {
ret = add_relation_rb(quota_root->fs_info, objectid,
*i_qgroups);
if (ret)
goto unlock;
}
++i_qgroups;
}
for (i = 0; i < inherit->num_ref_copies; ++i) {
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
if (!i_qgroups[0] || !i_qgroups[1])
continue;
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
......@@ -2008,12 +2026,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
i_qgroups += 2;
}
for (i = 0; i < inherit->num_excl_copies; ++i) {
for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
if (!i_qgroups[0] || !i_qgroups[1])
continue;
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
......@@ -2024,7 +2044,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
i_qgroups += 2;
}
unlock:
......
......@@ -1850,6 +1850,7 @@ int replace_path(struct btrfs_trans_handle *trans,
eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
} else if (!extent_buffer_uptodate(eb)) {
ret = -EIO;
free_extent_buffer(eb);
......
......@@ -4415,6 +4415,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
return ret;
}
/*
* When we are logging a new inode X, check if it doesn't have a reference that
* matches the reference from some other inode Y created in a past transaction
* and that was renamed in the current transaction. If we don't do this, then at
* log replay time we can lose inode Y (and all its files if it's a directory):
*
* mkdir /mnt/x
* echo "hello world" > /mnt/x/foobar
* sync
* mv /mnt/x /mnt/y
* mkdir /mnt/x # or touch /mnt/x
* xfs_io -c fsync /mnt/x
* <power fail>
* mount fs, trigger log replay
*
* After the log replay procedure, we would lose the first directory and all its
* files (file foobar).
* For the case where inode Y is not a directory we simply end up losing it:
*
* echo "123" > /mnt/foo
* sync
* mv /mnt/foo /mnt/bar
* echo "abc" > /mnt/foo
* xfs_io -c fsync /mnt/foo
* <power fail>
*
* We also need this for cases where a snapshot entry is replaced by some other
* entry (file or directory) otherwise we end up with an unreplayable log due to
* attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
* if it were a regular entry:
*
* mkdir /mnt/x
* btrfs subvolume snapshot /mnt /mnt/x/snap
* btrfs subvolume delete /mnt/x/snap
* rmdir /mnt/x
* mkdir /mnt/x
* fsync /mnt/x or fsync some new file inside it
* <power fail>
*
* The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
* the same transaction.
*/
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
struct inode *inode)
{
int ret;
struct btrfs_path *search_path;
char *name = NULL;
u32 name_len = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
u32 cur_offset = 0;
unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
search_path = btrfs_alloc_path();
if (!search_path)
return -ENOMEM;
search_path->search_commit_root = 1;
search_path->skip_locking = 1;
while (cur_offset < item_size) {
u64 parent;
u32 this_name_len;
u32 this_len;
unsigned long name_ptr;
struct btrfs_dir_item *di;
if (key->type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
parent = key->offset;
this_name_len = btrfs_inode_ref_name_len(eb, iref);
name_ptr = (unsigned long)(iref + 1);
this_len = sizeof(*iref) + this_name_len;
} else {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)(ptr +
cur_offset);
parent = btrfs_inode_extref_parent(eb, extref);
this_name_len = btrfs_inode_extref_name_len(eb, extref);
name_ptr = (unsigned long)&extref->name;
this_len = sizeof(*extref) + this_name_len;
}
if (this_name_len > name_len) {
char *new_name;
new_name = krealloc(name, this_name_len, GFP_NOFS);
if (!new_name) {
ret = -ENOMEM;
goto out;
}
name_len = this_name_len;
name = new_name;
}
read_extent_buffer(eb, name, name_ptr, this_name_len);
di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
search_path, parent,
name, this_name_len, 0);
if (di && !IS_ERR(di)) {
ret = 1;
goto out;
} else if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
btrfs_release_path(search_path);
cur_offset += this_len;
}
ret = 0;
out:
btrfs_free_path(search_path);
kfree(name);
return ret;
}
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
......@@ -4602,6 +4723,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (min_key.type == BTRFS_INODE_ITEM_KEY)
need_log_inode_item = false;
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
BTRFS_I(inode)->generation == trans->transid) {
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0],
&min_key, inode);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0) {
err = 1;
btrfs_set_log_full_commit(root->fs_info, trans);
goto out_unlock;
}
}
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
if (ins_nr == 0)
......
......@@ -23,7 +23,7 @@ struct map_lookup;
struct extent_buffer;
struct btrfs_work;
struct __btrfs_workqueue;
struct btrfs_qgroup_operation;
struct btrfs_qgroup_extent_record;
#define show_ref_type(type) \
__print_symbolic(type, \
......@@ -1231,6 +1231,93 @@ DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
TP_ARGS(ref_root, reserved)
);
DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
TP_PROTO(struct btrfs_qgroup_extent_record *rec),
TP_ARGS(rec),
TP_STRUCT__entry(
__field( u64, bytenr )
__field( u64, num_bytes )
),
TP_fast_assign(
__entry->bytenr = rec->bytenr,
__entry->num_bytes = rec->num_bytes;
),
TP_printk("bytenr = %llu, num_bytes = %llu",
(unsigned long long)__entry->bytenr,
(unsigned long long)__entry->num_bytes)
);
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
TP_PROTO(struct btrfs_qgroup_extent_record *rec),
TP_ARGS(rec)
);
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent,
TP_PROTO(struct btrfs_qgroup_extent_record *rec),
TP_ARGS(rec)
);
TRACE_EVENT(btrfs_qgroup_account_extent,
TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots),
TP_STRUCT__entry(
__field( u64, bytenr )
__field( u64, num_bytes )
__field( u64, nr_old_roots )
__field( u64, nr_new_roots )
),
TP_fast_assign(
__entry->bytenr = bytenr;
__entry->num_bytes = num_bytes;
__entry->nr_old_roots = nr_old_roots;
__entry->nr_new_roots = nr_new_roots;
),
TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, "
"nr_new_roots = %llu",
__entry->bytenr,
__entry->num_bytes,
__entry->nr_old_roots,
__entry->nr_new_roots)
);
TRACE_EVENT(qgroup_update_counters,
TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count),
TP_ARGS(qgid, cur_old_count, cur_new_count),
TP_STRUCT__entry(
__field( u64, qgid )
__field( u64, cur_old_count )
__field( u64, cur_new_count )
),
TP_fast_assign(
__entry->qgid = qgid;
__entry->cur_old_count = cur_old_count;
__entry->cur_new_count = cur_new_count;
),
TP_printk("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu",
__entry->qgid,
__entry->cur_old_count,
__entry->cur_new_count)
);
#endif /* _TRACE_BTRFS_H */
/* This part must be outside protection */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册