提交 f28a4928 编写于 作者: F Filipe Manana

Btrfs: fix leaking of ordered extents after direct IO write error

When doing a direct IO write, __blockdev_direct_IO() can call the
btrfs_get_blocks_direct() callback one or more times before it calls the
btrfs_submit_direct() callback. However it can fail after calling the
first callback and before calling the second callback, which is a problem
because the first one creates ordered extents and the second one is the
one that submits bios that cover the ordered extents created by the first
one. That means the ordered extents will never complete nor have any of
the flags BTRFS_ORDERED_IO_DONE / BTRFS_ORDERED_IOERR set, resulting in
subsequent operations (such as other direct IO writes, buffered writes or
hole punching) that lock the same IO range and lookup for ordered extents
in the range to hang forever waiting for those ordered extents because
they can not complete ever, since no bio was submitted.

Fix this by tracking a range of created ordered extents that don't have
yet corresponding bios submitted and completing the ordered extents in
the range if __blockdev_direct_IO() fails with an error.
Signed-off-by: NFilipe Manana <fdmanana@suse.com>
上级 b850ae14
...@@ -66,6 +66,13 @@ struct btrfs_iget_args { ...@@ -66,6 +66,13 @@ struct btrfs_iget_args {
struct btrfs_root *root; struct btrfs_root *root;
}; };
struct btrfs_dio_data {
u64 outstanding_extents;
u64 reserve;
u64 unsubmitted_oe_range_start;
u64 unsubmitted_oe_range_end;
};
static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations;
...@@ -7481,11 +7488,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, ...@@ -7481,11 +7488,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
return em; return em;
} }
struct btrfs_dio_data {
u64 outstanding_extents;
u64 reserve;
};
static void adjust_dio_outstanding_extents(struct inode *inode, static void adjust_dio_outstanding_extents(struct inode *inode,
struct btrfs_dio_data *dio_data, struct btrfs_dio_data *dio_data,
const u64 len) const u64 len)
...@@ -7669,6 +7671,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ...@@ -7669,6 +7671,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
btrfs_free_reserved_data_space(inode, start, len); btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len); WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len; dio_data->reserve -= len;
dio_data->unsubmitted_oe_range_end = start + len;
current->journal_info = dio_data; current->journal_info = dio_data;
} }
...@@ -8342,6 +8345,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, ...@@ -8342,6 +8345,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->subio_endio = btrfs_subio_endio_read; dip->subio_endio = btrfs_subio_endio_read;
} }
/*
* Reset the range for unsubmitted ordered extents (to a 0 length range)
* even if we fail to submit a bio, because in such case we do the
* corresponding error handling below and it must not be done a second
* time by btrfs_direct_IO().
*/
if (write) {
struct btrfs_dio_data *dio_data = current->journal_info;
dio_data->unsubmitted_oe_range_end = dip->logical_offset +
dip->bytes;
dio_data->unsubmitted_oe_range_start =
dio_data->unsubmitted_oe_range_end;
}
ret = btrfs_submit_direct_hook(rw, dip, skip_sum); ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret) if (!ret)
return; return;
...@@ -8478,6 +8496,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ...@@ -8478,6 +8496,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* originally calculated. Abuse current->journal_info for this. * originally calculated. Abuse current->journal_info for this.
*/ */
dio_data.reserve = round_up(count, root->sectorsize); dio_data.reserve = round_up(count, root->sectorsize);
dio_data.unsubmitted_oe_range_start = (u64)offset;
dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data; current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) { &BTRFS_I(inode)->runtime_flags)) {
...@@ -8496,6 +8516,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ...@@ -8496,6 +8516,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (dio_data.reserve) if (dio_data.reserve)
btrfs_delalloc_release_space(inode, offset, btrfs_delalloc_release_space(inode, offset,
dio_data.reserve); dio_data.reserve);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
* cleanup them up to avoid other tasks getting them
* and waiting for them to complete forever.
*/
if (dio_data.unsubmitted_oe_range_start <
dio_data.unsubmitted_oe_range_end)
btrfs_endio_direct_write_update_ordered(inode,
dio_data.unsubmitted_oe_range_start,
dio_data.unsubmitted_oe_range_end -
dio_data.unsubmitted_oe_range_start,
0);
} else if (ret >= 0 && (size_t)ret < count) } else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, offset, btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret); count - (size_t)ret);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册