提交 acb3e26f 编写于 作者: D Dave Chinner

Merge branch 'xfs-dio-fix-4.6' into for-next

......@@ -267,8 +267,13 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
inode_unlock(inode);
if ((retval > 0) && end_io)
end_io(iocb, pos, retval, bh.b_private);
if (end_io) {
int err;
err = end_io(iocb, pos, retval, bh.b_private);
if (err)
retval = err;
}
if (!(flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(inode);
......
......@@ -253,8 +253,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
if (ret == 0)
ret = transferred;
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, transferred, dio->private);
if (dio->end_io) {
int err;
err = dio->end_io(dio->iocb, offset, ret, dio->private);
if (err)
ret = err;
}
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(dio->inode);
......
......@@ -1504,15 +1504,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
{
return inode->i_private;
......@@ -3293,6 +3284,27 @@ extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
extern int ext4_resize_begin(struct super_block *sb);
extern void ext4_resize_end(struct super_block *sb);
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
}
}
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
......
......@@ -3161,14 +3161,14 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
}
#endif
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO just return */
if (!io_end)
return;
return 0;
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
......@@ -3176,9 +3176,19 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size);
iocb->private = NULL;
/*
* Error during AIO DIO. We cannot convert unwritten extents as the
* data was not written. Just clear the unwritten flag and drop io_end.
*/
if (size <= 0) {
ext4_clear_io_unwritten_flag(io_end);
size = 0;
}
io_end->offset = offset;
io_end->size = size;
ext4_put_io_end(io_end);
return 0;
}
/*
......@@ -3301,16 +3311,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (io_end) {
ext4_inode_aio_set(inode, NULL);
ext4_put_io_end(io_end);
/*
* When no IO was submitted ext4_end_io_dio() was not
* called so we have to put iocb's reference.
*/
if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
WARN_ON(iocb->private != io_end);
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
ext4_put_io_end(io_end);
iocb->private = NULL;
}
}
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
......
......@@ -139,16 +139,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
kmem_cache_free(io_end_cachep, io_end);
}
static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
}
/*
* Check a range of space and convert unwritten extents to written. Note that
* we are protected from truncate touching same part of extent tree by the
......
......@@ -620,7 +620,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
* particularly interested in the aio/dio case. We use the rw_lock DLM lock
* to protect io on one node from truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
static int ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
void *private)
......@@ -628,6 +628,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
struct inode *inode = file_inode(iocb->ki_filp);
int level;
if (bytes <= 0)
return 0;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
......@@ -644,6 +647,8 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
}
return 0;
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
......
......@@ -36,6 +36,10 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
#define XFS_DIO_FLAG_APPEND (1 << 1)
void
xfs_count_page_state(
struct page *page,
......@@ -1238,27 +1242,8 @@ xfs_vm_releasepage(
}
/*
* When we map a DIO buffer, we may need to attach an ioend that describes the
* type of write IO we are doing. This passes to the completion function the
* operations it needs to perform. If the mapping is for an overwrite wholly
* within the EOF then we don't need an ioend and so we don't allocate one.
* This avoids the unnecessary overhead of allocating and freeing ioends for
* workloads that don't require transactions on IO completion.
*
* If we get multiple mappings in a single IO, we might be mapping different
* types. But because the direct IO can only have a single private pointer, we
* need to ensure that:
*
* a) i) the ioend spans the entire region of unwritten mappings; or
* ii) the ioend spans all the mappings that cross or are beyond EOF; and
* b) if it contains unwritten extents, it is *permanently* marked as such
*
* We could do this by chaining ioends like buffered IO does, but we only
* actually get one IO completion callback from the direct IO, and that spans
* the entire IO regardless of how many mappings and IOs are needed to complete
* the DIO. There is only going to be one reference to the ioend and its life
* cycle is constrained by the DIO completion code. hence we don't need
* reference counting here.
* When we map a DIO buffer, we may need to pass flags to
* xfs_end_io_direct_write to tell it what kind of write IO we are doing.
*
* Note that for DIO, an IO to the highest supported file block offset (i.e.
* 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
......@@ -1266,68 +1251,26 @@ xfs_vm_releasepage(
* extending the file size. We won't know for sure until IO completion is run
* and the actual max write offset is communicated to the IO completion
* routine.
*
* For DAX page faults, we are preparing to never see unwritten extents here,
* nor should we ever extend the inode size. Hence we will soon have nothing to
* do here for this case, ensuring we don't have to provide an IO completion
* callback to free an ioend that we don't actually need for a fault into the
* page at offset (2^63 - 1FSB) bytes.
*/
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
xfs_off_t offset,
bool dax_fault)
xfs_off_t offset)
{
struct xfs_ioend *ioend;
uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
xfs_off_t size = bh_result->b_size;
int type;
if (ISUNWRITTEN(imap))
type = XFS_IO_UNWRITTEN;
else
type = XFS_IO_OVERWRITE;
trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
if (dax_fault) {
ASSERT(type == XFS_IO_OVERWRITE);
trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
imap);
return;
}
if (bh_result->b_private) {
ioend = bh_result->b_private;
ASSERT(ioend->io_size > 0);
ASSERT(offset >= ioend->io_offset);
if (offset + size > ioend->io_offset + ioend->io_size)
ioend->io_size = offset - ioend->io_offset + size;
if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
ioend->io_type = XFS_IO_UNWRITTEN;
trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
ioend->io_size, ioend->io_type,
imap);
} else if (type == XFS_IO_UNWRITTEN ||
offset + size > i_size_read(inode) ||
offset + size < 0) {
ioend = xfs_alloc_ioend(inode, type);
ioend->io_offset = offset;
ioend->io_size = size;
trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
bh_result->b_private = ioend;
if (ISUNWRITTEN(imap)) {
*flags |= XFS_DIO_FLAG_UNWRITTEN;
set_buffer_defer_completion(bh_result);
} else if (offset + size > i_size_read(inode) || offset + size < 0) {
*flags |= XFS_DIO_FLAG_APPEND;
set_buffer_defer_completion(bh_result);
trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
imap);
} else {
trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
imap);
}
}
......@@ -1498,9 +1441,12 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
if (create && direct)
xfs_map_direct(inode, bh_result, &imap, offset,
dax_fault);
if (create && direct) {
if (dax_fault)
ASSERT(!ISUNWRITTEN(&imap));
else
xfs_map_direct(inode, bh_result, &imap, offset);
}
}
/*
......@@ -1570,42 +1516,50 @@ xfs_get_blocks_dax_fault(
return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}
static void
__xfs_end_io_direct_write(
struct inode *inode,
struct xfs_ioend *ioend,
/*
* Complete a direct I/O write request.
*
* xfs_map_direct passes us some flags in the private data to tell us what to
* do. If no flags are set, then the write IO is an overwrite wholly within
* the existing allocated file size and so there is nothing for us to do.
*
* Note that in this case the completion can be called in interrupt context,
* whereas if we have flags set we will always be called in task context
* (i.e. from a workqueue).
*/
STATIC int
xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size)
ssize_t size,
void *private)
{
struct xfs_mount *mp = XFS_I(inode)->i_mount;
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
uintptr_t flags = (uintptr_t)private;
int error = 0;
if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
goto out_end_io;
trace_xfs_end_io_direct_write(ip, offset, size);
/*
* dio completion end_io functions are only called on writes if more
* than 0 bytes was written.
*/
ASSERT(size > 0);
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
/*
* The ioend only maps whole blocks, while the IO may be sector aligned.
* Hence the ioend offset/size may not match the IO offset/size exactly.
* Because we don't map overwrites within EOF into the ioend, the offset
* may not match, but only if the endio spans EOF. Either way, write
* the IO sizes into the ioend so that completion processing does the
* right thing.
*/
ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
ioend->io_size = size;
ioend->io_offset = offset;
if (size <= 0)
return size;
/*
* The ioend tells us whether we are doing unwritten extent conversion
* The flags tell us whether we are doing unwritten extent conversions
* or an append transaction that updates the on-disk file size. These
* cases are the only cases where we should *potentially* be needing
* to update the VFS inode size.
*
*/
if (flags == 0) {
ASSERT(offset + size <= i_size_read(inode));
return 0;
}
/*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
......@@ -1616,91 +1570,56 @@ __xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
spin_lock(&XFS_I(inode)->i_flags_lock);
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
spin_unlock(&XFS_I(inode)->i_flags_lock);
spin_unlock(&ip->i_flags_lock);
/*
* If we are doing an append IO that needs to update the EOF on disk,
* do the transaction reserve now so we can use common end io
* processing. Stashing the error (if there is one) in the ioend will
* result in the ioend processing passing on the error if it is
* possible as we can't return it from here.
*/
if (ioend->io_type == XFS_IO_OVERWRITE)
ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
if (flags & XFS_DIO_FLAG_UNWRITTEN) {
trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
out_end_io:
xfs_end_io(&ioend->io_work);
return;
}
error = xfs_iomap_write_unwritten(ip, offset, size);
} else if (flags & XFS_DIO_FLAG_APPEND) {
struct xfs_trans *tp;
/*
* Complete a direct I/O write request.
*
* The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
* If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
* wholly within the EOF and so there is nothing for us to do. Note that in this
* case the completion can be called in interrupt context, whereas if we have an
* ioend we will always be called in task context (i.e. from a workqueue).
*/
STATIC void
xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_ioend *ioend = private;
trace_xfs_end_io_direct_write_append(ip, offset, size);
trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
ioend ? ioend->io_type : 0, NULL);
if (!ioend) {
ASSERT(offset + size <= i_size_read(inode));
return;
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
xfs_trans_cancel(tp);
return error;
}
error = xfs_setfilesize(ip, tp, offset, size);
}
__xfs_end_io_direct_write(inode, ioend, offset, size);
return error;
}
static inline ssize_t
xfs_vm_do_dio(
struct inode *inode,
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset,
void (*endio)(struct kiocb *iocb,
loff_t offset,
ssize_t size,
void *private),
int flags)
loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
dio_iodone_t *endio = NULL;
int flags = 0;
struct block_device *bdev;
if (IS_DAX(inode))
if (iov_iter_rw(iter) == WRITE) {
endio = xfs_end_io_direct_write;
flags = DIO_ASYNC_EXTEND;
}
if (IS_DAX(inode)) {
return dax_do_io(iocb, inode, iter, offset,
xfs_get_blocks_direct, endio, 0);
}
bdev = xfs_find_bdev_for_inode(inode);
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
xfs_get_blocks_direct, endio, NULL, flags);
}
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
if (iov_iter_rw(iter) == WRITE)
return xfs_vm_do_dio(inode, iocb, iter, offset,
xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
xfs_get_blocks_direct, endio, NULL, flags);
}
/*
......
......@@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
......@@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
......
......@@ -70,7 +70,7 @@ extern int sysctl_protected_hardlinks;
struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册