提交 776722e8 编写于 作者: J Jan Kara 提交者: Theodore Ts'o

ext4: DAX iomap write support

Implement DAX writes using the new iomap infrastructure instead of
overloading the direct IO path.
Signed-off-by: NJan Kara <jack@suse.cz>
Signed-off-by: NTheodore Ts'o <tytso@mit.edu>
上级 47e69351
......@@ -169,6 +169,41 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return iov_iter_count(from);
}
#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
bool overwrite = false;
inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
ret = file_remove_privs(iocb->ki_filp);
if (ret)
goto out;
ret = file_update_time(iocb->ki_filp);
if (ret)
goto out;
if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
overwrite = true;
downgrade_write(&inode->i_rwsem);
}
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
out:
if (!overwrite)
inode_unlock(inode);
else
inode_unlock_shared(inode);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
}
#endif
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
......@@ -178,6 +213,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
int overwrite = 0;
ssize_t ret;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
......
......@@ -3329,18 +3329,79 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct ext4_map_blocks map;
int ret;
if (flags & IOMAP_WRITE)
return -EIO;
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
return -ERANGE;
map.m_lblk = first_block;
map.m_len = last_block - first_block + 1;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
if (!(flags & IOMAP_WRITE)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
} else {
int dio_credits;
handle_t *handle;
int retries = 0;
/* Trim mapping request to maximum we can map at once for DIO */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
retry:
/*
* Either we allocate blocks and then we don't get unwritten
* extent so we have reserved enough credits, or the blocks
* are already allocated and unwritten and in that case
* extent conversion fits in the credits as well.
*/
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
dio_credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_CREATE_ZERO);
if (ret < 0) {
ext4_journal_stop(handle);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
return ret;
}
/* For DAX writes we need to zero out unwritten extents */
if (map.m_flags & EXT4_MAP_UNWRITTEN) {
/*
* We are protected by i_mmap_sem or i_rwsem so we know
* block cannot go away from under us even though we
* dropped i_data_sem. Convert extent to written and
* write zeros there.
*/
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_CONVERT |
EXT4_GET_BLOCKS_CREATE_ZERO);
if (ret < 0) {
ext4_journal_stop(handle);
return ret;
}
}
/*
* If we added blocks beyond i_size we need to make sure they
* will get truncated if we crash before updating i_size in
* ext4_iomap_end().
*/
if (first_block + map.m_len >
(inode->i_size + (1 << blkbits) - 1) >> blkbits) {
int err;
err = ext4_orphan_add(handle, inode);
if (err < 0) {
ext4_journal_stop(handle);
return err;
}
}
ext4_journal_stop(handle);
}
iomap->flags = 0;
iomap->bdev = inode->i_sb->s_bdev;
......@@ -3368,8 +3429,61 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return 0;
}
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
int ret = 0;
handle_t *handle;
int blkbits = inode->i_blkbits;
bool truncate = false;
if (!(flags & IOMAP_WRITE))
return 0;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto orphan_del;
}
if (ext4_update_inode_size(inode, offset + written))
ext4_mark_inode_dirty(handle, inode);
/*
* We may need to truncate allocated but not written blocks beyond EOF.
*/
if (iomap->offset + iomap->length >
ALIGN(inode->i_size, 1 << blkbits)) {
ext4_lblk_t written_blk, end_blk;
written_blk = (offset + written) >> blkbits;
end_blk = (offset + length) >> blkbits;
if (written_blk < end_blk && ext4_can_truncate(inode))
truncate = true;
}
/*
* Remove inode from orphan list if we were extending a inode and
* everything went fine.
*/
if (!truncate && inode->i_nlink &&
!list_empty(&EXT4_I(inode)->i_orphan))
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
if (truncate) {
ext4_truncate_failed_write(inode);
orphan_del:
/*
* If truncate failed early the inode might still be on the
* orphan list; we need to make sure the inode is removed from
* the orphan list in that case.
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
return ret;
}
struct iomap_ops ext4_iomap_ops = {
.iomap_begin = ext4_iomap_begin,
.iomap_end = ext4_iomap_end,
};
#else
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册