diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ccb4dbf359c4e76ba874baa42228b525e8df7371..b491576e11c33ca3bdf6e5fcda595fcb22ced87b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -127,10 +127,11 @@ struct mpage_da_data { int pages_written; int retval; }; - +#define DIO_AIO_UNWRITTEN 0x1 typedef struct ext4_io_end { + struct list_head list; /* per-file finished AIO list */ struct inode *inode; /* file being written to */ - unsigned int flag; /* sync IO or AIO */ + unsigned int flag; /* unwritten or not */ int error; /* I/O error code */ ext4_lblk_t offset; /* offset in the file */ size_t size; /* size of the extent */ @@ -690,6 +691,11 @@ struct ext4_inode_info { __u16 i_extra_isize; spinlock_t i_block_reservation_lock; + + /* completed async DIOs that might need unwritten extents handling */ + struct list_head i_aio_dio_complete_list; + /* current io_end structure for async DIO write*/ + ext4_io_end_t *cur_aio_dio; }; /* @@ -1419,7 +1425,7 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t ext4_get_reserved_space(struct inode *inode); - +extern int flush_aio_dio_completed_IO(struct inode *inode); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a38e651c004e3d83e796604725d1fe7f4d174147..10a63096a95ae8c653616dce35bc4fd8620d0e8e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3033,6 +3033,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, { int ret = 0; int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" "block %llu, max_blocks %u, flags %d, allocated %u", @@ -3045,6 +3046,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); + /* flag the io_end struct that we need convert when IO done */ + if (io) + io->flag = DIO_AIO_UNWRITTEN; goto out; } /* DIO end_io complete, convert the filled extent to written */ @@ -3130,6 +3134,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%u requested for inode %lu\n", @@ -3279,8 +3284,20 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); + /* + * io_end structure was created for every async + * direct IO write to the middle of the file. + * To avoid unecessary convertion for every aio dio rewrite + * to the mid of file, here we flag the IO that is really + * need the convertion. + * + */ + if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) + io->flag = DIO_AIO_UNWRITTEN; + } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { /* free data blocks we just allocated */ diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 07475740b512535ec6d30aa3b36d6892b765e839..2b1531266ee25010776dcebe8921103da29d9c3a 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,6 +44,8 @@ * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. + * + * i_mutex lock is held when entering and exiting this function */ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) @@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) trace_ext4_sync_file(file, dentry, datasync); + ret = flush_aio_dio_completed_IO(inode); + if (ret < 0) + goto out; /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5633af6a70452c87fc46327f4907050d648b0269..118e16ca91d76cf39bc967398c722e8b0e45444c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3451,6 +3451,8 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + inode->i_ino, create); /* * DIO VFS code passes create = 0 flag for write to * the middle of file. It does this to avoid block @@ -3491,55 +3493,152 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, return ret; } -#define DIO_AIO 0x1 - static void ext4_free_io_end(ext4_io_end_t *io) { + BUG_ON(!io); + iput(io->inode); kfree(io); } +static void dump_aio_dio_list(struct inode * inode) +{ +#ifdef EXT4_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} /* - * IO write completion for unwritten extents. - * * check a range of space and convert unwritten extents to written. */ -static void ext4_end_dio_unwritten(struct work_struct *work) +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); struct inode *inode = io->inode; loff_t offset = io->offset; size_t size = io->size; int ret = 0; - int aio = io->flag & DIO_AIO; - if (aio) - mutex_lock(&inode->i_mutex); + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + if (list_empty(&io->list)) + return ret; + + if (io->flag != DIO_AIO_UNWRITTEN) + return ret; + if (offset + size <= i_size_read(inode)) ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) + if (ret < 0) { printk(KERN_EMERG "%s: failed to convert unwritten" - "extents to written extents, error is %d\n", - __func__, ret); + "extents to written extents, error is %d" + " io is still on inode %lu aio dio list\n", + __func__, ret, inode->i_ino); + return ret; + } + + /* clear the DIO AIO unwritten flag */ + io->flag = 0; + return ret; +} +/* + * work on completed aio dio IO, to convert unwritten extents to extents + */ +static void ext4_end_aio_dio_work(struct work_struct *work) +{ + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + int ret = 0; + + mutex_lock(&inode->i_mutex); + ret = ext4_end_aio_dio_nolock(io); + if (ret >= 0) { + if (!list_empty(&io->list)) + list_del_init(&io->list); + ext4_free_io_end(io); + } + mutex_unlock(&inode->i_mutex); +} +/* + * This function is called from ext4_sync_file(). + * + * When AIO DIO IO is completed, the work to convert unwritten + * extents to written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of completed AIO from DIO path + * that might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents to written. + */ +int flush_aio_dio_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + int ret = 0; + int ret2 = 0; - ext4_free_io_end(io); - if (aio) - mutex_unlock(&inode->i_mutex); + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + return ret; + + dump_aio_dio_list(inode); + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + ext4_io_end_t, list); + /* + * Calling ext4_end_aio_dio_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already + * about to flush the work corresponding to this io structure. + * It will be upset if it founds the io structure related + * to the work-to-be schedule is freed. + * + * Thus we need to keep the io structure still valid here after + * convertion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. + */ + ret = ext4_end_aio_dio_nolock(io); + if (ret < 0) + ret2 = ret; + else + list_del_init(&io->list); + } + return (ret2 < 0) ? ret2 : 0; } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) +static ext4_io_end_t *ext4_init_io_end (struct inode *inode) { ext4_io_end_t *io = NULL; io = kmalloc(sizeof(*io), GFP_NOFS); if (io) { + igrab(inode); io->inode = inode; - io->flag = flag; + io->flag = 0; io->offset = 0; io->size = 0; io->error = 0; - INIT_WORK(&io->work, ext4_end_dio_unwritten); + INIT_WORK(&io->work, ext4_end_aio_dio_work); + INIT_LIST_HEAD(&io->list); } return io; @@ -3551,19 +3650,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; - /* if not hole or unwritten extents, just simple return */ - if (!io_end || !size || !iocb->private) + ext_debug("ext4_end_io_dio(): io_end 0x%p" + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) return; + + /* if not aio dio with unwritten extents, just free io and return */ + if (io_end->flag != DIO_AIO_UNWRITTEN){ + ext4_free_io_end(io_end); + iocb->private = NULL; + return; + } + io_end->offset = offset; io_end->size = size; wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - /* We need to convert unwritten extents to written */ + /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); - if (is_sync_kiocb(iocb)) - flush_workqueue(wq); - + /* Add the io_end to per-inode completed aio dio list*/ + list_add_tail(&io_end->list, + &EXT4_I(io_end->inode)->i_aio_dio_complete_list); iocb->private = NULL; } /* @@ -3575,8 +3686,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * If those blocks were preallocated, we mark sure they are splited, but * still keep the range to write as unintialized. * - * When end_io call back function called at the last IO complete time, - * those extents will be converted to written extents. + * The unwrritten extents will be converted to written when DIO is completed. + * For async direct IO, since the IO may still pending when return, we + * set up an end_io call back function, which will do the convertion + * when async direct IO completed. * * If the O_DIRECT write will extend the file then add this inode to the * orphan list. So recovery will truncate it back to the original size @@ -3595,28 +3708,76 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, loff_t final_size = offset + count; if (rw == WRITE && final_size <= inode->i_size) { /* - * For DIO we fallocate blocks for holes, we fallocate blocks - * The fallocated extent for hole is marked as uninitialized + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as uninitialized * to prevent paralel buffered read to expose the stale data * before DIO complete the data IO. - * as for previously fallocated extents, ext4 get_block + * + * As to previously fallocated extents, ext4 get_block * will just simply mark the buffer mapped but still * keep the extents uninitialized. * - * At the end of IO, the ext4 end_io callback function - * will convert those unwritten extents to written, - * + * for non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * for async DIO, the conversion needs to be defered when + * the IO is completed. The ext4 end_io callback function + * will be called to take care of the conversion work. + * Here for async case, we allocate an io_end structure to + * hook to the iocb. */ - iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); - if (!iocb->private) - return -ENOMEM; + iocb->private = NULL; + EXT4_I(inode)->cur_aio_dio = NULL; + if (!is_sync_kiocb(iocb)) { + iocb->private = ext4_init_io_end(inode); + if (!iocb->private) + return -ENOMEM; + /* + * we save the io structure for current async + * direct IO, so that later ext4_get_blocks() + * could flag the io structure whether there + * is a unwritten extents needs to be converted + * when IO is completed. + */ + EXT4_I(inode)->cur_aio_dio = iocb->private; + } + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block_dio_write, ext4_end_io_dio); + if (iocb->private) + EXT4_I(inode)->cur_aio_dio = NULL; + /* + * The io_end structure takes a reference to the inode, + * that structure needs to be destroyed and the + * reference to the inode need to be dropped, when IO is + * complete, even with 0 byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will be + * desctroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since + * VFS direct IO won't invoke the end_io call back function, + * we need to free the end_io structure here. + */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0) + /* + * for non AIO case, since the IO is already + * completed, we could do the convertion right here + */ + ret = ext4_convert_unwritten_extents(inode, + offset, ret); return ret; } + + /* for write the the end of file case, we fall back to old way */ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1a03ea98fdd12df804ff2d9d8d77f7299983a94a..f095c60b569efc66394f276d382be5e9a5cd51a9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + ei->cur_aio_dio = NULL; return &ei->vfs_inode; } @@ -3375,11 +3377,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); + jbd2_log_wait_commit(sbi->s_journal, target); } return ret; }