/*
 *  linux/fs/ext4/fsync.c
 *
 *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
 *  from
 *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
 *                      Laboratoire MASI - Institut Blaise Pascal
 *                      Universite Pierre et Marie Curie (Paris VI)
 *  from
 *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4fs fsync primitive
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *
 *  Removed unnecessary code duplication for little endian machines
 *  and excessive __inline__s.
 *        Andi Kleen, 1997
 *
 * Major simplifications and cleanup - we only need to do the metadata, because
 * we can depend on generic_block_fdatasync() to sync the data blocks.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
29
#include <linux/jbd2.h>
30
#include <linux/blkdev.h>
31

32 33
#include "ext4.h"
#include "ext4_jbd2.h"
34

35 36
#include <trace/events/ext4.h>

37 38
/*
 * Debug helper: when EXT4FS_DEBUG is defined, log every entry on the
 * inode's i_completed_io_list together with the entries its list links
 * point at.  Without EXT4FS_DEBUG the body is empty.
 */
static void dump_completed_IO(struct inode * inode)
{
#ifdef	EXT4FS_DEBUG
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_io_end_t *entry, *prev_entry, *next_entry;
	unsigned long irq_flags;

	if (list_empty(&ei->i_completed_io_list)) {
		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
		return;
	}

	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);

	/* Hold the list lock for the whole walk so entries cannot move. */
	spin_lock_irqsave(&ei->i_completed_io_lock, irq_flags);
	list_for_each_entry(entry, &ei->i_completed_io_list, list) {
		/*
		 * The neighbours are computed straight from the raw links;
		 * they are only printed, never dereferenced.
		 */
		prev_entry = container_of(entry->list.prev, ext4_io_end_t, list);
		next_entry = container_of(entry->list.next, ext4_io_end_t, list);

		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
			    entry, inode->i_ino, prev_entry, next_entry);
	}
	spin_unlock_irqrestore(&ei->i_completed_io_lock, irq_flags);
#endif
}

/*
 * This function is called from ext4_sync_file().
 *
 * When IO is completed, the work to convert unwritten extents to
 * written is queued on workqueue but may not get immediately
 * scheduled. When fsync is called, we need to ensure the
 * conversion is complete before fsync returns.
 * The inode keeps track of a list of pending/completed IO that
 * might needs to do the conversion. This function walks through
 * the list and convert the related unwritten extents for completed IO
 * to written.
 * The function return the number of pending IOs on success.
 */
78
extern int ext4_flush_completed_IO(struct inode *inode)
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
{
	ext4_io_end_t *io;
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned long flags;
	int ret = 0;
	int ret2 = 0;

	if (list_empty(&ei->i_completed_io_list))
		return ret;

	dump_completed_IO(inode);
	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
	while (!list_empty(&ei->i_completed_io_list)){
		io = list_entry(ei->i_completed_io_list.next,
				ext4_io_end_t, list);
		/*
		 * Calling ext4_end_io_nolock() to convert completed
		 * IO to written.
		 *
		 * When ext4_sync_file() is called, run_queue() may already
		 * about to flush the work corresponding to this io structure.
		 * It will be upset if it founds the io structure related
		 * to the work-to-be schedule is freed.
		 *
		 * Thus we need to keep the io structure still valid here after
L
Lucas De Marchi 已提交
104
		 * conversion finished. The io structure has a flag to
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
		 * avoid double converting from both fsync and background work
		 * queue work.
		 */
		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
		ret = ext4_end_io_nolock(io);
		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
		if (ret < 0)
			ret2 = ret;
		else
			list_del_init(&io->list);
	}
	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
	return (ret2 < 0) ? ret2 : 0;
}

120 121 122 123 124 125 126 127
/*
 * If we're not journaling and this is a just-created file, we have to
 * sync our parent directory (if it was freshly created) since
 * otherwise it will only be written by writeback, leaving a huge
 * window during which a crash may lose the file.  This may apply for
 * the parent directory's parent as well, and so on recursively, if
 * they are also freshly created.
 */
128
static int ext4_sync_parent(struct inode *inode)
129
{
130
	struct writeback_control wbc;
131
	struct dentry *dentry = NULL;
132
	int ret = 0;
133 134 135 136 137 138 139 140

	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
		dentry = list_entry(inode->i_dentry.next,
				    struct dentry, d_alias);
		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
			break;
		inode = dentry->d_parent->d_inode;
141 142 143 144 145 146 147 148 149
		ret = sync_mapping_buffers(inode->i_mapping);
		if (ret)
			break;
		memset(&wbc, 0, sizeof(wbc));
		wbc.sync_mode = WB_SYNC_ALL;
		wbc.nr_to_write = 0;         /* only write out the inode */
		ret = sync_inode(inode, &wbc);
		if (ret)
			break;
150
	}
151
	return ret;
152 153
}

154
/*
155
 * akpm: A new design for ext4_sync_file().
156 157 158 159 160 161 162 163
 *
 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
 * There cannot be a transaction open by this task.
 * Another task could have dirtied this inode.  Its data can be in any
 * state in the journalling system.
 *
 * What we do is just kick off a commit and wait on it.  This will snapshot the
 * inode to disk.
164 165
 *
 * i_mutex lock is held when entering and exiting this function
166 167
 */

168
int ext4_sync_file(struct file *file, int datasync)
169
{
170
	struct inode *inode = file->f_mapping->host;
171
	struct ext4_inode_info *ei = EXT4_I(inode);
172
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
173 174
	int ret;
	tid_t commit_tid;
175
	bool needs_barrier = false;
176

A
Aneesh Kumar K.V 已提交
177
	J_ASSERT(ext4_journal_current_handle() == NULL);
178

179
	trace_ext4_sync_file_enter(file, datasync);
180

181 182 183
	if (inode->i_sb->s_flags & MS_RDONLY)
		return 0;

184
	ret = ext4_flush_completed_IO(inode);
185
	if (ret < 0)
186
		goto out;
187

188
	if (!journal) {
189
		ret = generic_file_fsync(file, datasync);
190
		if (!ret && !list_empty(&inode->i_dentry))
191
			ret = ext4_sync_parent(inode);
192
		goto out;
193
	}
194

195
	/*
196
	 * data=writeback,ordered:
197
	 *  The caller's filemap_fdatawrite()/wait will sync the data.
198 199
	 *  Metadata is in the journal, we wait for proper transaction to
	 *  commit here.
200 201 202
	 *
	 * data=journal:
	 *  filemap_fdatawrite won't do anything (the buffers are clean).
203
	 *  ext4_force_commit will write the file data into the journal and
204 205 206 207 208
	 *  will wait on that.
	 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
	 *  (they were dirtied by commit).  But that's OK - the blocks are
	 *  safe in-journal, which is all fsync() needs to ensure.
	 */
209 210 211 212
	if (ext4_should_journal_data(inode)) {
		ret = ext4_force_commit(inode->i_sb);
		goto out;
	}
213

214
	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
215 216 217 218 219 220
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;
	jbd2_log_start_commit(journal, commit_tid);
	ret = jbd2_log_wait_commit(journal, commit_tid);
	if (needs_barrier)
221
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
222 223
 out:
	trace_ext4_sync_file_exit(inode, ret);
224 225
	return ret;
}