diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 19dcd0b86bca1b5bffd18836c4d25c3e3e84e342..7f7ee5b904026e1e68387636426e2a192563001e 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -478,79 +478,28 @@ int jbd2_log_do_checkpoint(journal_t *journal) int jbd2_cleanup_journal_tail(journal_t *journal) { - transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned long blocknr; if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. - * - * If the log is now empty, we need to work out which is the - * next transaction ID we will write, and where it will - * start. */ - - write_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - transaction = journal->j_checkpoint_transactions; - if (transaction) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_committing_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_running_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = journal->j_head; - } else { - first_tid = journal->j_transaction_sequence; - blocknr = journal->j_head; - } - spin_unlock(&journal->j_list_lock); - J_ASSERT(blocknr != 0); - - /* If the oldest pinned transaction is at the tail of the log - already then there's not much we can do right now. */ - if (journal->j_tail_sequence == first_tid) { - write_unlock(&journal->j_state_lock); + if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; - } - - /* OK, update the superblock to recover the freed space. - * Physical blocks come first: have we wrapped beyond the end of - * the log? */ - freed = blocknr - journal->j_tail; - if (blocknr < journal->j_tail) - freed = freed + journal->j_last - journal->j_first; - - trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); - jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", - journal->j_tail_sequence, first_tid, blocknr, freed); - - journal->j_free += freed; - journal->j_tail_sequence = first_tid; - journal->j_tail = blocknr; - write_unlock(&journal->j_state_lock); + J_ASSERT(blocknr != 0); /* - * If there is an external journal, we need to make sure that - * any data blocks that were recently written out --- perhaps - * by jbd2_log_do_checkpoint() --- are flushed out before we - * drop the transactions from the external journal. It's - * unlikely this will be necessary, especially with a - * appropriately sized journal, but we need this to guarantee - * correctness. Fortunately jbd2_cleanup_journal_tail() - * doesn't get called all that often. + * We need to make sure that any blocks that were recently written out + * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before + * we drop the transactions from the journal. It's unlikely this will + * be necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * jbd2_cleanup_journal_tail() doesn't get called all that often. */ - if ((journal->j_fs_dev != journal->j_dev) && - (journal->j_flags & JBD2_BARRIER)) + if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); - if (!(journal->j_flags & JBD2_ABORT)) - jbd2_journal_update_sb_log_tail(journal); + + __jbd2_update_log_tail(journal, first_tid, blocknr); return 0; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6705717d9b7ffa2cb5c221e9b51f16f77776ba95..b89ef84786a75e14c6c4147bfae10b99b8d07beb 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -341,7 +341,16 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); mutex_lock(&journal->j_checkpoint_mutex); - jbd2_journal_update_sb_log_tail(journal); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fc5f2acc9f18c649ca98302b9257302b91309bcf..c5ff177400ff52e07d802cccf7b7ce870ccb3d3a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -742,6 +742,85 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return jbd2_journal_add_journal_head(bh); } +/* + * Return tid of the oldest transaction in the journal and block in the journal + * where the transaction starts. + * + * If the journal is now empty, return which will be the next transaction ID + * we will write and where will that transaction start. + * + * The return value is 0 if journal tail cannot be pushed any further, 1 if + * it can. + */ +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block) +{ + transaction_t *transaction; + int ret; + + read_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + *tid = transaction->t_tid; + *block = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + *tid = transaction->t_tid; + *block = journal->j_head; + } else { + *tid = journal->j_transaction_sequence; + *block = journal->j_head; + } + ret = tid_gt(*tid, journal->j_tail_sequence); + spin_unlock(&journal->j_list_lock); + read_unlock(&journal->j_state_lock); + + return ret; +} + +/* + * Update information in journal structure and in on disk journal superblock + * about log tail. This function does not check whether information passed in + * really pushes log tail further. It's responsibility of the caller to make + * sure provided log tail information is valid (e.g. by holding + * j_checkpoint_mutex all the time between computing log tail and calling this + * function as is the case with jbd2_cleanup_journal_tail()). + * + * Requires j_checkpoint_mutex + */ +void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ + unsigned long freed; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + + /* + * We cannot afford for write to remain in drive's caches since as + * soon as we update j_tail, next transaction can start reusing journal + * space and if we lose sb update during power failure we'd replay + * old transaction with possibly newly overwritten data. + */ + jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + write_lock(&journal->j_state_lock); + freed = block - journal->j_tail; + if (block < journal->j_tail) + freed += journal->j_last - journal->j_first; + + trace_jbd2_update_log_tail(journal, tid, block, freed); + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, tid, block, freed); + + journal->j_free += freed; + journal->j_tail_sequence = tid; + journal->j_tail = block; + write_unlock(&journal->j_state_lock); +} + struct jbd2_stats_proc_session { journal_t *journal; struct transaction_stats_s *stats; @@ -1125,18 +1204,30 @@ static int journal_reset(journal_t *journal) } else { /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - /* Add the dynamic fields and write it to disk. */ - jbd2_journal_update_sb_log_tail(journal); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + jbd2_journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); } -static void jbd2_write_superblock(journal_t *journal) +static void jbd2_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; + int ret; - trace_jbd2_write_superblock(journal); + trace_jbd2_write_superblock(journal, write_op); + if (!(journal->j_flags & JBD2_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1152,40 +1243,45 @@ static void jbd2_write_superblock(journal_t *journal) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } - - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); if (buffer_write_io_error(bh)) { - printk(KERN_ERR "JBD2: I/O error detected " - "when updating journal superblock for %s.\n", - journal->j_devname); clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + printk(KERN_ERR "JBD2: Error %d detected when updating " + "journal superblock for %s.\n", ret, + journal->j_devname); } } /** * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb * * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. */ -void jbd2_journal_update_sb_log_tail(journal_t *journal) +void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned long tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - read_lock(&journal->j_state_lock); - jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d)\n", - journal->j_tail, journal->j_tail_sequence); + jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + tail_block, tail_tid); - sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - read_unlock(&journal->j_state_lock); + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); - jbd2_write_superblock(journal); + jbd2_write_superblock(journal, write_op); /* Log is no longer empty */ write_lock(&journal->j_state_lock); @@ -1214,7 +1310,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal); + jbd2_write_superblock(journal, WRITE_FUA); /* Log is no longer empty */ write_lock(&journal->j_state_lock); @@ -1240,7 +1336,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal); + jbd2_write_superblock(journal, WRITE_SYNC); } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index da6d7baf1390c401cc52b1cb005cde1ff7776483..c1a03354a22ff1b5a787251b422afcb5225ca2c9 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -21,6 +21,7 @@ #include #include #include +#include #endif /* @@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; - + /* Make sure all replayed data is on permanent storage */ + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); return err; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5f05c77438e5642364b27f359040863ca2363544..876a7d87192bc7807c0df3cb913df7608999a700 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -971,6 +971,9 @@ extern void __journal_clean_data_list(transaction_t *transaction); /* Log buffer allocation */ extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, + unsigned long *block); +void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ extern void jbd2_journal_commit_transaction(journal_t *); @@ -1087,7 +1090,8 @@ extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); -extern void jbd2_journal_update_sb_log_tail (journal_t *); +extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, + unsigned long, int); extern void __jbd2_journal_abort_hard (journal_t *); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index e05a362bf3f14f7fb3d9bde4ebd9f929122dd27a..127993dbf322efa40bc68222551ff181132122b4 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -207,7 +207,7 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->forced_to_close, __entry->written, __entry->dropped) ); -TRACE_EVENT(jbd2_cleanup_journal_tail, +TRACE_EVENT(jbd2_update_log_tail, TP_PROTO(journal_t *journal, tid_t first_tid, unsigned long block_nr, unsigned long freed), @@ -238,19 +238,22 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, TRACE_EVENT(jbd2_write_superblock, - TP_PROTO(journal_t *journal), + TP_PROTO(journal_t *journal, int write_op), - TP_ARGS(journal), + TP_ARGS(journal, write_op), TP_STRUCT__entry( __field( dev_t, dev ) + __field( int, write_op ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; + __entry->write_op = write_op; ), - TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) + TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->write_op) ); #endif /* _TRACE_JBD2_H */