提交 470decc6 编写于 作者: D Dave Kleikamp 提交者: Linus Torvalds

[PATCH] jbd2: initial copy of files from jbd

This is a simple copy of the files in fs/jbd to fs/jbd2 and
/usr/incude/linux/[ext4_]jbd.h to /usr/include/[ext4_]jbd2.h
Signed-off-by: NDave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: NAndrew Morton <akpm@osdl.org>
Signed-off-by: NLinus Torvalds <torvalds@osdl.org>
上级 02ea2104
#
# Makefile for the linux journaling routines.
#
obj-$(CONFIG_JBD) += jbd.o
jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
/*
* linux/fs/checkpoint.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
* Checkpointing is the process of ensuring that a section of the log is
* committed fully to disk, so that that portion of the log can be
* reused.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
/*
* Unlink a buffer from a transaction checkpoint list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink_first(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
jh->b_cpnext->b_cpprev = jh->b_cpprev;
jh->b_cpprev->b_cpnext = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh) {
transaction->t_checkpoint_list = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh)
transaction->t_checkpoint_list = NULL;
}
}
/*
* Unlink a buffer from a transaction checkpoint(io) list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (transaction->t_checkpoint_io_list == jh) {
transaction->t_checkpoint_io_list = jh->b_cpnext;
if (transaction->t_checkpoint_io_list == jh)
transaction->t_checkpoint_io_list = NULL;
}
}
/*
* Move a buffer from the checkpoint list to the checkpoint io list
*
* Called with j_list_lock held
*/
static inline void __buffer_relink_io(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (!transaction->t_checkpoint_io_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_io_list;
jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_io_list = jh;
}
/*
* Try to release a checkpointed buffer from its transaction.
* Returns 1 if we released it and 2 if we also released the
* whole transaction.
*
* Requires j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __journal_remove_checkpoint(jh) + 1;
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
} else {
jbd_unlock_bh_state(bh);
}
return ret;
}
/*
* __log_wait_for_space: wait until there is space in the journal.
*
* Called under j-state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log.
*/
void __log_wait_for_space(journal_t *journal)
{
int nblocks;
assert_spin_locked(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
while (__log_space_left(journal) < nblocks) {
if (journal->j_flags & JFS_ABORT)
return;
spin_unlock(&journal->j_state_lock);
mutex_lock(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
* were waiting for the checkpoint lock
*/
spin_lock(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
if (__log_space_left(journal) < nblocks) {
spin_unlock(&journal->j_state_lock);
log_do_checkpoint(journal);
spin_lock(&journal->j_state_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
}
/*
* We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
* The caller must restart a list walk. Wait for someone else to run
* jbd_unlock_bh_state().
*/
static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
__releases(journal->j_list_lock)
{
get_bh(bh);
spin_unlock(&journal->j_list_lock);
jbd_lock_bh_state(bh);
jbd_unlock_bh_state(bh);
put_bh(bh);
}
/*
* Clean up transaction's list of buffers submitted for io.
* We wait for any pending IO to complete and remove any clean
* buffers. Note that we take the buffers in the opposite ordering
* from the one in which they were submitted for IO.
*
* Called with j_list_lock held.
*/
static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
struct journal_head *jh;
struct buffer_head *bh;
tid_t this_tid;
int released = 0;
this_tid = transaction->t_tid;
restart:
/* Did somebody clean up the transaction in the meanwhile? */
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
return;
while (!released && transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
if (buffer_locked(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now in whatever state the buffer currently is, we know that
* it has been written out and so we can drop it from the list
*/
released = __journal_remove_checkpoint(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
__brelse(bh);
}
}
#define NR_BATCH 64
static void
__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
int i;
ll_rw_block(SWRITE, *batch_count, bhs);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
clear_buffer_jwrite(bh);
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
*batch_count = 0;
}
/*
* Try to flush one buffer from the checkpoint list to disk.
*
* Return 1 if something happened which requires us to abort the current
* scan of the checkpoint list.
*
* Called with j_list_lock held and drops it if 1 is returned
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __process_buffer(journal_t *journal, struct journal_head *jh,
struct buffer_head **bhs, int *batch_count)
{
struct buffer_head *bh = jh2bh(jh);
int ret = 0;
if (buffer_locked(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
ret = 1;
} else if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
log_start_commit(journal, tid);
log_wait_commit(journal, tid);
ret = 1;
} else if (!buffer_dirty(bh)) {
J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
__brelse(bh);
ret = 1;
} else {
/*
* Important: we are about to write the buffer, and
* possibly block, while still holding the journal lock.
* We cannot afford to let the transaction logic start
* messing around with this buffer before we write it to
* disk, as that would break recoverability.
*/
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
set_buffer_jwrite(bh);
bhs[*batch_count] = bh;
__buffer_relink_io(jh);
jbd_unlock_bh_state(bh);
(*batch_count)++;
if (*batch_count == NR_BATCH) {
spin_unlock(&journal->j_list_lock);
__flush_batch(journal, bhs, batch_count);
ret = 1;
}
}
return ret;
}
/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
*
* The journal should be locked before calling this function.
*/
int log_do_checkpoint(journal_t *journal)
{
transaction_t *transaction;
tid_t this_tid;
int result;
jbd_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
* don't need checkpointing, just eliminate them from the
* journal straight away.
*/
result = cleanup_journal_tail(journal);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
/*
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
transaction = journal->j_checkpoint_transactions;
this_tid = transaction->t_tid;
restart:
/*
* If someone cleaned up this transaction while we slept, we're
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
if (journal->j_checkpoint_transactions == transaction &&
transaction->t_tid == this_tid) {
int batch_count = 0;
struct buffer_head *bhs[NR_BATCH];
struct journal_head *jh;
int retry = 0;
while (!retry && transaction->t_checkpoint_list) {
struct buffer_head *bh;
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
retry = 1;
break;
}
retry = __process_buffer(journal, jh, bhs,&batch_count);
if (!retry && lock_need_resched(&journal->j_list_lock)){
spin_unlock(&journal->j_list_lock);
retry = 1;
break;
}
}
if (batch_count) {
if (!retry) {
spin_unlock(&journal->j_list_lock);
retry = 1;
}
__flush_batch(journal, bhs, &batch_count);
}
if (retry) {
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now we have cleaned up the first transaction's checkpoint
* list. Let's clean up the second one
*/
__wait_cp_io(journal, transaction);
}
out:
spin_unlock(&journal->j_list_lock);
result = cleanup_journal_tail(journal);
if (result < 0)
return result;
return 0;
}
/*
* Check the list of checkpoint transactions for the journal to see if
* we have already got rid of any since the last update of the log tail
* in the journal superblock. If so, we can instantly roll the
* superblock forward to remove those transactions from the log.
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
* Called with the journal lock held.
*
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
* even in abort state, but we must not update the journal superblock if
* we have an abort error outstanding.
*/
int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
unsigned long blocknr, freed;
/* OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
* start. */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
if (transaction) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_committing_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_running_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = journal->j_head;
} else {
first_tid = journal->j_transaction_sequence;
blocknr = journal->j_head;
}
spin_unlock(&journal->j_list_lock);
J_ASSERT(blocknr != 0);
/* If the oldest pinned transaction is at the tail of the log
already then there's not much we can do right now. */
if (journal->j_tail_sequence == first_tid) {
spin_unlock(&journal->j_state_lock);
return 1;
}
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
freed = blocknr - journal->j_tail;
if (blocknr < journal->j_tail)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
"Cleaning journal tail from %d to %d (offset %lu), "
"freeing %lu\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock);
if (!(journal->j_flags & JFS_ABORT))
journal_update_superblock(journal, 1);
return 0;
}
/* Checkpoint list management */
/*
* journal_clean_one_cp_list
*
* Find all the written-back checkpoint buffers in the given list and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
* Returns number of bufers reaped (for debug)
*/
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret, freed = 0;
*released = 0;
if (!jh)
return 0;
last_jh = jh->b_cpprev;
do {
jh = next_jh;
next_jh = jh->b_cpnext;
/* Use trylock because of the ranking */
if (jbd_trylock_bh_state(jh2bh(jh))) {
ret = __try_to_free_cp_buf(jh);
if (ret) {
freed++;
if (ret == 2) {
*released = 1;
return freed;
}
}
}
/*
* This function only frees up some memory
* if possible so we dont have an obligation
* to finish processing. Bail out if preemption
* requested:
*/
if (need_resched())
return freed;
} while (jh != last_jh);
return freed;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
* Returns number of buffers reaped (for debug)
*/
int __journal_clean_checkpoint_list(journal_t *journal)
{
transaction_t *transaction, *last_transaction, *next_transaction;
int ret = 0;
int released;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
goto out;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_list, &released);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
* preemption requested:
*/
if (need_resched())
goto out;
if (released)
continue;
/*
* It is essential that we are as careful as in the case of
* t_checkpoint_list with removing the buffer from the list as
* we can possibly see not yet submitted buffers on io_list
*/
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_io_list, &released);
if (need_resched())
goto out;
} while (transaction != last_transaction);
out:
return ret;
}
/*
* journal_remove_checkpoint: called after a buffer has been committed
* to disk (either by being write-back flushed to disk, or being
* committed to the log).
*
* We cannot safely clean a transaction out of the log until all of the
* buffer updates committed in that transaction have safely been stored
* elsewhere on disk. To achieve this, all of the buffers in a
* transaction need to be maintained on the transaction's checkpoint
* lists until they have been rewritten, at which point this function is
* called to remove the buffer from the existing transaction's
* checkpoint lists.
*
* The function returns 1 if it frees the transaction, 0 otherwise.
*
* This function is called with the journal locked.
* This function is called with j_list_lock held.
* This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
int __journal_remove_checkpoint(struct journal_head *jh)
{
transaction_t *transaction;
journal_t *journal;
int ret = 0;
JBUFFER_TRACE(jh, "entry");
if ((transaction = jh->b_cp_transaction) == NULL) {
JBUFFER_TRACE(jh, "not on transaction");
goto out;
}
journal = transaction->t_journal;
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
if (transaction->t_checkpoint_list != NULL ||
transaction->t_checkpoint_io_list != NULL)
goto out;
JBUFFER_TRACE(jh, "transaction has no more buffers");
/*
* There is one special case to worry about: if we have just pulled the
* buffer off a committing transaction's forget list, then even if the
* checkpoint list is empty, the transaction obviously cannot be
* dropped!
*
* The locking here around j_committing_transaction is a bit sleazy.
* See the comment at the end of journal_commit_transaction().
*/
if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "belongs to committing transaction");
goto out;
}
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
__journal_drop_transaction(journal, transaction);
/* Just in case anybody was waiting for more transactions to be
checkpointed... */
wake_up(&journal->j_wait_logspace);
ret = 1;
out:
JBUFFER_TRACE(jh, "exit");
return ret;
}
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_insert_checkpoint(struct journal_head *jh,
transaction_t *transaction)
{
JBUFFER_TRACE(jh, "entry");
J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
jh->b_cp_transaction = transaction;
if (!transaction->t_checkpoint_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_list;
jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
}
/*
* We've finished with this transaction structure: adios...
*
* The transaction must have no links except for the checkpoint by this
* point.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions =
transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions = NULL;
}
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(transaction->t_updates == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
kfree(transaction);
}
此差异已折叠。
此差异已折叠。
/*
* linux/fs/recovery.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal recovery routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#endif
/*
* Maintain information about the progress of the recovery job, so that
* the different passes can carry information between them.
*/
struct recovery_info
{
tid_t start_transaction;
tid_t end_transaction;
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
};
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
/* Release readahead buffers after use */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
while (--n >= 0)
brelse (b[n]);
}
/*
* When reading from the journal, we are going through the block device
* layer directly and so there is no readahead being done for us. We
* need to implement any readahead ourselves if we want it to happen at
* all. Recovery is basically one long sequential read, so make sure we
* do the IO in reasonably large chunks.
*
* This is not so critical that we need to be enormously clever about
* the readahead size, though. 128K is a purely arbitrary, good-enough
* fixed value.
*/
#define MAXBUF 8
static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
unsigned long blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
if (max > journal->j_maxlen)
max = journal->j_maxlen;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
nbufs = 0;
for (next = start; next < max; next++) {
err = journal_bmap(journal, next, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
next);
goto failed;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
goto failed;
}
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
ll_rw_block(READ, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
} else
brelse(bh);
}
if (nbufs)
ll_rw_block(READ, nbufs, bufs);
err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
return err;
}
#endif /* __KERNEL__ */
/*
* Read a block from the journal
*/
static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
unsigned long blocknr;
struct buffer_head *bh;
*bhp = NULL;
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD: corrupted journal superblock\n");
return -EIO;
}
err = journal_bmap(journal, offset, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
offset);
return err;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return -ENOMEM;
if (!buffer_uptodate(bh)) {
/* If this is a brand new buffer, start readahead.
Otherwise, we assume we are already reading it. */
if (!buffer_req(bh))
do_readahead(journal, offset);
wait_on_buffer(bh);
}
if (!buffer_uptodate(bh)) {
printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
offset);
brelse(bh);
return -EIO;
}
*bhp = bh;
return 0;
}
/*
* Count the number of in-use tags in a journal descriptor block.
*/
static int count_tags(struct buffer_head *bh, int size)
{
char * tagp;
journal_block_tag_t * tag;
int nr = 0;
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
tag = (journal_block_tag_t *) tagp;
nr++;
tagp += sizeof(journal_block_tag_t);
if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
tagp += 16;
if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
break;
}
return nr;
}
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
if (var >= (journal)->j_last) \
var -= ((journal)->j_last - (journal)->j_first); \
} while (0)
/**
* journal_recover - recovers a on-disk journal
* @journal: the journal to recover
*
* The primary function for recovering the log contents when mounting a
* journaled device.
*
* Recovery is done in three passes. In the first pass, we look for the
* end of the log. In the second, we assemble the list of revoke
* blocks. In the third and final pass, we replay any un-revoked blocks
* in the log.
*/
int journal_recover(journal_t *journal)
{
int err;
journal_superblock_t * sb;
struct recovery_info info;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
* unmounted.
*/
if (!sb->s_start) {
jbd_debug(1, "No recovery required, last transaction %d\n",
be32_to_cpu(sb->s_sequence));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
return 0;
}
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
jbd_debug(0, "JBD: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
journal_clear_revoke(journal);
sync_blockdev(journal->j_fs_dev);
return err;
}
/**
* journal_skip_recovery - Start journal and wipe exiting records
* @journal: journal to startup
*
* Locate any valid recovery information from the journal and set up the
* journal structures in memory to ignore it (presumably because the
* caller has evidence that it is out of date).
* This function does'nt appear to be exorted..
*
* We perform one pass over the journal to allow us to tell the user how
* much recovery information is being erased, and to let us initialise
* the journal transaction sequence numbers to the next unused ID.
*/
int journal_skip_recovery(journal_t *journal)
{
int err;
journal_superblock_t * sb;
struct recovery_info info;
memset (&info, 0, sizeof(info));
sb = journal->j_superblock;
err = do_one_pass(journal, &info, PASS_SCAN);
if (err) {
printk(KERN_ERR "JBD: error %d scanning journal\n", err);
++journal->j_transaction_sequence;
} else {
#ifdef CONFIG_JBD_DEBUG
int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
#endif
jbd_debug(0,
"JBD: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
journal->j_transaction_sequence = ++info.end_transaction;
}
journal->j_tail = 0;
return err;
}
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
unsigned long next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
/* Precompute the maximum metadata descriptors in a descriptor block */
int MAX_BLOCKS_PER_DESC;
MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
/ sizeof(journal_block_tag_t));
/*
* First thing is to establish what we expect to find in the log
* (in terms of transaction IDs), and where (in terms of log
* block offsets): query the superblock.
*/
sb = journal->j_superblock;
next_commit_ID = be32_to_cpu(sb->s_sequence);
next_log_block = be32_to_cpu(sb->s_start);
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
jbd_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
* making sure that each transaction has a commit block in the
* expected place. Each complete transaction gets replayed back
* into the main filesystem.
*/
while (1) {
int flags;
char * tagp;
journal_block_tag_t * tag;
struct buffer_head * obh;
struct buffer_head * nbh;
cond_resched(); /* We're under lock_kernel() */
/* If we already know where to stop the log traversal,
* check right now that we haven't gone past the end of
* the log. */
if (pass != PASS_SCAN)
if (tid_geq(next_commit_ID, info->end_transaction))
break;
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
next_log_block++;
wrap(journal, next_log_block);
/* What kind of buffer is it?
*
* If it is a descriptor block, check that it has the
* expected sequence number. Otherwise, we're all done
* here. */
tmp = (journal_header_t *)bh->b_data;
if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
brelse(bh);
break;
}
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
if (sequence != next_commit_ID) {
brelse(bh);
break;
}
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
* to do with it? That depends on the pass... */
switch(blocktype) {
case JFS_DESCRIPTOR_BLOCK:
/* If it is a valid descriptor block, replay it
* in pass REPLAY; otherwise, just skip over the
* blocks it describes. */
if (pass != PASS_REPLAY) {
next_log_block +=
count_tags(bh, journal->j_blocksize);
wrap(journal, next_log_block);
brelse(bh);
continue;
}
/* A descriptor block: we can now write all of
* the data blocks. Yay, useful work is finally
* getting done here! */
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
/* Recover what we can, but
* report failure at the end. */
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
"block %ld in log\n",
err, io_block);
} else {
unsigned long blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
/* If the block has been
* revoked, then we're all done
* here. */
if (journal_test_revoke
(journal, blocknr,
next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
blocknr,
journal->j_blocksize);
if (nbh == NULL) {
printk(KERN_ERR
"JBD: Out of memory "
"during recovery.\n");
err = -ENOMEM;
brelse(bh);
brelse(obh);
goto failed;
}
lock_buffer(nbh);
memcpy(nbh->b_data, obh->b_data,
journal->j_blocksize);
if (flags & JFS_FLAG_ESCAPE) {
*((__be32 *)bh->b_data) =
cpu_to_be32(JFS_MAGIC_NUMBER);
}
BUFFER_TRACE(nbh, "marking dirty");
set_buffer_uptodate(nbh);
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
/* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
}
skip_write:
tagp += sizeof(journal_block_tag_t);
if (!(flags & JFS_FLAG_SAME_UUID))
tagp += 16;
if (flags & JFS_FLAG_LAST_TAG)
break;
}
brelse(bh);
continue;
case JFS_COMMIT_BLOCK:
/* Found an expected commit block: not much to
* do other than move on to the next sequence
* number. */
brelse(bh);
next_commit_ID++;
continue;
case JFS_REVOKE_BLOCK:
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
brelse(bh);
continue;
}
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
brelse(bh);
if (err)
goto failed;
continue;
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
}
}
done:
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
* log. If the latter happened, then we know that the "current"
* transaction marks the end of the valid log.
*/
if (pass == PASS_SCAN)
info->end_transaction = next_commit_ID;
else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
printk (KERN_ERR "JBD: recovery pass %d ended at "
"transaction %u, expected %u\n",
pass, next_commit_ID, info->end_transaction);
if (!success)
success = -EIO;
}
}
return success;
failed:
return err;
}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
tid_t sequence, struct recovery_info *info)
{
journal_revoke_header_t *header;
int offset, max;
header = (journal_revoke_header_t *) bh->b_data;
offset = sizeof(journal_revoke_header_t);
max = be32_to_cpu(header->r_count);
while (offset < max) {
unsigned long blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
offset += 4;
err = journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
++info->nr_revokes;
}
return 0;
}
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册