Commit 470decc6 authored by Dave Kleikamp, committed by Linus Torvalds

[PATCH] jbd2: initial copy of files from jbd

This is a simple copy of the files in fs/jbd to fs/jbd2 and
/usr/include/linux/[ext4_]jbd.h to /usr/include/[ext4_]jbd2.h
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Parent 02ea2104
#
# Makefile for the linux journaling routines.
#
obj-$(CONFIG_JBD) += jbd.o
jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
/*
* linux/fs/checkpoint.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
* Checkpointing is the process of ensuring that a section of the log is
* committed fully to disk, so that that portion of the log can be
* reused.
*/
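/*
 * A quick map of the checkpoint data structures, as the code below
 * implies: each checkpointing transaction carries two circular,
 * doubly-linked lists of journal_heads, chained through
 * b_cpnext/b_cpprev:
 *
 *   t_checkpoint_list    - dirty buffers still awaiting writeback
 *   t_checkpoint_io_list - buffers whose writeback has been submitted
 *
 * Buffers move from the first list to the second in __buffer_relink_io()
 * and leave both lists through __journal_remove_checkpoint().
 */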
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
/*
* Unlink a buffer from a transaction checkpoint list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink_first(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
jh->b_cpnext->b_cpprev = jh->b_cpprev;
jh->b_cpprev->b_cpnext = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh) {
transaction->t_checkpoint_list = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh)
transaction->t_checkpoint_list = NULL;
}
}
/*
* Unlink a buffer from a transaction checkpoint(io) list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (transaction->t_checkpoint_io_list == jh) {
transaction->t_checkpoint_io_list = jh->b_cpnext;
if (transaction->t_checkpoint_io_list == jh)
transaction->t_checkpoint_io_list = NULL;
}
}
/*
* Move a buffer from the checkpoint list to the checkpoint io list
*
* Called with j_list_lock held
*/
static inline void __buffer_relink_io(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (!transaction->t_checkpoint_io_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_io_list;
jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_io_list = jh;
}
/*
* Try to release a checkpointed buffer from its transaction.
* Returns 1 if we released it and 2 if we also released the
* whole transaction.
*
* Requires j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __journal_remove_checkpoint(jh) + 1;
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
} else {
jbd_unlock_bh_state(bh);
}
return ret;
}
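/*
 * Return-value encoding above, for reference: __journal_remove_checkpoint()
 * returns 1 only when dropping the buffer also released the whole
 * transaction, so the "+ 1" yields 1 (buffer freed) or 2 (buffer and
 * transaction freed); 0 means the buffer was still in use.
 */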
/*
* __log_wait_for_space: wait until there is space in the journal.
*
* Called under j_state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log.
*/
void __log_wait_for_space(journal_t *journal)
{
int nblocks;
assert_spin_locked(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
while (__log_space_left(journal) < nblocks) {
if (journal->j_flags & JFS_ABORT)
return;
spin_unlock(&journal->j_state_lock);
mutex_lock(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
* were waiting for the checkpoint lock
*/
spin_lock(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
if (__log_space_left(journal) < nblocks) {
spin_unlock(&journal->j_state_lock);
log_do_checkpoint(journal);
spin_lock(&journal->j_state_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
}
/*
* We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
* The caller must restart a list walk. Wait for someone else to run
* jbd_unlock_bh_state().
*/
static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
__releases(journal->j_list_lock)
{
get_bh(bh);
spin_unlock(&journal->j_list_lock);
jbd_lock_bh_state(bh);
jbd_unlock_bh_state(bh);
put_bh(bh);
}
/*
* Clean up transaction's list of buffers submitted for io.
* We wait for any pending IO to complete and remove any clean
* buffers. Note that we take the buffers in the opposite ordering
* from the one in which they were submitted for IO.
*
* Called with j_list_lock held.
*/
static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
struct journal_head *jh;
struct buffer_head *bh;
tid_t this_tid;
int released = 0;
this_tid = transaction->t_tid;
restart:
/* Did somebody clean up the transaction in the meanwhile? */
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
return;
while (!released && transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
if (buffer_locked(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now in whatever state the buffer currently is, we know that
* it has been written out and so we can drop it from the list
*/
released = __journal_remove_checkpoint(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
__brelse(bh);
}
}
#define NR_BATCH 64
static void
__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
{
int i;
ll_rw_block(SWRITE, *batch_count, bhs);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = bhs[i];
clear_buffer_jwrite(bh);
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
*batch_count = 0;
}
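/*
 * NR_BATCH bounds the on-stack bhs[] array used by log_do_checkpoint()
 * below. SWRITE (rather than WRITE) tells ll_rw_block() in kernels of
 * this era to wait for each buffer lock instead of skipping buffers it
 * cannot lock immediately, so every queued buffer really is submitted.
 */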
/*
* Try to flush one buffer from the checkpoint list to disk.
*
* Return 1 if something happened which requires us to abort the current
* scan of the checkpoint list.
*
* Called with j_list_lock held and drops it if 1 is returned
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __process_buffer(journal_t *journal, struct journal_head *jh,
struct buffer_head **bhs, int *batch_count)
{
struct buffer_head *bh = jh2bh(jh);
int ret = 0;
if (buffer_locked(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
ret = 1;
} else if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
log_start_commit(journal, tid);
log_wait_commit(journal, tid);
ret = 1;
} else if (!buffer_dirty(bh)) {
J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
__brelse(bh);
ret = 1;
} else {
/*
* Important: we are about to write the buffer, and
* possibly block, while still holding the journal lock.
* We cannot afford to let the transaction logic start
* messing around with this buffer before we write it to
* disk, as that would break recoverability.
*/
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
set_buffer_jwrite(bh);
bhs[*batch_count] = bh;
__buffer_relink_io(jh);
jbd_unlock_bh_state(bh);
(*batch_count)++;
if (*batch_count == NR_BATCH) {
spin_unlock(&journal->j_list_lock);
__flush_batch(journal, bhs, batch_count);
ret = 1;
}
}
return ret;
}
/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
*
* The journal should be locked before calling this function.
*/
int log_do_checkpoint(journal_t *journal)
{
transaction_t *transaction;
tid_t this_tid;
int result;
jbd_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
* don't need checkpointing, just eliminate them from the
* journal straight away.
*/
result = cleanup_journal_tail(journal);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
/*
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
transaction = journal->j_checkpoint_transactions;
this_tid = transaction->t_tid;
restart:
/*
* If someone cleaned up this transaction while we slept, we're
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
if (journal->j_checkpoint_transactions == transaction &&
transaction->t_tid == this_tid) {
int batch_count = 0;
struct buffer_head *bhs[NR_BATCH];
struct journal_head *jh;
int retry = 0;
while (!retry && transaction->t_checkpoint_list) {
struct buffer_head *bh;
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
jbd_sync_bh(journal, bh);
retry = 1;
break;
}
retry = __process_buffer(journal, jh, bhs, &batch_count);
if (!retry && lock_need_resched(&journal->j_list_lock)) {
spin_unlock(&journal->j_list_lock);
retry = 1;
break;
}
}
if (batch_count) {
if (!retry) {
spin_unlock(&journal->j_list_lock);
retry = 1;
}
__flush_batch(journal, bhs, &batch_count);
}
if (retry) {
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now we have cleaned up the first transaction's checkpoint
* list. Let's clean up the second one
*/
__wait_cp_io(journal, transaction);
}
out:
spin_unlock(&journal->j_list_lock);
result = cleanup_journal_tail(journal);
if (result < 0)
return result;
return 0;
}
/*
* Check the list of checkpoint transactions for the journal to see if
* we have already got rid of any since the last update of the log tail
* in the journal superblock. If so, we can instantly roll the
* superblock forward to remove those transactions from the log.
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
* Called with the journal lock held.
*
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
* even in abort state, but we must not update the journal superblock if
* we have an abort error outstanding.
*/
int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
unsigned long blocknr, freed;
/* OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
* start. */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
if (transaction) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_committing_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_running_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = journal->j_head;
} else {
first_tid = journal->j_transaction_sequence;
blocknr = journal->j_head;
}
spin_unlock(&journal->j_list_lock);
J_ASSERT(blocknr != 0);
/* If the oldest pinned transaction is at the tail of the log
already then there's not much we can do right now. */
if (journal->j_tail_sequence == first_tid) {
spin_unlock(&journal->j_state_lock);
return 1;
}
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
freed = blocknr - journal->j_tail;
if (blocknr < journal->j_tail)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
"Cleaning journal tail from %d to %d (offset %lu), "
"freeing %lu\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock);
if (!(journal->j_flags & JFS_ABORT))
journal_update_superblock(journal, 1);
return 0;
}
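/*
 * Worked example of the wrap-around arithmetic above, with hypothetical
 * numbers: if j_first = 1, j_last = 1000, j_tail = 990 and the oldest
 * remaining transaction starts at blocknr = 10, then blocknr < j_tail,
 * and freed = (10 - 990) + (1000 - 1) = 19 blocks are reclaimed across
 * the end of the log.
 */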
/* Checkpoint list management */
/*
* journal_clean_one_cp_list
*
* Find all the written-back checkpoint buffers in the given list and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
* Returns number of buffers reaped (for debug)
*/
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret, freed = 0;
*released = 0;
if (!jh)
return 0;
last_jh = jh->b_cpprev;
do {
jh = next_jh;
next_jh = jh->b_cpnext;
/* Use trylock because of the ranking */
if (jbd_trylock_bh_state(jh2bh(jh))) {
ret = __try_to_free_cp_buf(jh);
if (ret) {
freed++;
if (ret == 2) {
*released = 1;
return freed;
}
}
}
/*
* This function only frees up some memory
* if possible so we don't have an obligation
* to finish processing. Bail out if preemption
* requested:
*/
if (need_resched())
return freed;
} while (jh != last_jh);
return freed;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
*
* Called with the journal locked.
* Called with j_list_lock held.
* Returns number of buffers reaped (for debug)
*/
int __journal_clean_checkpoint_list(journal_t *journal)
{
transaction_t *transaction, *last_transaction, *next_transaction;
int ret = 0;
int released;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
goto out;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_list, &released);
/*
* This function only frees up some memory if possible so we
* don't have an obligation to finish processing. Bail out if
* preemption requested:
*/
if (need_resched())
goto out;
if (released)
continue;
/*
* It is essential that we are as careful as in the case of
* t_checkpoint_list with removing the buffer from the list as
* we can possibly see not yet submitted buffers on io_list
*/
ret += journal_clean_one_cp_list(transaction->
t_checkpoint_io_list, &released);
if (need_resched())
goto out;
} while (transaction != last_transaction);
out:
return ret;
}
/*
* journal_remove_checkpoint: called after a buffer has been committed
* to disk (either by being write-back flushed to disk, or being
* committed to the log).
*
* We cannot safely clean a transaction out of the log until all of the
* buffer updates committed in that transaction have safely been stored
* elsewhere on disk. To achieve this, all of the buffers in a
* transaction need to be maintained on the transaction's checkpoint
* lists until they have been rewritten, at which point this function is
* called to remove the buffer from the existing transaction's
* checkpoint lists.
*
* The function returns 1 if it frees the transaction, 0 otherwise.
*
* This function is called with the journal locked.
* This function is called with j_list_lock held.
* This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
int __journal_remove_checkpoint(struct journal_head *jh)
{
transaction_t *transaction;
journal_t *journal;
int ret = 0;
JBUFFER_TRACE(jh, "entry");
if ((transaction = jh->b_cp_transaction) == NULL) {
JBUFFER_TRACE(jh, "not on transaction");
goto out;
}
journal = transaction->t_journal;
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
if (transaction->t_checkpoint_list != NULL ||
transaction->t_checkpoint_io_list != NULL)
goto out;
JBUFFER_TRACE(jh, "transaction has no more buffers");
/*
* There is one special case to worry about: if we have just pulled the
* buffer off a committing transaction's forget list, then even if the
* checkpoint list is empty, the transaction obviously cannot be
* dropped!
*
* The locking here around j_committing_transaction is a bit sleazy.
* See the comment at the end of journal_commit_transaction().
*/
if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "belongs to committing transaction");
goto out;
}
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
__journal_drop_transaction(journal, transaction);
/* Just in case anybody was waiting for more transactions to be
checkpointed... */
wake_up(&journal->j_wait_logspace);
ret = 1;
out:
JBUFFER_TRACE(jh, "exit");
return ret;
}
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_insert_checkpoint(struct journal_head *jh,
transaction_t *transaction)
{
JBUFFER_TRACE(jh, "entry");
J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
jh->b_cp_transaction = transaction;
if (!transaction->t_checkpoint_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_list;
jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
}
/*
* We've finished with this transaction structure: adios...
*
* The transaction must have no links except for the checkpoint by this
* point.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions =
transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions = NULL;
}
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(transaction->t_updates == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
kfree(transaction);
}
/*
* linux/fs/jbd/commit.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal commit routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
*/
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
BUFFER_TRACE(bh, "");
if (uptodate)
set_buffer_uptodate(bh);
else
clear_buffer_uptodate(bh);
unlock_buffer(bh);
}
/*
* When an ext3-ordered file is truncated, it is possible that many pages are
* not successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
* the numbers in /proc/meminfo look odd.
*
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
* Called under lock_journal(), and possibly under journal_datalist_lock. The
* caller provided us with a ref against the buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
struct page *page;
if (buffer_dirty(bh))
goto nope;
if (atomic_read(&bh->b_count) != 1)
goto nope;
page = bh->b_page;
if (!page)
goto nope;
if (page->mapping)
goto nope;
/* OK, it's a truncated page */
if (TestSetPageLocked(page))
goto nope;
page_cache_get(page);
__brelse(bh);
try_to_free_buffers(page);
unlock_page(page);
page_cache_release(page);
return;
nope:
__brelse(bh);
}
/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
* held. For ranking reasons we must trylock. If we lose, schedule away and
* return 0. j_list_lock is dropped in this case.
*/
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
if (!jbd_trylock_bh_state(bh)) {
spin_unlock(&journal->j_list_lock);
schedule();
return 0;
}
return 1;
}
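/*
 * The lock ordering here is jbd_lock_bh_state() before j_list_lock, so a
 * caller already holding j_list_lock may only trylock. A typical retry
 * sequence (see journal_submit_data_buffers() below) reacquires both
 * locks in the correct order after a failed attempt:
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 */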
/* Done it all: now write the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
* entirely.
*
* Returns 1 if the journal needs to be aborted or 0 on success
*/
static int journal_write_commit_record(journal_t *journal,
transaction_t *commit_transaction)
{
struct journal_head *descriptor;
struct buffer_head *bh;
int i, ret;
int barrier_done = 0;
if (is_journal_aborted(journal))
return 0;
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor)
return 1;
bh = jh2bh(descriptor);
/* AKPM: buglet - add `i' to tmp! */
for (i = 0; i < bh->b_size; i += 512) {
journal_header_t *tmp = (journal_header_t*)bh->b_data;
tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
}
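/*
 * Note on akpm's buglet above: because `i' is never added to tmp, each
 * iteration restamps the first 512 bytes of the block, so only the
 * leading commit header is actually initialized rather than one header
 * per 512-byte sector.
 */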
JBUFFER_TRACE(descriptor, "write commit block");
set_buffer_dirty(bh);
if (journal->j_flags & JFS_BARRIER) {
set_buffer_ordered(bh);
barrier_done = 1;
}
ret = sync_dirty_buffer(bh);
/* is it possible for another commit to fail at roughly
* the same time as this one? If so, we don't want to
* trust the barrier flag in the super, but instead want
* to remember if we sent a barrier request
*/
if (ret == -EOPNOTSUPP && barrier_done) {
char b[BDEVNAME_SIZE];
printk(KERN_WARNING
"JBD: barrier-based sync failed on %s - "
"disabling barriers\n",
bdevname(journal->j_dev, b));
spin_lock(&journal->j_state_lock);
journal->j_flags &= ~JFS_BARRIER;
spin_unlock(&journal->j_state_lock);
/* And try again, without the barrier */
clear_buffer_ordered(bh);
set_buffer_uptodate(bh);
set_buffer_dirty(bh);
ret = sync_dirty_buffer(bh);
}
put_bh(bh); /* One for getblk() */
journal_put_journal_head(descriptor);
return (ret == -EIO);
}
static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
int i;
for (i = 0; i < bufs; i++) {
wbuf[i]->b_end_io = end_buffer_write_sync;
/* We use-up our safety reference in submit_bh() */
submit_bh(WRITE, wbuf[i]);
}
}
/*
* Submit all the data buffers to disk
*/
static void journal_submit_data_buffers(journal_t *journal,
transaction_t *commit_transaction)
{
struct journal_head *jh;
struct buffer_head *bh;
int locked;
int bufs = 0;
struct buffer_head **wbuf = journal->j_wbuf;
/*
* Whenever we unlock the journal and sleep, things can get added
* onto ->t_sync_datalist, so we have to keep looping back to
* write_out_data until we *know* that the list is empty.
*
* Cleanup any flushed data buffers from the data list. Even in
* abort mode, we want to flush this out as soon as possible.
*/
write_out_data:
cond_resched();
spin_lock(&journal->j_list_lock);
while (commit_transaction->t_sync_datalist) {
jh = commit_transaction->t_sync_datalist;
bh = jh2bh(jh);
locked = 0;
/* Get reference just to make sure buffer does not disappear
* when we are forced to drop various locks */
get_bh(bh);
/* If the buffer is dirty, we need to submit IO and hence
* we need the buffer lock. We try to lock the buffer without
* blocking. If we fail, we need to drop j_list_lock and do
* blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
if (test_set_buffer_locked(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
journal_do_submit_data(wbuf, bufs);
bufs = 0;
lock_buffer(bh);
spin_lock(&journal->j_list_lock);
}
locked = 1;
}
/* We have to get bh_state lock. Again out of order, sigh. */
if (!inverted_lock(journal, bh)) {
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
}
/* Someone already cleaned up the buffer? */
if (!buffer_jbd(bh)
|| jh->b_transaction != commit_transaction
|| jh->b_jlist != BJ_SyncData) {
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
BUFFER_TRACE(bh, "already cleaned up");
put_bh(bh);
continue;
}
if (locked && test_clear_buffer_dirty(bh)) {
BUFFER_TRACE(bh, "needs writeout, adding to array");
wbuf[bufs++] = bh;
__journal_file_buffer(jh, commit_transaction,
BJ_Locked);
jbd_unlock_bh_state(bh);
if (bufs == journal->j_wbufsize) {
spin_unlock(&journal->j_list_lock);
journal_do_submit_data(wbuf, bufs);
bufs = 0;
goto write_out_data;
}
}
else {
BUFFER_TRACE(bh, "writeout complete: unfile");
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
journal_remove_journal_head(bh);
/* Once for our safety reference, once for
* journal_remove_journal_head() */
put_bh(bh);
put_bh(bh);
}
if (lock_need_resched(&journal->j_list_lock)) {
spin_unlock(&journal->j_list_lock);
goto write_out_data;
}
}
spin_unlock(&journal->j_list_lock);
journal_do_submit_data(wbuf, bufs);
}
/*
* journal_commit_transaction
*
* The primary function for committing a transaction to the log. This
* function is called by the journal thread to begin a complete commit.
*/
void journal_commit_transaction(journal_t *journal)
{
transaction_t *commit_transaction;
struct journal_head *jh, *new_jh, *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
int err;
unsigned long blocknr;
char *tagp = NULL;
journal_header_t *header;
journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
int tag_flag;
int i;
/*
* First job: lock down the current transaction and wait for
* all outstanding updates to complete.
*/
#ifdef COMMIT_STATS
spin_lock(&journal->j_list_lock);
summarise_journal_usage(journal);
spin_unlock(&journal->j_list_lock);
#endif
/* Do we need to erase the effects of a prior journal_flush? */
if (journal->j_flags & JFS_FLUSHED) {
jbd_debug(3, "super block updated\n");
journal_update_superblock(journal, 1);
} else {
jbd_debug(3, "superblock not updated\n");
}
J_ASSERT(journal->j_running_transaction != NULL);
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
J_ASSERT(commit_transaction->t_state == T_RUNNING);
jbd_debug(1, "JBD: starting commit of transaction %d\n",
commit_transaction->t_tid);
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
DEFINE_WAIT(wait);
prepare_to_wait(&journal->j_wait_updates, &wait,
TASK_UNINTERRUPTIBLE);
if (commit_transaction->t_updates) {
spin_unlock(&commit_transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
schedule();
spin_lock(&journal->j_state_lock);
spin_lock(&commit_transaction->t_handle_lock);
}
finish_wait(&journal->j_wait_updates, &wait);
}
spin_unlock(&commit_transaction->t_handle_lock);
J_ASSERT (commit_transaction->t_outstanding_credits <=
journal->j_max_transaction_buffers);
/*
* First thing we are allowed to do is to discard any remaining
* BJ_Reserved buffers. Note, it is _not_ permissible to assume
* that there are no such buffers: if a large filesystem
* operation like a truncate needs to split itself over multiple
* transactions, then it may try to do a journal_restart() while
* there are still BJ_Reserved buffers outstanding. These must
* be released cleanly from the current transaction.
*
* In this case, the filesystem must still reserve write access
* again before modifying the buffer in the new transaction, but
* we do not require it to remember exactly which old buffers it
* has reserved. This is consistent with the existing behaviour
* that multiple journal_get_write_access() calls to the same
* buffer are perfectly permissible.
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
JBUFFER_TRACE(jh, "reserved, unused: refile");
/*
* A journal_get_undo_access()+journal_release_buffer() may
* leave undo-committed data.
*/
if (jh->b_committed_data) {
struct buffer_head *bh = jh2bh(jh);
jbd_lock_bh_state(bh);
jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
jbd_unlock_bh_state(bh);
}
journal_refile_buffer(journal, jh);
}
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
* frees some memory
*/
spin_lock(&journal->j_list_lock);
__journal_clean_checkpoint_list(journal);
spin_unlock(&journal->j_list_lock);
jbd_debug (3, "JBD: commit phase 1\n");
/*
* Switch to a new revoke table.
*/
journal_switch_revoke_table(journal);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
spin_unlock(&journal->j_state_lock);
jbd_debug (3, "JBD: commit phase 2\n");
/*
* First, drop modified flag: all accesses to the buffers
* will be tracked for a new transaction only -bzzz
*/
spin_lock(&journal->j_list_lock);
if (commit_transaction->t_buffers) {
new_jh = jh = commit_transaction->t_buffers->b_tnext;
do {
J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
new_jh->b_modified == 0);
new_jh->b_modified = 0;
new_jh = new_jh->b_tnext;
} while (new_jh != jh);
}
spin_unlock(&journal->j_list_lock);
/*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
err = 0;
journal_submit_data_buffers(journal, commit_transaction);
/*
* Wait for all previously submitted IO to complete.
*/
spin_lock(&journal->j_list_lock);
while (commit_transaction->t_locked_list) {
struct buffer_head *bh;
jh = commit_transaction->t_locked_list->b_tprev;
bh = jh2bh(jh);
get_bh(bh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
spin_lock(&journal->j_list_lock);
}
if (!inverted_lock(journal, bh)) {
put_bh(bh);
spin_lock(&journal->j_list_lock);
continue;
}
if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
put_bh(bh);
} else {
jbd_unlock_bh_state(bh);
}
put_bh(bh);
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
if (err)
__journal_abort_hard(journal);
journal_write_revoke_records(journal, commit_transaction);
jbd_debug(3, "JBD: commit phase 2\n");
/*
* If we found any dirty or locked buffers, then we should have
* looped back up to the write_out_data label. If there weren't
* any then journal_clean_data_list should have wiped the list
* clean by now, so check that it is in fact empty.
*/
J_ASSERT (commit_transaction->t_sync_datalist == NULL);
jbd_debug (3, "JBD: commit phase 3\n");
/*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
commit_transaction->t_state = T_COMMIT;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
/* Find the next buffer to be journaled... */
jh = commit_transaction->t_buffers;
/* If we're in abort mode, we just un-journal the buffer and
release it for background writing. */
if (is_journal_aborted(journal)) {
JBUFFER_TRACE(jh, "journal is aborting: refile");
journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
* any descriptor buffers which may have been
* already allocated, even if we are now
* aborting. */
if (!commit_transaction->t_buffers)
goto start_journal_io;
continue;
}
/* Make sure we have a descriptor block in which to
record the metadata buffer. */
if (!descriptor) {
struct buffer_head *bh;
J_ASSERT (bufs == 0);
jbd_debug(4, "JBD: get descriptor\n");
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor) {
__journal_abort_hard(journal);
continue;
}
bh = jh2bh(descriptor);
jbd_debug(4, "JBD: got buffer %llu (%p)\n",
(unsigned long long)bh->b_blocknr, bh->b_data);
header = (journal_header_t *)&bh->b_data[0];
header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
tagp = &bh->b_data[sizeof(journal_header_t)];
space_left = bh->b_size - sizeof(journal_header_t);
first_tag = 1;
set_buffer_jwrite(bh);
set_buffer_dirty(bh);
wbuf[bufs++] = bh;
/* Record it so that we can wait for IO
completion later */
BUFFER_TRACE(bh, "ph3: file as descriptor");
journal_file_buffer(descriptor, commit_transaction,
BJ_LogCtl);
}
/* Where is the buffer to be written? */
err = journal_next_log_block(journal, &blocknr);
/* If the block mapping failed, just abandon the buffer
and repeat this loop: we'll fall into the
refile-on-abort condition above. */
if (err) {
__journal_abort_hard(journal);
continue;
}
/*
* start_this_handle() uses t_outstanding_credits to determine
* the free space in the log, but this counter is changed
* by journal_next_log_block() also.
*/
commit_transaction->t_outstanding_credits--;
/* Bump b_count to prevent truncate from stumbling over
the shadowed buffer! @@@ This can go if we ever get
rid of the BJ_IO/BJ_Shadow pairing of buffers. */
atomic_inc(&jh2bh(jh)->b_count);
/* Make a temporary IO buffer with which to write it out
(this will requeue both the metadata buffer and the
temporary IO buffer). new_bh goes on BJ_IO*/
set_bit(BH_JWrite, &jh2bh(jh)->b_state);
/*
* akpm: journal_write_metadata_buffer() sets
* new_bh->b_transaction to commit_transaction.
* We need to clean this up before we release new_bh
* (which is of type BJ_IO)
*/
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = journal_write_metadata_buffer(commit_transaction,
jh, &new_jh, blocknr);
set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
wbuf[bufs++] = jh2bh(new_jh);
/* Record the new block's tag in the current descriptor
buffer */
tag_flag = 0;
if (flags & 1)
tag_flag |= JFS_FLAG_ESCAPE;
if (!first_tag)
tag_flag |= JFS_FLAG_SAME_UUID;
tag = (journal_block_tag_t *) tagp;
tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be32(tag_flag);
tagp += sizeof(journal_block_tag_t);
space_left -= sizeof(journal_block_tag_t);
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
tagp += 16;
space_left -= 16;
first_tag = 0;
}
/* If there's no more to do, or if the descriptor is full,
let the IO rip! */
if (bufs == journal->j_wbufsize ||
commit_transaction->t_buffers == NULL ||
space_left < sizeof(journal_block_tag_t) + 16) {
jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
/* Write an end-of-descriptor marker before
submitting the IOs. "tag" still points to
the last tag we set up. */
tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
submit_bh(WRITE, bh);
}
cond_resched();
/* Force a new descriptor to be generated next
time round the loop. */
descriptor = NULL;
bufs = 0;
}
}
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
transaction's t_log_list queue, and metadata buffers are on
the t_iobuf_list queue.
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
so we incur less scheduling load.
*/
jbd_debug(3, "JBD: commit phase 4\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
* See __journal_try_to_free_buffer.
*/
wait_for_iobuf:
while (commit_transaction->t_iobuf_list != NULL) {
struct buffer_head *bh;
jh = commit_transaction->t_iobuf_list->b_tprev;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
wait_on_buffer(bh);
goto wait_for_iobuf;
}
if (cond_resched())
goto wait_for_iobuf;
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
clear_buffer_jwrite(bh);
JBUFFER_TRACE(jh, "ph4: unfile after journal write");
journal_unfile_buffer(journal, jh);
/*
* ->t_iobuf_list should contain only dummy buffer_heads
* which were created by journal_write_metadata_buffer().
*/
BUFFER_TRACE(bh, "dumping temporary bh");
journal_put_journal_head(jh);
__brelse(bh);
J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
free_buffer_head(bh);
/* We also have to unlock and free the corresponding
shadowed buffer */
jh = commit_transaction->t_shadow_list->b_tprev;
bh = jh2bh(jh);
clear_bit(BH_JWrite, &bh->b_state);
J_ASSERT_BH(bh, buffer_jbddirty(bh));
/* The metadata is now released for reuse, but we need
to remember it against this transaction so that when
we finally commit, we can do any checkpointing
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
journal_file_buffer(jh, commit_transaction, BJ_Forget);
/* Wake up any transactions which were waiting for this
IO to complete */
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
J_ASSERT (commit_transaction->t_shadow_list == NULL);
jbd_debug(3, "JBD: commit phase 5\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
while (commit_transaction->t_log_list != NULL) {
struct buffer_head *bh;
jh = commit_transaction->t_log_list->b_tprev;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
wait_on_buffer(bh);
goto wait_for_ctlbuf;
}
if (cond_resched())
goto wait_for_ctlbuf;
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
journal_unfile_buffer(journal, jh);
journal_put_journal_head(jh);
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
jbd_debug(3, "JBD: commit phase 6\n");
if (journal_write_commit_record(journal, commit_transaction))
err = -EIO;
if (err)
__journal_abort_hard(journal);
/* End of a transaction! Finally, we can do checkpoint
processing: any buffers committed as a result of this
transaction can be removed from any checkpoint list it was on
before. */
jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_sync_datalist == NULL);
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
/*
* As there are other places (journal_unmap_buffer()) adding buffers
* to this list we have to be careful and hold the j_list_lock.
*/
spin_lock(&journal->j_list_lock);
while (commit_transaction->t_forget) {
transaction_t *cp_transaction;
struct buffer_head *bh;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
bh = jh2bh(jh);
jbd_lock_bh_state(bh);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
jh->b_transaction == journal->j_running_transaction);
/*
* If there is undo-protected committed data against
* this buffer, then we can remove it now. If it is a
* buffer needing such protection, the old frozen_data
* field now points to a committed version of the
* buffer, so rotate that field to the new committed
* data.
*
* Otherwise, we can just throw away the frozen data now.
*/
if (jh->b_committed_data) {
jbd_slab_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
jh->b_frozen_data = NULL;
}
} else if (jh->b_frozen_data) {
jbd_slab_free(jh->b_frozen_data, bh->b_size);
jh->b_frozen_data = NULL;
}
spin_lock(&journal->j_list_lock);
cp_transaction = jh->b_cp_transaction;
if (cp_transaction) {
JBUFFER_TRACE(jh, "remove from old cp transaction");
__journal_remove_checkpoint(jh);
}
/* Only re-checkpoint the buffer_head if it is marked
* dirty. If the buffer was added to the BJ_Forget list
* by journal_forget, it may no longer be dirty and
* there's no point in keeping a checkpoint record for
* it. */
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
* that buffer in the future now that the last use has
* been committed. That's not only a performance gain,
* it also stops aliasing problems if the buffer is left
* behind for writeback and gets reallocated for another
* use in a different page. */
if (buffer_freed(bh)) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
}
if (buffer_jbddirty(bh)) {
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__journal_insert_checkpoint(jh, commit_transaction);
JBUFFER_TRACE(jh, "refile for checkpoint writeback");
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
} else {
J_ASSERT_BH(bh, !buffer_dirty(bh));
/* The buffer on BJ_Forget list and not jbddirty means
* it has been freed by this transaction and hence it
* could not have been reallocated until this
* transaction has committed. *BUT* it could be
* reallocated once we have written all the data to
* disk and before we process the buffer on BJ_Forget
* list. */
JBUFFER_TRACE(jh, "refile or unfile freed buffer");
__journal_refile_buffer(jh);
if (!jh->b_transaction) {
jbd_unlock_bh_state(bh);
/* needs a brelse */
journal_remove_journal_head(bh);
release_buffer_page(bh);
} else
jbd_unlock_bh_state(bh);
}
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
/*
* This is a bit sleazy. We borrow j_list_lock to protect
* journal->j_committing_transaction in __journal_remove_checkpoint.
* Really, __journal_remove_checkpoint should be using j_state_lock but
* it's a bit hassle to hold that across __journal_remove_checkpoint
*/
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
/*
* Now recheck if some buffers did not get attached to the transaction
* while the lock was dropped...
*/
if (commit_transaction->t_forget) {
spin_unlock(&journal->j_list_lock);
spin_unlock(&journal->j_state_lock);
goto restart_loop;
}
/* Done with this transaction! */
jbd_debug(3, "JBD: commit phase 8\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_state = T_FINISHED;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
spin_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL) {
__journal_drop_transaction(journal, commit_transaction);
} else {
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction;
commit_transaction->t_cpprev = commit_transaction;
} else {
commit_transaction->t_cpnext =
journal->j_checkpoint_transactions;
commit_transaction->t_cpprev =
commit_transaction->t_cpnext->t_cpprev;
commit_transaction->t_cpnext->t_cpprev =
commit_transaction;
commit_transaction->t_cpprev->t_cpnext =
commit_transaction;
}
}
spin_unlock(&journal->j_list_lock);
jbd_debug(1, "JBD: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
wake_up(&journal->j_wait_done_commit);
}
/*
* linux/fs/jbd/journal.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Generic filesystem journal-writing code; part of the ext2fs
* journaling system.
*
* This file manages journals: areas of disk reserved for logging
* transactional updates. This includes the kernel journaling thread
* which is responsible for scheduling updates to the log.
*
* We do not actually manage the physical storage of the journal in this
* file: that is left to a per-journal policy function, which allows us
* to store the journal within a filesystem-specified area for ext2
* journaling (ext2 can use a reserved inode for storing the log).
*/
#include <linux/module.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/suspend.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/poison.h>
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include <asm/page.h>
EXPORT_SYMBOL(journal_start);
EXPORT_SYMBOL(journal_restart);
EXPORT_SYMBOL(journal_extend);
EXPORT_SYMBOL(journal_stop);
EXPORT_SYMBOL(journal_lock_updates);
EXPORT_SYMBOL(journal_unlock_updates);
EXPORT_SYMBOL(journal_get_write_access);
EXPORT_SYMBOL(journal_get_create_access);
EXPORT_SYMBOL(journal_get_undo_access);
EXPORT_SYMBOL(journal_dirty_data);
EXPORT_SYMBOL(journal_dirty_metadata);
EXPORT_SYMBOL(journal_release_buffer);
EXPORT_SYMBOL(journal_forget);
#if 0
EXPORT_SYMBOL(journal_sync_buffer);
#endif
EXPORT_SYMBOL(journal_flush);
EXPORT_SYMBOL(journal_revoke);
EXPORT_SYMBOL(journal_init_dev);
EXPORT_SYMBOL(journal_init_inode);
EXPORT_SYMBOL(journal_update_format);
EXPORT_SYMBOL(journal_check_used_features);
EXPORT_SYMBOL(journal_check_available_features);
EXPORT_SYMBOL(journal_set_features);
EXPORT_SYMBOL(journal_create);
EXPORT_SYMBOL(journal_load);
EXPORT_SYMBOL(journal_destroy);
EXPORT_SYMBOL(journal_update_superblock);
EXPORT_SYMBOL(journal_abort);
EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
EXPORT_SYMBOL(journal_clear_err);
EXPORT_SYMBOL(log_wait_commit);
EXPORT_SYMBOL(journal_start_commit);
EXPORT_SYMBOL(journal_force_commit_nested);
EXPORT_SYMBOL(journal_wipe);
EXPORT_SYMBOL(journal_blocks_per_page);
EXPORT_SYMBOL(journal_invalidatepage);
EXPORT_SYMBOL(journal_try_to_free_buffers);
EXPORT_SYMBOL(journal_force_commit);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
static int journal_create_jbd_slab(size_t slab_size);
/*
* Helper function used to manage commit timeouts
*/
static void commit_timeout(unsigned long __data)
{
struct task_struct * p = (struct task_struct *) __data;
wake_up_process(p);
}
/*
* kjournald: The main thread function used to manage a logging device
* journal.
*
* This kernel thread is responsible for two things:
*
* 1) COMMIT: Every so often we need to commit the current state of the
* filesystem to disk. The journal thread is responsible for writing
* all of the metadata buffers to disk.
*
* 2) CHECKPOINT: We cannot reuse a used section of the log file until all
* of the data in that part of the log has been rewritten elsewhere on
* the disk. Flushing these old buffers to reclaim space in the log is
* known as checkpointing, and this thread is responsible for that job.
*/
static int kjournald(void *arg)
{
journal_t *journal = arg;
transaction_t *transaction;
/*
* Set up an interval timer which can be used to trigger a commit wakeup
* after the commit interval expires
*/
setup_timer(&journal->j_commit_timer, commit_timeout,
(unsigned long)current);
/* Record that the journal thread is running */
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
journal->j_commit_interval / HZ);
/*
* And now, wait forever for commit wakeup events.
*/
spin_lock(&journal->j_state_lock);
loop:
if (journal->j_flags & JFS_UNMOUNT)
goto end_loop;
jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
journal->j_commit_sequence, journal->j_commit_request);
if (journal->j_commit_sequence != journal->j_commit_request) {
jbd_debug(1, "OK, requests differ\n");
spin_unlock(&journal->j_state_lock);
del_timer_sync(&journal->j_commit_timer);
journal_commit_transaction(journal);
spin_lock(&journal->j_state_lock);
goto loop;
}
wake_up(&journal->j_wait_done_commit);
if (freezing(current)) {
/*
* The simpler the better. Flushing journal isn't a
* good idea, because that depends on threads that may
* be already stopped.
*/
jbd_debug(1, "Now suspending kjournald\n");
spin_unlock(&journal->j_state_lock);
refrigerator();
spin_lock(&journal->j_state_lock);
} else {
/*
* We assume on resume that commits are already there,
* so we don't sleep
*/
DEFINE_WAIT(wait);
int should_sleep = 1;
prepare_to_wait(&journal->j_wait_commit, &wait,
TASK_INTERRUPTIBLE);
if (journal->j_commit_sequence != journal->j_commit_request)
should_sleep = 0;
transaction = journal->j_running_transaction;
if (transaction && time_after_eq(jiffies,
transaction->t_expires))
should_sleep = 0;
if (journal->j_flags & JFS_UNMOUNT)
should_sleep = 0;
if (should_sleep) {
spin_unlock(&journal->j_state_lock);
schedule();
spin_lock(&journal->j_state_lock);
}
finish_wait(&journal->j_wait_commit, &wait);
}
jbd_debug(1, "kjournald wakes\n");
/*
* Were we woken up by a commit wakeup event?
*/
transaction = journal->j_running_transaction;
if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
journal->j_commit_request = transaction->t_tid;
jbd_debug(1, "woke because of timeout\n");
}
goto loop;
end_loop:
spin_unlock(&journal->j_state_lock);
del_timer_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
jbd_debug(1, "Journal thread exiting.\n");
return 0;
}
static void journal_start_thread(journal_t *journal)
{
kthread_run(kjournald, journal, "kjournald");
wait_event(journal->j_wait_done_commit, journal->j_task != 0);
}
static void journal_kill_thread(journal_t *journal)
{
spin_lock(&journal->j_state_lock);
journal->j_flags |= JFS_UNMOUNT;
while (journal->j_task) {
wake_up(&journal->j_wait_commit);
spin_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_done_commit, journal->j_task == 0);
spin_lock(&journal->j_state_lock);
}
spin_unlock(&journal->j_state_lock);
}
/*
* journal_write_metadata_buffer: write a metadata buffer to the journal.
*
* Writes a metadata buffer to a given disk block. The actual IO is not
* performed but a new buffer_head is constructed which labels the data
* to be written with the correct destination disk block.
*
* Any magic-number escaping which needs to be done will cause a
* copy-out here. If the buffer happens to start with the
* JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
* magic number is only written to the log for descriptor blocks. In
* this case, we copy the data and replace the first word with 0, and we
* return a result code which indicates that this buffer needs to be
* marked as an escaped buffer in the corresponding log descriptor
* block. The missing word can then be restored when the block is read
* during recovery.
*
* If the source buffer has already been modified by a new transaction
* since we took the last commit snapshot, we use the frozen copy of
* that data for IO. If we end up using the existing buffer_head's data
* for the write, then we *have* to lock the buffer to prevent anyone
* else from using and possibly modifying it while the IO is in
* progress.
*
* The function returns a pointer to the buffer_heads to be used for IO.
*
* We assume that the journal has already been locked in this function.
*
* Return value:
* <0: Error
* >=0: Finished OK
*
* On success:
* Bit 0 set == escape performed on the data
* Bit 1 set == buffer copy-out performed (kfree the data after IO)
*/
int journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
struct journal_head **jh_out,
unsigned long blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
struct journal_head *new_jh;
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
/*
* The buffer really shouldn't be locked: only the current committing
* transaction is allowed to write it, so nobody else is allowed
* to do any IO.
*
* akpm: except if we're journalling data, and write() output is
* also part of a shared mapping, and another thread has
* decided to launch a writepage() against this buffer.
*/
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
jbd_lock_bh_state(bh_in);
repeat:
if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
}
mapped_data = kmap_atomic(new_page, KM_USER0);
/*
* Check for escaping
*/
if (*((__be32 *)(mapped_data + new_offset)) ==
cpu_to_be32(JFS_MAGIC_NUMBER)) {
need_copy_out = 1;
do_escape = 1;
}
kunmap_atomic(mapped_data, KM_USER0);
/*
* Do we need to do a data copy?
*/
if (need_copy_out && !done_copy_out) {
char *tmp;
jbd_unlock_bh_state(bh_in);
tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
jbd_lock_bh_state(bh_in);
if (jh_in->b_frozen_data) {
jbd_slab_free(tmp, bh_in->b_size);
goto repeat;
}
jh_in->b_frozen_data = tmp;
mapped_data = kmap_atomic(new_page, KM_USER0);
memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
kunmap_atomic(mapped_data, KM_USER0);
new_page = virt_to_page(tmp);
new_offset = offset_in_page(tmp);
done_copy_out = 1;
}
/*
* Did we need to escape the data? Now that we've done all the
* copying, we can finally do so.
*/
if (do_escape) {
mapped_data = kmap_atomic(new_page, KM_USER0);
*((unsigned int *)(mapped_data + new_offset)) = 0;
kunmap_atomic(mapped_data, KM_USER0);
}
/* keep subsequent assertions sane */
new_bh->b_state = 0;
init_buffer(new_bh, NULL, NULL);
atomic_set(&new_bh->b_count, 1);
jbd_unlock_bh_state(bh_in);
new_jh = journal_add_journal_head(new_bh); /* This sleeps */
set_bh_page(new_bh, new_page, new_offset);
new_jh->b_transaction = NULL;
new_bh->b_size = jh2bh(jh_in)->b_size;
new_bh->b_bdev = transaction->t_journal->j_dev;
new_bh->b_blocknr = blocknr;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
*jh_out = new_jh;
/*
* The to-be-written buffer needs to get moved to the io queue,
* and the original buffer whose contents we are shadowing or
* copying is moved to the transaction's shadow queue.
*/
JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
journal_file_buffer(jh_in, transaction, BJ_Shadow);
JBUFFER_TRACE(new_jh, "file as BJ_IO");
journal_file_buffer(new_jh, transaction, BJ_IO);
return do_escape | (done_copy_out << 1);
}
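/*
 * Decoding the return value, as consumed by journal_commit_transaction():
 * bit 0 (flags & 1) means the data was escaped, so JFS_FLAG_ESCAPE must
 * be set in the descriptor tag; bit 1 (flags & 2) means a frozen copy
 * was made and used for the IO in place of the original buffer data.
 */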
/*
* Allocation code for the journal file. Manage the space left in the
* journal, so that we can begin checkpointing when appropriate.
*/
/*
* __log_space_left: Return the number of free blocks left in the journal.
*
* Called with the journal already locked.
*
* Called under j_state_lock
*/
int __log_space_left(journal_t *journal)
{
int left = journal->j_free;
assert_spin_locked(&journal->j_state_lock);
/*
* Be pessimistic here about the number of those free blocks which
* might be required for log descriptor control blocks.
*/
#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
left -= MIN_LOG_RESERVED_BLOCKS;
if (left <= 0)
return 0;
left -= (left >> 3);
return left;
}
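/*
 * Hypothetical example of the estimate above: with j_free = 1000,
 * left = 1000 - 32 = 968 after the descriptor reserve, and then
 * 968 - (968 >> 3) = 968 - 121 = 847 blocks are reported free.
 */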
/*
* Called under j_state_lock. Returns true if a transaction was started.
*/
int __log_start_commit(journal_t *journal, tid_t target)
{
/*
* Are we already doing a recent enough commit?
*/
if (!tid_geq(journal->j_commit_request, target)) {
/*
* We want a new commit: OK, mark the request and wake up the
* commit thread. We do _not_ do the commit ourselves.
*/
journal->j_commit_request = target;
jbd_debug(1, "JBD: requesting commit %d/%d\n",
journal->j_commit_request,
journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
return 1;
}
return 0;
}
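/*
 * tid_geq() above (defined in jbd.h) compares transaction IDs through a
 * signed difference, so the "recent enough" test stays correct even
 * after the 32-bit tid counter wraps around.
 */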
int log_start_commit(journal_t *journal, tid_t tid)
{
int ret;
spin_lock(&journal->j_state_lock);
ret = __log_start_commit(journal, tid);
spin_unlock(&journal->j_state_lock);
return ret;
}
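/*
 * Illustrative sketch: the tid_gt()/tid_geq() comparisons used here are
 * wraparound-safe sequence-number comparisons (jbd.h implements them
 * with a signed difference), e.g.:
 */
static inline int demo_tid_gt(unsigned int x, unsigned int y)
{
        int difference = (int)(x - y);
        return difference > 0;  /* holds even just after x wraps past 0xffffffff */
}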
/*
* Force and wait upon a commit if the calling process is not within a
* transaction. This is used for forcing out undo-protected data which contains
* bitmaps, when the fs is running out of space.
*
* We can only force the running transaction if we don't have an active handle;
* otherwise, we will deadlock.
*
* Returns true if a transaction was started.
*/
int journal_force_commit_nested(journal_t *journal)
{
transaction_t *transaction = NULL;
tid_t tid;
spin_lock(&journal->j_state_lock);
if (journal->j_running_transaction && !current->journal_info) {
transaction = journal->j_running_transaction;
__log_start_commit(journal, transaction->t_tid);
} else if (journal->j_committing_transaction)
transaction = journal->j_committing_transaction;
if (!transaction) {
spin_unlock(&journal->j_state_lock);
return 0; /* Nothing to retry */
}
tid = transaction->t_tid;
spin_unlock(&journal->j_state_lock);
log_wait_commit(journal, tid);
return 1;
}
/*
* Start a commit of the current running transaction (if any). Returns true
* if a transaction was started, and fills its tid in at *ptid
*/
int journal_start_commit(journal_t *journal, tid_t *ptid)
{
int ret = 0;
spin_lock(&journal->j_state_lock);
if (journal->j_running_transaction) {
tid_t tid = journal->j_running_transaction->t_tid;
ret = __log_start_commit(journal, tid);
if (ret && ptid)
*ptid = tid;
} else if (journal->j_committing_transaction && ptid) {
/*
* If ext3_write_super() recently started a commit, then we
* have to wait for completion of that transaction
*/
*ptid = journal->j_committing_transaction->t_tid;
ret = 1;
}
spin_unlock(&journal->j_state_lock);
return ret;
}
/*
* Wait for a specified commit to complete.
* The caller may not hold the journal lock.
*/
int log_wait_commit(journal_t *journal, tid_t tid)
{
int err = 0;
#ifdef CONFIG_JBD_DEBUG
spin_lock(&journal->j_state_lock);
if (!tid_geq(journal->j_commit_request, tid)) {
printk(KERN_EMERG
"%s: error: j_commit_request=%d, tid=%d\n",
__FUNCTION__, journal->j_commit_request, tid);
}
spin_unlock(&journal->j_state_lock);
#endif
spin_lock(&journal->j_state_lock);
while (tid_gt(tid, journal->j_commit_sequence)) {
jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
tid, journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
spin_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_done_commit,
!tid_gt(tid, journal->j_commit_sequence));
spin_lock(&journal->j_state_lock);
}
spin_unlock(&journal->j_state_lock);
if (unlikely(is_journal_aborted(journal))) {
printk(KERN_EMERG "journal commit I/O error\n");
err = -EIO;
}
return err;
}
/*
* Log buffer allocation routines:
*/
int journal_next_log_block(journal_t *journal, unsigned long *retp)
{
unsigned long blocknr;
spin_lock(&journal->j_state_lock);
J_ASSERT(journal->j_free > 1);
blocknr = journal->j_head;
journal->j_head++;
journal->j_free--;
if (journal->j_head == journal->j_last)
journal->j_head = journal->j_first;
spin_unlock(&journal->j_state_lock);
return journal_bmap(journal, blocknr, retp);
}
/*
* Conversion of logical to physical block numbers for the journal
*
* On external journals the journal blocks are identity-mapped, so
* this is a no-op. If needed, we can use j_blk_offset - everything is
* ready.
*/
int journal_bmap(journal_t *journal, unsigned long blocknr,
unsigned long *retp)
{
int err = 0;
unsigned long ret;
if (journal->j_inode) {
ret = bmap(journal->j_inode, blocknr);
if (ret)
*retp = ret;
else {
char b[BDEVNAME_SIZE];
printk(KERN_ALERT "%s: journal block not found "
"at offset %lu on %s\n",
__FUNCTION__,
blocknr,
bdevname(journal->j_dev, b));
err = -EIO;
__journal_abort_soft(journal, err);
}
} else {
*retp = blocknr; /* +journal->j_blk_offset */
}
return err;
}
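/*
 * Usage sketch for journal_bmap(): translate a logical log block into a
 * device block before reading it, as jread() in recovery.c does:
 *
 *      unsigned long phys;
 *      if (journal_bmap(journal, offset, &phys) == 0)
 *              bh = __getblk(journal->j_dev, phys, journal->j_blocksize);
 */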
/*
* We play buffer_head aliasing tricks to write data/metadata blocks to
* the journal without copying their contents, but for journal
* descriptor blocks we do need to generate bona fide buffers.
*
* After the caller of journal_get_descriptor_buffer() has finished modifying
* the buffer's contents they really should run flush_dcache_page(bh->b_page).
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
unsigned long blocknr;
int err;
err = journal_next_log_block(journal, &blocknr);
if (err)
return NULL;
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
return journal_add_journal_head(bh);
}
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
* journal blocks from disk. */
/* First: create and set up a journal_t object in memory. We initialise
* very few fields yet: that has to wait until we have created the
* journal structures from scratch, or loaded them from disk. */
static journal_t * journal_init_common (void)
{
journal_t *journal;
int err;
journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
if (!journal)
goto fail;
memset(journal, 0, sizeof(*journal));
init_waitqueue_head(&journal->j_wait_transaction_locked);
init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
spin_lock_init(&journal->j_list_lock);
spin_lock_init(&journal->j_state_lock);
journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JFS_ABORT;
/* Set up a default-sized revoke table for the new mount. */
err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
if (err) {
kfree(journal);
goto fail;
}
return journal;
fail:
return NULL;
}
/* journal_init_dev and journal_init_inode:
*
* Create a journal structure assigned some fixed set of disk blocks to
* the journal. We don't actually touch those disk blocks yet, but we
* need to set up all of the mapping information to tell the journaling
* system where the journal blocks are.
*
*/
/**
* journal_t * journal_init_dev() - creates and initialises a journal structure
* @bdev: Block device on which to create the journal
* @fs_dev: Device which holds the journalled filesystem for this journal.
* @start: Block nr at which the journal starts.
* @len: Length of the journal in blocks.
* @blocksize: blocksize of journalling device
* @returns: a newly created journal_t *
*
* journal_init_dev creates a journal which maps a fixed contiguous
* range of blocks on an arbitrary block device.
*
*/
journal_t * journal_init_dev(struct block_device *bdev,
struct block_device *fs_dev,
int start, int len, int blocksize)
{
journal_t *journal = journal_init_common();
struct buffer_head *bh;
int n;
if (!journal)
return NULL;
/* journal descriptor can store up to n blocks -bzzz */
journal->j_blocksize = blocksize;
n = journal->j_blocksize / sizeof(journal_block_tag_t);
journal->j_wbufsize = n;
journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
if (!journal->j_wbuf) {
printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
__FUNCTION__);
kfree(journal);
journal = NULL;
}
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_maxlen = len;
bh = __getblk(journal->j_dev, start, journal->j_blocksize);
J_ASSERT(bh != NULL);
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
return journal;
}
/**
* journal_t * journal_init_inode () - creates a journal which maps to an inode.
* @inode: An inode to create the journal in
*
* journal_init_inode creates a journal which maps an on-disk inode as
* the journal. The inode must exist already, must support bmap() and
* must have all data blocks preallocated.
*/
journal_t * journal_init_inode (struct inode *inode)
{
struct buffer_head *bh;
journal_t *journal = journal_init_common();
int err;
int n;
unsigned long blocknr;
if (!journal)
return NULL;
journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
journal->j_inode = inode;
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
journal, inode->i_sb->s_id, inode->i_ino,
(long long) inode->i_size,
inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
journal->j_blocksize = inode->i_sb->s_blocksize;
/* journal descriptor can store up to n blocks -bzzz */
n = journal->j_blocksize / sizeof(journal_block_tag_t);
journal->j_wbufsize = n;
journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
if (!journal->j_wbuf) {
printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
__FUNCTION__);
kfree(journal);
return NULL;
}
err = journal_bmap(journal, 0, &blocknr);
/* If that failed, give up */
if (err) {
printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
__FUNCTION__);
kfree(journal);
return NULL;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
J_ASSERT(bh != NULL);
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
return journal;
}
/*
* If the journal init or create aborts, we need to mark the journal
* superblock as being NULL to prevent the journal destroy from writing
* back a bogus superblock.
*/
static void journal_fail_superblock (journal_t *journal)
{
struct buffer_head *bh = journal->j_sb_buffer;
brelse(bh);
journal->j_sb_buffer = NULL;
}
/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session. We use this both when creating
* a journal, and after recovering an old journal to reset it for
* subsequent use.
*/
static int journal_reset(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
unsigned long first, last;
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
journal->j_first = first;
journal->j_last = last;
journal->j_head = first;
journal->j_tail = first;
journal->j_free = last - first;
journal->j_tail_sequence = journal->j_transaction_sequence;
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
journal->j_max_transaction_buffers = journal->j_maxlen / 4;
/* Add the dynamic fields and write it to disk. */
journal_update_superblock(journal, 1);
journal_start_thread(journal);
return 0;
}
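/*
 * Worked example for journal_reset(): with s_first == 1 and
 * s_maxlen == 1024 the log spans blocks [1, 1024), j_free starts at
 * 1023, and j_max_transaction_buffers is 1024 / 4 == 256.
 */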
/**
* int journal_create() - Initialise the new journal file
* @journal: Journal to create. This structure must have been initialised
*
* Given a journal_t structure which tells us which disk blocks we can
* use, create a new journal superblock and initialise all of the
* journal fields from scratch.
**/
int journal_create(journal_t *journal)
{
unsigned long blocknr;
struct buffer_head *bh;
journal_superblock_t *sb;
int i, err;
if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
printk (KERN_ERR "Journal length (%d blocks) too short.\n",
journal->j_maxlen);
journal_fail_superblock(journal);
return -EINVAL;
}
if (journal->j_inode == NULL) {
/*
* We don't know what block to start at!
*/
printk(KERN_EMERG
"%s: creation of journal on external device!\n",
__FUNCTION__);
BUG();
}
/* Zero out the entire journal on disk. We cannot afford to
have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
for (i = 0; i < journal->j_maxlen; i++) {
err = journal_bmap(journal, i, &blocknr);
if (err)
return err;
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
lock_buffer(bh);
memset (bh->b_data, 0, journal->j_blocksize);
BUFFER_TRACE(bh, "marking dirty");
mark_buffer_dirty(bh);
BUFFER_TRACE(bh, "marking uptodate");
set_buffer_uptodate(bh);
unlock_buffer(bh);
__brelse(bh);
}
sync_blockdev(journal->j_dev);
jbd_debug(1, "JBD: journal cleared.\n");
/* OK, fill in the initial static fields in the new superblock */
sb = journal->j_superblock;
sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
sb->s_first = cpu_to_be32(1);
journal->j_transaction_sequence = 1;
journal->j_flags &= ~JFS_ABORT;
journal->j_format_version = 2;
return journal_reset(journal);
}
/**
* void journal_update_superblock() - Update journal sb on disk.
* @journal: The journal to update.
* @wait: Set to '0' if you don't want to wait for IO completion.
*
* Update a journal's dynamic superblock fields and write it to disk,
* optionally waiting for the IO to complete.
*/
void journal_update_superblock(journal_t *journal, int wait)
{
journal_superblock_t *sb = journal->j_superblock;
struct buffer_head *bh = journal->j_sb_buffer;
/*
* As a special case, if the on-disk copy is already marked as needing
* no recovery (s_start == 0) and there are no outstanding transactions
* in the filesystem, then we can safely defer the superblock update
* until the next commit by setting JFS_FLUSHED. This avoids
* attempting a write to a potential-readonly device.
*/
if (sb->s_start == 0 && journal->j_tail_sequence ==
journal->j_transaction_sequence) {
jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
"(start %ld, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
goto out;
}
spin_lock(&journal->j_state_lock);
jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence, journal->j_errno);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
sb->s_start = cpu_to_be32(journal->j_tail);
sb->s_errno = cpu_to_be32(journal->j_errno);
spin_unlock(&journal->j_state_lock);
BUFFER_TRACE(bh, "marking dirty");
mark_buffer_dirty(bh);
if (wait)
sync_dirty_buffer(bh);
else
ll_rw_block(SWRITE, 1, &bh);
out:
/* If we have just flushed the log (by marking s_start==0), then
* any future commit will have to be careful to update the
* superblock again to re-record the true start of the log. */
spin_lock(&journal->j_state_lock);
if (sb->s_start)
journal->j_flags &= ~JFS_FLUSHED;
else
journal->j_flags |= JFS_FLUSHED;
spin_unlock(&journal->j_state_lock);
}
/*
* Read the superblock for a given journal, performing initial
* validation of the format.
*/
static int journal_get_superblock(journal_t *journal)
{
struct buffer_head *bh;
journal_superblock_t *sb;
int err = -EIO;
bh = journal->j_sb_buffer;
J_ASSERT(bh != NULL);
if (!buffer_uptodate(bh)) {
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
printk (KERN_ERR
"JBD: IO error reading journal superblock\n");
goto out;
}
}
sb = journal->j_superblock;
err = -EINVAL;
if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
printk(KERN_WARNING "JBD: no valid journal superblock found\n");
goto out;
}
switch(be32_to_cpu(sb->s_header.h_blocktype)) {
case JFS_SUPERBLOCK_V1:
journal->j_format_version = 1;
break;
case JFS_SUPERBLOCK_V2:
journal->j_format_version = 2;
break;
default:
printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
goto out;
}
if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
printk (KERN_WARNING "JBD: journal file too short\n");
goto out;
}
return 0;
out:
journal_fail_superblock(journal);
return err;
}
/*
* Load the on-disk journal superblock and read the key fields into the
* journal_t.
*/
static int load_superblock(journal_t *journal)
{
int err;
journal_superblock_t *sb;
err = journal_get_superblock(journal);
if (err)
return err;
sb = journal->j_superblock;
journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
journal->j_tail = be32_to_cpu(sb->s_start);
journal->j_first = be32_to_cpu(sb->s_first);
journal->j_last = be32_to_cpu(sb->s_maxlen);
journal->j_errno = be32_to_cpu(sb->s_errno);
return 0;
}
/**
* int journal_load() - Read journal from disk.
* @journal: Journal to act on.
*
* Given a journal_t structure which tells us which disk blocks contain
* a journal, read the journal from disk to initialise the in-memory
* structures.
*/
int journal_load(journal_t *journal)
{
int err;
journal_superblock_t *sb;
err = load_superblock(journal);
if (err)
return err;
sb = journal->j_superblock;
/* If this is a V2 superblock, then we have to check the
* features flags on it. */
if (journal->j_format_version >= 2) {
if ((sb->s_feature_ro_compat &
~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
(sb->s_feature_incompat &
~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
printk (KERN_WARNING
"JBD: Unrecognised features on journal\n");
return -EINVAL;
}
}
/*
* Create a slab for this blocksize
*/
err = journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
if (err)
return err;
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
if (journal_recover(journal))
goto recovery_error;
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
if (journal_reset(journal))
goto recovery_error;
journal->j_flags &= ~JFS_ABORT;
journal->j_flags |= JFS_LOADED;
return 0;
recovery_error:
printk (KERN_WARNING "JBD: recovery failed\n");
return -EIO;
}
/**
* void journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
* Release a journal_t structure once it is no longer in use by the
* journaled object.
*/
void journal_destroy(journal_t *journal)
{
/* Wait for the commit thread to wake up and die. */
journal_kill_thread(journal);
/* Force a final log commit */
if (journal->j_running_transaction)
journal_commit_transaction(journal);
/* Force any old transactions to disk */
/* Totally anal locking here... */
spin_lock(&journal->j_list_lock);
while (journal->j_checkpoint_transactions != NULL) {
spin_unlock(&journal->j_list_lock);
log_do_checkpoint(journal);
spin_lock(&journal->j_list_lock);
}
J_ASSERT(journal->j_running_transaction == NULL);
J_ASSERT(journal->j_committing_transaction == NULL);
J_ASSERT(journal->j_checkpoint_transactions == NULL);
spin_unlock(&journal->j_list_lock);
/* We can now mark the journal as empty. */
journal->j_tail = 0;
journal->j_tail_sequence = ++journal->j_transaction_sequence;
if (journal->j_sb_buffer) {
journal_update_superblock(journal, 1);
brelse(journal->j_sb_buffer);
}
if (journal->j_inode)
iput(journal->j_inode);
if (journal->j_revoke)
journal_destroy_revoke(journal);
kfree(journal->j_wbuf);
kfree(journal);
}
/**
* int journal_check_used_features() - Check if the specified features are used.
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
* @incompat: bitmask of incompatible features
*
* Check whether the journal uses all of a given set of
* features. Return true (non-zero) if it does.
**/
int journal_check_used_features (journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
journal_superblock_t *sb;
if (!compat && !ro && !incompat)
return 1;
if (journal->j_format_version == 1)
return 0;
sb = journal->j_superblock;
if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
return 1;
return 0;
}
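/*
 * Usage sketch: journal_check_used_features(journal, 0, 0,
 * JFS_FEATURE_INCOMPAT_REVOKE) returns non-zero only when the on-disk
 * superblock already carries the revoke incompat bit (the flag is
 * defined in jbd.h).
 */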
/**
* int journal_check_available_features() - Check feature set in journalling layer
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
* @incompat: bitmask of incompatible features
*
* Check whether the journaling code supports the use of
* all of a given set of features on this journal. Return true
* (non-zero) if it can. */
int journal_check_available_features (journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
journal_superblock_t *sb;
if (!compat && !ro && !incompat)
return 1;
sb = journal->j_superblock;
/* We can support any known requested features iff the
* superblock is in version 2. Otherwise we fail to support any
* extended sb features. */
if (journal->j_format_version != 2)
return 0;
if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
(ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
(incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
return 1;
return 0;
}
/**
* int journal_set_features () - Mark a given journal feature in the superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
* @incompat: bitmask of incompatible features
*
* Mark a given journal feature as present on the
* superblock. Returns true if the requested features could be set.
*
*/
int journal_set_features (journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
journal_superblock_t *sb;
if (journal_check_used_features(journal, compat, ro, incompat))
return 1;
if (!journal_check_available_features(journal, compat, ro, incompat))
return 0;
jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
return 1;
}
/**
* int journal_update_format () - Update on-disk journal structure.
* @journal: Journal to act on.
*
* Given an initialised but unloaded journal struct, poke about in the
* on-disk structure to update it to the most recent supported version.
*/
int journal_update_format (journal_t *journal)
{
journal_superblock_t *sb;
int err;
err = journal_get_superblock(journal);
if (err)
return err;
sb = journal->j_superblock;
switch (be32_to_cpu(sb->s_header.h_blocktype)) {
case JFS_SUPERBLOCK_V2:
return 0;
case JFS_SUPERBLOCK_V1:
return journal_convert_superblock_v1(journal, sb);
default:
break;
}
return -EINVAL;
}
static int journal_convert_superblock_v1(journal_t *journal,
journal_superblock_t *sb)
{
int offset, blocksize;
struct buffer_head *bh;
printk(KERN_WARNING
"JBD: Converting superblock from version 1 to 2.\n");
/* Pre-initialise new fields to zero */
offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
blocksize = be32_to_cpu(sb->s_blocksize);
memset(&sb->s_feature_compat, 0, blocksize-offset);
sb->s_nr_users = cpu_to_be32(1);
sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
journal->j_format_version = 2;
bh = journal->j_sb_buffer;
BUFFER_TRACE(bh, "marking dirty");
mark_buffer_dirty(bh);
sync_dirty_buffer(bh);
return 0;
}
/**
* int journal_flush () - Flush journal
* @journal: Journal to act on.
*
* Flush all data for a given journal to disk and empty the journal.
* Filesystems can use this when remounting readonly to ensure that
* recovery does not need to happen on remount.
*/
int journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
unsigned long old_tail;
spin_lock(&journal->j_state_lock);
/* Force everything buffered to the log... */
if (journal->j_running_transaction) {
transaction = journal->j_running_transaction;
__log_start_commit(journal, transaction->t_tid);
} else if (journal->j_committing_transaction)
transaction = journal->j_committing_transaction;
/* Wait for the log commit to complete... */
if (transaction) {
tid_t tid = transaction->t_tid;
spin_unlock(&journal->j_state_lock);
log_wait_commit(journal, tid);
} else {
spin_unlock(&journal->j_state_lock);
}
/* ...and flush everything in the log out to disk. */
spin_lock(&journal->j_list_lock);
while (!err && journal->j_checkpoint_transactions != NULL) {
spin_unlock(&journal->j_list_lock);
err = log_do_checkpoint(journal);
spin_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
cleanup_journal_tail(journal);
/* Finally, mark the journal as really needing no recovery.
* This sets s_start==0 in the underlying superblock, which is
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
spin_lock(&journal->j_state_lock);
old_tail = journal->j_tail;
journal->j_tail = 0;
spin_unlock(&journal->j_state_lock);
journal_update_superblock(journal, 1);
spin_lock(&journal->j_state_lock);
journal->j_tail = old_tail;
J_ASSERT(!journal->j_running_transaction);
J_ASSERT(!journal->j_committing_transaction);
J_ASSERT(!journal->j_checkpoint_transactions);
J_ASSERT(journal->j_head == journal->j_tail);
J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
spin_unlock(&journal->j_state_lock);
return err;
}
/**
* int journal_wipe() - Wipe journal contents
* @journal: Journal to act on.
* @write: flag (see below)
*
* Wipe out all of the contents of a journal, safely. This will produce
* a warning if the journal contains any valid recovery information.
* Must be called between journal_init_*() and journal_load().
*
* If 'write' is non-zero, then we wipe out the journal on disk; otherwise
* we merely suppress recovery.
*/
int journal_wipe(journal_t *journal, int write)
{
journal_superblock_t *sb;
int err = 0;
J_ASSERT (!(journal->j_flags & JFS_LOADED));
err = load_superblock(journal);
if (err)
return err;
sb = journal->j_superblock;
if (!journal->j_tail)
goto no_recovery;
printk (KERN_WARNING "JBD: %s recovery information on journal\n",
write ? "Clearing" : "Ignoring");
err = journal_skip_recovery(journal);
if (write)
journal_update_superblock(journal, 1);
no_recovery:
return err;
}
/*
* journal_dev_name: format a character string describing the device
* on which this journal is present.
*/
static const char *journal_dev_name(journal_t *journal, char *buffer)
{
struct block_device *bdev;
if (journal->j_inode)
bdev = journal->j_inode->i_sb->s_bdev;
else
bdev = journal->j_dev;
return bdevname(bdev, buffer);
}
/*
* Journal abort has very specific semantics, which we describe below
* in the documentation for journal_abort().
*
* Two internal functions, which provide abort to the jbd layer
* itself, are here.
*/
/*
* Quick version for internal journal use (doesn't lock the journal).
* Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
* and don't attempt to make any other journal updates.
*/
void __journal_abort_hard(journal_t *journal)
{
transaction_t *transaction;
char b[BDEVNAME_SIZE];
if (journal->j_flags & JFS_ABORT)
return;
printk(KERN_ERR "Aborting journal on device %s.\n",
journal_dev_name(journal, b));
spin_lock(&journal->j_state_lock);
journal->j_flags |= JFS_ABORT;
transaction = journal->j_running_transaction;
if (transaction)
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
}
/* Soft abort: record the abort error status in the journal superblock,
* but don't do any other IO. */
static void __journal_abort_soft (journal_t *journal, int errno)
{
if (journal->j_flags & JFS_ABORT)
return;
if (!journal->j_errno)
journal->j_errno = errno;
__journal_abort_hard(journal);
if (errno)
journal_update_superblock(journal, 1);
}
/**
* void journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
* @errno: an error number to record in the journal indicating
* the reason for the shutdown.
*
* Perform a complete, immediate shutdown of the ENTIRE
* journal (not of a single transaction). This operation cannot be
* undone without closing and reopening the journal.
*
* The journal_abort function is intended to support higher level error
* recovery mechanisms such as the ext2/ext3 remount-readonly error
* mode.
*
* Journal abort has very specific semantics. Any existing dirty,
* unjournaled buffers in the main filesystem will still be written to
* disk by bdflush, but the journaling mechanism will be suspended
* immediately and no further transaction commits will be honoured.
*
* Any dirty, journaled buffers will be written back to disk without
* hitting the journal. Atomicity cannot be guaranteed on an aborted
* filesystem, but we _do_ attempt to leave as much data as possible
* behind for fsck to use for cleanup.
*
* Any attempt to get a new transaction handle on a journal which is in
* ABORT state will just result in an -EROFS error return. A
* journal_stop on an existing handle will return -EIO if we have
* entered abort state during the update.
*
* Recursive transactions are not disturbed by journal abort until the
* final journal_stop, which will receive the -EIO error.
*
* Finally, the journal_abort call allows the caller to supply an errno
* which will be recorded (if possible) in the journal superblock. This
* allows a client to record failure conditions in the middle of a
* transaction without having to complete the transaction to record the
* failure to disk. ext3_error, for example, now uses this
* functionality.
*
* Errors which originate from within the journaling layer will NOT
* supply an errno; a null errno implies that absolutely no further
* writes are done to the journal (unless there are any already in
* progress).
*
*/
void journal_abort(journal_t *journal, int errno)
{
__journal_abort_soft(journal, errno);
}
/**
* int journal_errno () - returns the journal's error state.
* @journal: journal to examine.
*
* This is the errno number set with journal_abort(), the last
* time the journal was mounted - if the journal was stopped
* without calling abort this will be 0.
*
* If the journal has been aborted on this mount, -EROFS will
* be returned.
*/
int journal_errno(journal_t *journal)
{
int err;
spin_lock(&journal->j_state_lock);
if (journal->j_flags & JFS_ABORT)
err = -EROFS;
else
err = journal->j_errno;
spin_unlock(&journal->j_state_lock);
return err;
}
/**
* int journal_clear_err () - clears the journal's error state
* @journal: journal to act on.
*
* An error must be cleared or Acked to take a FS out of readonly
* mode.
*/
int journal_clear_err(journal_t *journal)
{
int err = 0;
spin_lock(&journal->j_state_lock);
if (journal->j_flags & JFS_ABORT)
err = -EROFS;
else
journal->j_errno = 0;
spin_unlock(&journal->j_state_lock);
return err;
}
/**
* void journal_ack_err() - Ack journal err.
* @journal: journal to act on.
*
* An error must be cleared or Acked to take a FS out of readonly
* mode.
*/
void journal_ack_err(journal_t *journal)
{
spin_lock(&journal->j_state_lock);
if (journal->j_errno)
journal->j_flags |= JFS_ACK_ERR;
spin_unlock(&journal->j_state_lock);
}
int journal_blocks_per_page(struct inode *inode)
{
return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
}
/*
* Simple support for retrying memory allocations. Introduced to help to
* debug different VM deadlock avoidance strategies.
*/
void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
{
return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
}
/*
* jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
* and allocate frozen and commit buffers from these slabs.
*
* The reason for doing this is to avoid SLAB_DEBUG, since it could
* cause a bh to cross a page boundary.
*/
#define JBD_MAX_SLABS 5
#define JBD_SLAB_INDEX(size) (size >> 11)
static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
static const char *jbd_slab_names[JBD_MAX_SLABS] = {
"jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
};
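/*
 * JBD_SLAB_INDEX(size) is size >> 11, so the supported buffer sizes map
 * to: 1024 -> 0, 2048 -> 1, 4096 -> 2, 8192 -> 4. Index 3 (a 6k size
 * that never occurs) is unused, hence the NULL entry above.
 */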
static void journal_destroy_jbd_slabs(void)
{
int i;
for (i = 0; i < JBD_MAX_SLABS; i++) {
if (jbd_slab[i])
kmem_cache_destroy(jbd_slab[i]);
jbd_slab[i] = NULL;
}
}
static int journal_create_jbd_slab(size_t slab_size)
{
int i = JBD_SLAB_INDEX(slab_size);
BUG_ON(i >= JBD_MAX_SLABS);
/*
* Check if we already have a slab created for this size
*/
if (jbd_slab[i])
return 0;
/*
* Create a slab and force alignment to be the same as the slab size -
* this will make sure that allocations won't cross the page
* boundary.
*/
jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
slab_size, slab_size, 0, NULL, NULL);
if (!jbd_slab[i]) {
printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
return -ENOMEM;
}
return 0;
}
void * jbd_slab_alloc(size_t size, gfp_t flags)
{
int idx;
idx = JBD_SLAB_INDEX(size);
BUG_ON(jbd_slab[idx] == NULL);
return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
}
void jbd_slab_free(void *ptr, size_t size)
{
int idx;
idx = JBD_SLAB_INDEX(size);
BUG_ON(jbd_slab[idx] == NULL);
kmem_cache_free(jbd_slab[idx], ptr);
}
/*
* Journal_head storage management
*/
static kmem_cache_t *journal_head_cache;
#ifdef CONFIG_JBD_DEBUG
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif
static int journal_init_journal_head_cache(void)
{
int retval;
J_ASSERT(journal_head_cache == 0);
journal_head_cache = kmem_cache_create("journal_head",
sizeof(struct journal_head),
0, /* offset */
0, /* flags */
NULL, /* ctor */
NULL); /* dtor */
retval = 0;
if (journal_head_cache == 0) {
retval = -ENOMEM;
printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
}
return retval;
}
static void journal_destroy_journal_head_cache(void)
{
J_ASSERT(journal_head_cache != NULL);
kmem_cache_destroy(journal_head_cache);
journal_head_cache = NULL;
}
/*
* journal_head splicing and dicing
*/
static struct journal_head *journal_alloc_journal_head(void)
{
struct journal_head *ret;
static unsigned long last_warning;
#ifdef CONFIG_JBD_DEBUG
atomic_inc(&nr_journal_heads);
#endif
ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
if (ret == 0) {
jbd_debug(1, "out of memory for journal_head\n");
if (time_after(jiffies, last_warning + 5*HZ)) {
printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
__FUNCTION__);
last_warning = jiffies;
}
while (ret == 0) {
yield();
ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
}
}
return ret;
}
static void journal_free_journal_head(struct journal_head *jh)
{
#ifdef CONFIG_JBD_DEBUG
atomic_dec(&nr_journal_heads);
memset(jh, JBD_POISON_FREE, sizeof(*jh));
#endif
kmem_cache_free(journal_head_cache, jh);
}
/*
* A journal_head is attached to a buffer_head whenever JBD has an
* interest in the buffer.
*
* Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
* is set. This bit is tested in core kernel code where we need to take
* JBD-specific actions. Testing the zeroness of ->b_private is not reliable
* there.
*
* When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
*
* When a buffer has its BH_JBD bit set it is immune from being released by
* core kernel code, mainly via ->b_count.
*
* A journal_head may be detached from its buffer_head when the journal_head's
* b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
* Various places in JBD call journal_remove_journal_head() to indicate that the
* journal_head can be dropped if needed.
*
* Various places in the kernel want to attach a journal_head to a buffer_head
* _before_ attaching the journal_head to a transaction. To protect the
* journal_head in this situation, journal_add_journal_head elevates the
* journal_head's b_jcount refcount by one. The caller must call
* journal_put_journal_head() to undo this.
*
* So the typical usage would be:
*
* (Attach a journal_head if needed. Increments b_jcount)
* struct journal_head *jh = journal_add_journal_head(bh);
* ...
* jh->b_transaction = xxx;
* journal_put_journal_head(jh);
*
* Now, the journal_head's b_jcount is zero, but it is safe from being released
* because it has a non-zero b_transaction.
*/
/*
* Give a buffer_head a journal_head.
*
* Doesn't need the journal lock.
* May sleep.
*/
struct journal_head *journal_add_journal_head(struct buffer_head *bh)
{
struct journal_head *jh;
struct journal_head *new_jh = NULL;
repeat:
if (!buffer_jbd(bh)) {
new_jh = journal_alloc_journal_head();
memset(new_jh, 0, sizeof(*new_jh));
}
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
jh = bh2jh(bh);
} else {
J_ASSERT_BH(bh,
(atomic_read(&bh->b_count) > 0) ||
(bh->b_page && bh->b_page->mapping));
if (!new_jh) {
jbd_unlock_bh_journal_head(bh);
goto repeat;
}
jh = new_jh;
new_jh = NULL; /* We consumed it */
set_buffer_jbd(bh);
bh->b_private = jh;
jh->b_bh = bh;
get_bh(bh);
BUFFER_TRACE(bh, "added journal_head");
}
jh->b_jcount++;
jbd_unlock_bh_journal_head(bh);
if (new_jh)
journal_free_journal_head(new_jh);
return bh->b_private;
}
/*
* Grab a ref against this buffer_head's journal_head. If it ended up not
* having a journal_head, return NULL
*/
struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
{
struct journal_head *jh = NULL;
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
jh = bh2jh(bh);
jh->b_jcount++;
}
jbd_unlock_bh_journal_head(bh);
return jh;
}
static void __journal_remove_journal_head(struct buffer_head *bh)
{
struct journal_head *jh = bh2jh(bh);
J_ASSERT_JH(jh, jh->b_jcount >= 0);
get_bh(bh);
if (jh->b_jcount == 0) {
if (jh->b_transaction == NULL &&
jh->b_next_transaction == NULL &&
jh->b_cp_transaction == NULL) {
J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
J_ASSERT_BH(bh, buffer_jbd(bh));
J_ASSERT_BH(bh, jh2bh(jh) == bh);
BUFFER_TRACE(bh, "remove journal_head");
if (jh->b_frozen_data) {
printk(KERN_WARNING "%s: freeing "
"b_frozen_data\n",
__FUNCTION__);
jbd_slab_free(jh->b_frozen_data, bh->b_size);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing "
"b_committed_data\n",
__FUNCTION__);
jbd_slab_free(jh->b_committed_data, bh->b_size);
}
bh->b_private = NULL;
jh->b_bh = NULL; /* debug, really */
clear_buffer_jbd(bh);
__brelse(bh);
journal_free_journal_head(jh);
} else {
BUFFER_TRACE(bh, "journal_head was locked");
}
}
}
/*
* journal_remove_journal_head(): if the buffer isn't attached to a transaction
* and has a zero b_jcount then remove and release its journal_head. If we did
* see that the buffer is not used by any transaction we also "logically"
* decrement ->b_count.
*
* We in fact take an additional increment on ->b_count as a convenience,
* because the caller usually wants to do additional things with the bh
* after calling here.
* The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
* time. Once the caller has run __brelse(), the buffer is eligible for
* reaping by try_to_free_buffers().
*/
void journal_remove_journal_head(struct buffer_head *bh)
{
jbd_lock_bh_journal_head(bh);
__journal_remove_journal_head(bh);
jbd_unlock_bh_journal_head(bh);
}
/*
* Drop a reference on the passed journal_head. If it fell to zero then try to
* release the journal_head from the buffer_head.
*/
void journal_put_journal_head(struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
jbd_lock_bh_journal_head(bh);
J_ASSERT_JH(jh, jh->b_jcount > 0);
--jh->b_jcount;
if (!jh->b_jcount && !jh->b_transaction) {
__journal_remove_journal_head(bh);
__brelse(bh);
}
jbd_unlock_bh_journal_head(bh);
}
/*
* /proc tunables
*/
#if defined(CONFIG_JBD_DEBUG)
int journal_enable_debug;
EXPORT_SYMBOL(journal_enable_debug);
#endif
#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
static struct proc_dir_entry *proc_jbd_debug;
static int read_jbd_debug(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
int ret;
ret = sprintf(page + off, "%d\n", journal_enable_debug);
*eof = 1;
return ret;
}
static int write_jbd_debug(struct file *file, const char __user *buffer,
unsigned long count, void *data)
{
char buf[32];
if (count > ARRAY_SIZE(buf) - 1)
count = ARRAY_SIZE(buf) - 1;
if (copy_from_user(buf, buffer, count))
return -EFAULT;
buf[count] = '\0';
journal_enable_debug = simple_strtoul(buf, NULL, 10);
return count;
}
#define JBD_PROC_NAME "sys/fs/jbd-debug"
static void __init create_jbd_proc_entry(void)
{
proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
if (proc_jbd_debug) {
/* Why is this so hard? */
proc_jbd_debug->read_proc = read_jbd_debug;
proc_jbd_debug->write_proc = write_jbd_debug;
}
}
static void __exit remove_jbd_proc_entry(void)
{
if (proc_jbd_debug)
remove_proc_entry(JBD_PROC_NAME, NULL);
}
#else
#define create_jbd_proc_entry() do {} while (0)
#define remove_jbd_proc_entry() do {} while (0)
#endif
kmem_cache_t *jbd_handle_cache;
static int __init journal_init_handle_cache(void)
{
jbd_handle_cache = kmem_cache_create("journal_handle",
sizeof(handle_t),
0, /* offset */
0, /* flags */
NULL, /* ctor */
NULL); /* dtor */
if (jbd_handle_cache == NULL) {
printk(KERN_EMERG "JBD: failed to create handle cache\n");
return -ENOMEM;
}
return 0;
}
static void journal_destroy_handle_cache(void)
{
if (jbd_handle_cache)
kmem_cache_destroy(jbd_handle_cache);
}
/*
* Module startup and shutdown
*/
static int __init journal_init_caches(void)
{
int ret;
ret = journal_init_revoke_caches();
if (ret == 0)
ret = journal_init_journal_head_cache();
if (ret == 0)
ret = journal_init_handle_cache();
return ret;
}
static void journal_destroy_caches(void)
{
journal_destroy_revoke_caches();
journal_destroy_journal_head_cache();
journal_destroy_handle_cache();
journal_destroy_jbd_slabs();
}
static int __init journal_init(void)
{
int ret;
BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
ret = journal_init_caches();
if (ret != 0)
journal_destroy_caches();
create_jbd_proc_entry();
return ret;
}
static void __exit journal_exit(void)
{
#ifdef CONFIG_JBD_DEBUG
int n = atomic_read(&nr_journal_heads);
if (n)
printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
#endif
remove_jbd_proc_entry();
journal_destroy_caches();
}
MODULE_LICENSE("GPL");
module_init(journal_init);
module_exit(journal_exit);
/*
* linux/fs/recovery.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal recovery routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#endif
/*
* Maintain information about the progress of the recovery job, so that
* the different passes can carry information between them.
*/
struct recovery_info
{
tid_t start_transaction;
tid_t end_transaction;
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
};
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
/* Release readahead buffers after use */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
while (--n >= 0)
brelse (b[n]);
}
/*
* When reading from the journal, we are going through the block device
* layer directly and so there is no readahead being done for us. We
* need to implement any readahead ourselves if we want it to happen at
* all. Recovery is basically one long sequential read, so make sure we
* do the IO in reasonably large chunks.
*
* This is not so critical that we need to be enormously clever about
* the readahead size, though. 128K is a purely arbitrary, good-enough
* fixed value.
*/
#define MAXBUF 8
static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
unsigned long blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
/* Do up to 128K of readahead */
max = start + (128 * 1024 / journal->j_blocksize);
if (max > journal->j_maxlen)
max = journal->j_maxlen;
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
* a time to the block device IO layer. */
nbufs = 0;
for (next = start; next < max; next++) {
err = journal_bmap(journal, next, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
next);
goto failed;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
goto failed;
}
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
ll_rw_block(READ, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
} else
brelse(bh);
}
if (nbufs)
ll_rw_block(READ, nbufs, bufs);
err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
return err;
}
#endif /* __KERNEL__ */
/*
* Read a block from the journal
*/
static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
unsigned long blocknr;
struct buffer_head *bh;
*bhp = NULL;
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD: corrupted journal superblock\n");
return -EIO;
}
err = journal_bmap(journal, offset, &blocknr);
if (err) {
printk (KERN_ERR "JBD: bad block at offset %u\n",
offset);
return err;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return -ENOMEM;
if (!buffer_uptodate(bh)) {
/* If this is a brand new buffer, start readahead.
Otherwise, we assume we are already reading it. */
if (!buffer_req(bh))
do_readahead(journal, offset);
wait_on_buffer(bh);
}
if (!buffer_uptodate(bh)) {
printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
offset);
brelse(bh);
return -EIO;
}
*bhp = bh;
return 0;
}
/*
* Count the number of in-use tags in a journal descriptor block.
*/
static int count_tags(struct buffer_head *bh, int size)
{
char * tagp;
journal_block_tag_t * tag;
int nr = 0;
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
tag = (journal_block_tag_t *) tagp;
nr++;
tagp += sizeof(journal_block_tag_t);
if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
tagp += 16;
if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
break;
}
return nr;
}
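/*
 * Descriptor-block layout assumed by count_tags(): a journal_header_t,
 * then a run of journal_block_tag_t entries; each tag is followed by a
 * 16-byte UUID unless JFS_FLAG_SAME_UUID is set, and the run ends at
 * the tag carrying JFS_FLAG_LAST_TAG or at the end of the block.
 */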
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
if (var >= (journal)->j_last) \
var -= ((journal)->j_last - (journal)->j_first); \
} while (0)
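/*
 * Worked example: with j_first == 1 and j_last == 1024, stepping to
 * block 1024 wraps back to block 1 (var -= 1024 - 1), so a scan cycles
 * through [j_first, j_last).
 */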
/**
* journal_recover - recovers an on-disk journal
* @journal: the journal to recover
*
* The primary function for recovering the log contents when mounting a
* journaled device.
*
* Recovery is done in three passes. In the first pass, we look for the
* end of the log. In the second, we assemble the list of revoke
* blocks. In the third and final pass, we replay any un-revoked blocks
* in the log.
*/
int journal_recover(journal_t *journal)
{
int err;
journal_superblock_t * sb;
struct recovery_info info;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
* unmounted.
*/
if (!sb->s_start) {
jbd_debug(1, "No recovery required, last transaction %d\n",
be32_to_cpu(sb->s_sequence));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
return 0;
}
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
jbd_debug(0, "JBD: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
journal_clear_revoke(journal);
sync_blockdev(journal->j_fs_dev);
return err;
}
/**
* journal_skip_recovery - Start journal and wipe existing records
* @journal: journal to startup
*
* Locate any valid recovery information from the journal and set up the
* journal structures in memory to ignore it (presumably because the
* caller has evidence that it is out of date).
* This function doesn't appear to be exported.
*
* We perform one pass over the journal to allow us to tell the user how
* much recovery information is being erased, and to let us initialise
* the journal transaction sequence numbers to the next unused ID.
*/
int journal_skip_recovery(journal_t *journal)
{
int err;
journal_superblock_t * sb;
struct recovery_info info;
memset (&info, 0, sizeof(info));
sb = journal->j_superblock;
err = do_one_pass(journal, &info, PASS_SCAN);
if (err) {
printk(KERN_ERR "JBD: error %d scanning journal\n", err);
++journal->j_transaction_sequence;
} else {
#ifdef CONFIG_JBD_DEBUG
int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
#endif
jbd_debug(0,
"JBD: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
journal->j_transaction_sequence = ++info.end_transaction;
}
journal->j_tail = 0;
return err;
}
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
unsigned long next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
/* Precompute the maximum metadata descriptors in a descriptor block */
int MAX_BLOCKS_PER_DESC;
MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
/ sizeof(journal_block_tag_t));
/*
* First thing is to establish what we expect to find in the log
* (in terms of transaction IDs), and where (in terms of log
* block offsets): query the superblock.
*/
sb = journal->j_superblock;
next_commit_ID = be32_to_cpu(sb->s_sequence);
next_log_block = be32_to_cpu(sb->s_start);
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
jbd_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
* making sure that each transaction has a commit block in the
* expected place. Each complete transaction gets replayed back
* into the main filesystem.
*/
while (1) {
int flags;
char * tagp;
journal_block_tag_t * tag;
struct buffer_head * obh;
struct buffer_head * nbh;
cond_resched(); /* We're under lock_kernel() */
/* If we already know where to stop the log traversal,
* check right now that we haven't gone past the end of
* the log. */
if (pass != PASS_SCAN)
if (tid_geq(next_commit_ID, info->end_transaction))
break;
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking for
* either the next descriptor block or the final commit
* record. */
jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
next_log_block++;
wrap(journal, next_log_block);
/* What kind of buffer is it?
*
* If it is a descriptor block, check that it has the
* expected sequence number. Otherwise, we're all done
* here. */
tmp = (journal_header_t *)bh->b_data;
if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
brelse(bh);
break;
}
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
if (sequence != next_commit_ID) {
brelse(bh);
break;
}
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
* to do with it? That depends on the pass... */
switch(blocktype) {
case JFS_DESCRIPTOR_BLOCK:
/* If it is a valid descriptor block, replay it
* in pass REPLAY; otherwise, just skip over the
* blocks it describes. */
if (pass != PASS_REPLAY) {
next_log_block +=
count_tags(bh, journal->j_blocksize);
wrap(journal, next_log_block);
brelse(bh);
continue;
}
/* A descriptor block: we can now write all of
* the data blocks. Yay, useful work is finally
* getting done here! */
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
/* Recover what we can, but
* report failure at the end. */
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
"block %ld in log\n",
err, io_block);
} else {
unsigned long blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
/* If the block has been
* revoked, then we're all done
* here. */
if (journal_test_revoke
(journal, blocknr,
next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
blocknr,
journal->j_blocksize);
if (nbh == NULL) {
printk(KERN_ERR
"JBD: Out of memory "
"during recovery.\n");
err = -ENOMEM;
brelse(bh);
brelse(obh);
goto failed;
}
lock_buffer(nbh);
memcpy(nbh->b_data, obh->b_data,
journal->j_blocksize);
if (flags & JFS_FLAG_ESCAPE) {
/* un-escape: restore the magic into the new buffer */
*((__be32 *)nbh->b_data) =
cpu_to_be32(JFS_MAGIC_NUMBER);
}
BUFFER_TRACE(nbh, "marking dirty");
set_buffer_uptodate(nbh);
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
/* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
}
skip_write:
tagp += sizeof(journal_block_tag_t);
if (!(flags & JFS_FLAG_SAME_UUID))
tagp += 16;
if (flags & JFS_FLAG_LAST_TAG)
break;
}
brelse(bh);
continue;
case JFS_COMMIT_BLOCK:
/* Found an expected commit block: not much to
* do other than move on to the next sequence
* number. */
brelse(bh);
next_commit_ID++;
continue;
case JFS_REVOKE_BLOCK:
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
brelse(bh);
continue;
}
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
brelse(bh);
if (err)
goto failed;
continue;
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
}
}
done:
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
* log. If the latter happened, then we know that the "current"
* transaction marks the end of the valid log.
*/
if (pass == PASS_SCAN)
info->end_transaction = next_commit_ID;
else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
printk (KERN_ERR "JBD: recovery pass %d ended at "
"transaction %u, expected %u\n",
pass, next_commit_ID, info->end_transaction);
if (!success)
success = -EIO;
}
}
return success;
failed:
return err;
}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
tid_t sequence, struct recovery_info *info)
{
journal_revoke_header_t *header;
int offset, max;
header = (journal_revoke_header_t *) bh->b_data;
offset = sizeof(journal_revoke_header_t);
max = be32_to_cpu(header->r_count);
while (offset < max) {
unsigned long blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
offset += 4;
err = journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
++info->nr_revokes;
}
return 0;
}
/*
* linux/fs/revoke.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 2000
*
* Copyright 2000 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal revoke routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*
* Revoke is the mechanism used to prevent old log records for deleted
* metadata from being replayed on top of newer data using the same
* blocks. The revoke mechanism is used in two separate places:
*
* + Commit: during commit we write the entire list of the current
* transaction's revoked blocks to the journal
*
* + Recovery: during recovery we record the transaction ID of all
* revoked blocks. If there are multiple revoke records in the log
* for a single block, only the last one counts, and if there is a log
* entry for a block beyond the last revoke, then that log entry still
* gets replayed.
*
* We can get interactions between revokes and new log data within a
* single transaction:
*
* Block is revoked and then journaled:
* The desired end result is the journaling of the new block, so we
* cancel the revoke before the transaction commits.
*
* Block is journaled and then revoked:
* The revoke must take precedence over the write of the block, so we
* need either to cancel the journal entry or to write the revoke
* later in the log than the log block. In this case, we choose the
* latter: journaling a block cancels any revoke record for that block
* in the current transaction, so any revoke for that block in the
* transaction must have happened after the block was journaled and so
* the revoke must take precedence.
*
* Block is revoked and then written as data:
* The data write is allowed to succeed, but the revoke is _not_
* cancelled. We still need to prevent old log records from
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
* RevokeValid set, Revoked clear:
* buffer has not been revoked, and cancel_revoke
* need do nothing.
* RevokeValid set, Revoked set:
* buffer has been revoked.
*/
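/*
 * A minimal sketch of how the tri-state is consulted
 * (buffer_revokevalid() and buffer_revoked() are the test macros for
 * the BH_RevokeValid and BH_Revoked bits; the logic mirrors
 * journal_cancel_revoke() below):
 *
 *	if (!buffer_revokevalid(bh))
 *		a full hash-table lookup is required
 *	else if (buffer_revoked(bh))
 *		the buffer has a pending revoke to cancel
 *	else
 *		nothing to do
 */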
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#endif
static kmem_cache_t *revoke_record_cache;
static kmem_cache_t *revoke_table_cache;
/* Each revoke record represents one single revoked block. During
journal replay, this involves recording the transaction ID of the
last transaction to revoke this block. */
struct jbd_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
unsigned long blocknr;
};
/* The revoke table is just a simple hash table of revoke records. */
struct jbd_revoke_table_s
{
/* It is conceivable that we might want a larger hash table
* for recovery. Must be a power of two. */
int hash_size;
int hash_shift;
struct list_head *hash_table;
};
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
struct journal_head **, int *,
struct jbd_revoke_record_s *);
static void flush_descriptor(journal_t *, struct journal_head *, int);
#endif
/* Utility functions to maintain the revoke table */
/* Borrowed from buffer.c: this is a tried and tested block hash function */
static inline int hash(journal_t *journal, unsigned long block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
int hash_shift = table->hash_shift;
return ((block << (hash_shift - 6)) ^
(block >> 13) ^
(block << (hash_shift - 12))) & (table->hash_size - 1);
}
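/*
 * Worked example (the table size and block number are illustrative):
 * with hash_size == 16384, journal_init_revoke() below computes
 * hash_shift == 14, so block 100000 lands in bucket
 *
 *	((100000 << 8) ^ (100000 >> 13) ^ (100000 << 2)) & 16383
 *
 * Note that the two left shifts are only well defined for
 * hash_shift >= 12; a smaller table would shift by a negative count,
 * which C leaves undefined.
 */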
static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
tid_t seq)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
repeat:
record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
if (!record)
goto oom;
record->sequence = seq;
record->blocknr = blocknr;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
list_add(&record->hash, hash_list);
spin_unlock(&journal->j_revoke_lock);
return 0;
oom:
if (!journal_oom_retry)
return -ENOMEM;
jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
yield();
goto repeat;
}
/* Find a revoke record in the journal's hash table. */
static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
unsigned long blocknr)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
spin_lock(&journal->j_revoke_lock);
record = (struct jbd_revoke_record_s *) hash_list->next;
while (&(record->hash) != hash_list) {
if (record->blocknr == blocknr) {
spin_unlock(&journal->j_revoke_lock);
return record;
}
record = (struct jbd_revoke_record_s *) record->hash.next;
}
spin_unlock(&journal->j_revoke_lock);
return NULL;
}
int __init journal_init_revoke_caches(void)
{
revoke_record_cache = kmem_cache_create("revoke_record",
sizeof(struct jbd_revoke_record_s),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!revoke_record_cache)
return -ENOMEM;
revoke_table_cache = kmem_cache_create("revoke_table",
sizeof(struct jbd_revoke_table_s),
0, 0, NULL, NULL);
if (!revoke_table_cache) {
kmem_cache_destroy(revoke_record_cache);
revoke_record_cache = NULL;
return -ENOMEM;
}
return 0;
}
void journal_destroy_revoke_caches(void)
{
kmem_cache_destroy(revoke_record_cache);
revoke_record_cache = NULL;
kmem_cache_destroy(revoke_table_cache);
revoke_table_cache = NULL;
}
/* Initialise the revoke table for a given journal to a given size. */
int journal_init_revoke(journal_t *journal, int hash_size)
{
int shift, tmp;
J_ASSERT (journal->j_revoke_table[0] == NULL);
shift = 0;
tmp = hash_size;
while((tmp >>= 1UL) != 0UL)
shift++;
journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
if (!journal->j_revoke_table[0])
return -ENOMEM;
journal->j_revoke = journal->j_revoke_table[0];
/* Check that the hash_size is a power of two */
J_ASSERT ((hash_size & (hash_size-1)) == 0);
journal->j_revoke->hash_size = hash_size;
journal->j_revoke->hash_shift = shift;
journal->j_revoke->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!journal->j_revoke->hash_table) {
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
journal->j_revoke = NULL;
return -ENOMEM;
}
for (tmp = 0; tmp < hash_size; tmp++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
if (!journal->j_revoke_table[1]) {
kfree(journal->j_revoke_table[0]->hash_table);
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
return -ENOMEM;
}
journal->j_revoke = journal->j_revoke_table[1];
/* Check that the hash_size is a power of two */
J_ASSERT ((hash_size & (hash_size-1)) == 0);
journal->j_revoke->hash_size = hash_size;
journal->j_revoke->hash_shift = shift;
journal->j_revoke->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!journal->j_revoke->hash_table) {
kfree(journal->j_revoke_table[0]->hash_table);
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
journal->j_revoke = NULL;
return -ENOMEM;
}
for (tmp = 0; tmp < hash_size; tmp++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
spin_lock_init(&journal->j_revoke_lock);
return 0;
}
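/*
 * Typical initialisation order, as a sketch (error handling
 * abbreviated; journal creation in journal.c passes
 * JOURNAL_REVOKE_DEFAULT_HASH, and hash_size must be a power of two
 * per the assertions above):
 *
 *	err = journal_init_revoke_caches();
 *	if (!err)
 *		err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
 */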
/* Destroy a journal's revoke table. The table must already be empty! */
void journal_destroy_revoke(journal_t *journal)
{
struct jbd_revoke_table_s *table;
struct list_head *hash_list;
int i;
table = journal->j_revoke_table[0];
if (!table)
return;
for (i=0; i<table->hash_size; i++) {
hash_list = &table->hash_table[i];
J_ASSERT (list_empty(hash_list));
}
kfree(table->hash_table);
kmem_cache_free(revoke_table_cache, table);
journal->j_revoke = NULL;
table = journal->j_revoke_table[1];
if (!table)
return;
for (i=0; i<table->hash_size; i++) {
hash_list = &table->hash_table[i];
J_ASSERT (list_empty(hash_list));
}
kfree(table->hash_table);
kmem_cache_free(revoke_table_cache, table);
journal->j_revoke = NULL;
}
#ifdef __KERNEL__
/*
* journal_revoke: revoke a given buffer_head from the journal. This
* prevents the block from being replayed during recovery if we take a
* crash after this current transaction commits. Any subsequent
* metadata writes of the buffer in this transaction cancel the
* revoke.
*
* Note that this call may block --- it is up to the caller to make
* sure that there are no further calls to journal_write_metadata
* before the revoke is complete. In ext3, this implies calling the
* revoke before clearing the block bitmap when we are deleting
* metadata.
*
* Revoke performs a journal_forget on any buffer_head passed in as a
* parameter, but does _not_ forget the buffer_head if the bh was only
* found implicitly.
*
* bh_in may not be a journalled buffer - it may have come off
* the hash tables without an attached journal_head.
*
* If bh_in is non-zero, journal_revoke() will decrement its b_count
* by one.
*/
int journal_revoke(handle_t *handle, unsigned long blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
journal_t *journal;
struct block_device *bdev;
int err;
might_sleep();
if (bh_in)
BUFFER_TRACE(bh_in, "enter");
journal = handle->h_transaction->t_journal;
if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
J_ASSERT (!"Cannot set revoke feature!");
return -EINVAL;
}
bdev = journal->j_fs_dev;
bh = bh_in;
if (!bh) {
bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
#ifdef JBD_EXPENSIVE_CHECKING
else {
struct buffer_head *bh2;
/* If there is a different buffer_head lying around in
* memory anywhere... */
bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
/* ...then it better be revoked too,
* since it's illegal to create a revoke
* record against a buffer_head which is
* not marked revoked --- that would
* risk missing a subsequent revoke
* cancel. */
J_ASSERT_BH(bh2, buffer_revoked(bh2));
put_bh(bh2);
}
}
#endif
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
if (bh) {
if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
"inconsistent data on disk")) {
if (!bh_in)
brelse(bh);
return -EIO;
}
set_buffer_revoked(bh);
set_buffer_revokevalid(bh);
if (bh_in) {
BUFFER_TRACE(bh_in, "call journal_forget");
journal_forget(handle, bh_in);
} else {
BUFFER_TRACE(bh, "call brelse");
__brelse(bh);
}
}
jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
return err;
}
/*
* Cancel an outstanding revoke. For use only internally by the
* journaling code (called from journal_get_write_access).
*
* We trust buffer_revoked() on the buffer if the buffer is already
* being journaled: if there is no revoke pending on the buffer, then we
* don't do anything here.
*
* This would break if it were possible for a buffer to be revoked and
* discarded, and then reallocated within the same transaction. In such
* a case we would have lost the revoked bit, but when we arrived here
* the second time we would still have a pending revoke to cancel. So,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*
* The caller must have the journal locked.
*/
int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
* not, we can't trust the revoke bit, and we need to do the
* full search for a revoke record. */
if (test_set_buffer_revokevalid(bh)) {
need_cancel = test_clear_buffer_revoked(bh);
} else {
need_cancel = 1;
clear_buffer_revoked(bh);
}
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
jbd_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(revoke_record_cache, record);
did_revoke = 1;
}
}
#ifdef JBD_EXPENSIVE_CHECKING
/* There better not be one left behind by now! */
record = find_revoke_record(journal, bh->b_blocknr);
J_ASSERT_JH(jh, record == NULL);
#endif
/* Finally, have we just cleared revoke on an unhashed
* buffer_head? If so, we'd better make sure we clear the
* revoked status on any hashed alias too, otherwise the revoke
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
__brelse(bh2);
}
}
return did_revoke;
}
/* journal_switch_revoke_table: select j_revoke for the next
 * transaction. We do not want to suspend any processing until all
 * revokes are written. -bzzz
 */
void journal_switch_revoke_table(journal_t *journal)
{
int i;
if (journal->j_revoke == journal->j_revoke_table[0])
journal->j_revoke = journal->j_revoke_table[1];
else
journal->j_revoke = journal->j_revoke_table[0];
for (i = 0; i < journal->j_revoke->hash_size; i++)
INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
}
/*
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
*
* Called with the journal lock held.
*/
void journal_write_revoke_records(journal_t *journal,
transaction_t *transaction)
{
struct journal_head *descriptor;
struct jbd_revoke_record_s *record;
struct jbd_revoke_table_s *revoke;
struct list_head *hash_list;
int i, offset, count;
descriptor = NULL;
offset = 0;
count = 0;
/* select revoke table for committing transaction */
revoke = journal->j_revoke == journal->j_revoke_table[0] ?
journal->j_revoke_table[1] : journal->j_revoke_table[0];
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd_revoke_record_s *)
hash_list->next;
write_one_revoke_record(journal, transaction,
&descriptor, &offset,
record);
count++;
list_del(&record->hash);
kmem_cache_free(revoke_record_cache, record);
}
}
if (descriptor)
flush_descriptor(journal, descriptor, offset);
jbd_debug(1, "Wrote %d revoke records\n", count);
}
/*
* Write out one revoke record. We need to create a new descriptor
* block if the old one is full or if we have not already created one.
*/
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
struct journal_head **descriptorp,
int *offsetp,
struct jbd_revoke_record_s *record)
{
struct journal_head *descriptor;
int offset;
journal_header_t *header;
/* If we are already aborting, this all becomes a noop. We
still need to go round the loop in
journal_write_revoke_records in order to free all of the
revoke records: only the IO to the journal is omitted. */
if (is_journal_aborted(journal))
return;
descriptor = *descriptorp;
offset = *offsetp;
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
if (offset == journal->j_blocksize) {
flush_descriptor(journal, descriptor, offset);
descriptor = NULL;
}
}
if (!descriptor) {
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
journal_file_buffer(descriptor, transaction, BJ_LogCtl);
offset = sizeof(journal_revoke_header_t);
*descriptorp = descriptor;
}
* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
*offsetp = offset;
}
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
* be waited for during commit, so it has to go onto the appropriate
* journal buffer list.
*/
static void flush_descriptor(journal_t *journal,
struct journal_head *descriptor,
int offset)
{
journal_revoke_header_t *header;
struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
put_bh(bh);
return;
}
header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
header->r_count = cpu_to_be32(offset);
set_buffer_jwrite(bh);
BUFFER_TRACE(bh, "write");
set_buffer_dirty(bh);
ll_rw_block(SWRITE, 1, &bh);
}
#endif
/*
* Revoke support for recovery.
*
* Recovery needs to be able to:
*
* record all revoke records, including the tid of the latest instance
* of each revoke in the journal
*
* check whether a given block in a given transaction should be replayed
* (ie. has not been revoked by a revoke record in that or a subsequent
* transaction)
*
* empty the revoke table after recovery.
*/
/*
* First, setting revoke records. We create a new revoke record for
* every block ever revoked in the log as we scan it for recovery, and
* we update the existing records if we find multiple revokes for a
* single block.
*/
int journal_set_revoke(journal_t *journal,
unsigned long blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (record) {
/* If we have multiple occurrences, only record the
* latest sequence number in the hashed record */
if (tid_gt(sequence, record->sequence))
record->sequence = sequence;
return 0;
}
return insert_revoke_hash(journal, blocknr, sequence);
}
/*
* Test revoke records. For a given block referenced in the log, has
* that block been revoked? A revoke record with a given transaction
* sequence number revokes all blocks in that transaction and earlier
 * ones, but later transactions still need to be replayed.
*/
int journal_test_revoke(journal_t *journal,
unsigned long blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
record = find_revoke_record(journal, blocknr);
if (!record)
return 0;
if (tid_gt(sequence, record->sequence))
return 0;
return 1;
}
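/*
 * Worked example (tids are illustrative): if block 42 was revoked by
 * transaction 12, the hashed record has sequence == 12. A log entry
 * for block 42 found in transaction 10 gives
 * journal_test_revoke(journal, 42, 10) == 1, so replay is suppressed;
 * an entry in transaction 15 gives 0 and is replayed - the "only the
 * last revoke counts" rule described at the top of this file.
 */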
/*
* Finally, once recovery is over, we need to clear the revoke table so
* that it can be reused by the running filesystem.
*/
void journal_clear_revoke(journal_t *journal)
{
int i;
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
struct jbd_revoke_table_s *revoke;
revoke = journal->j_revoke;
for (i = 0; i < revoke->hash_size; i++) {
hash_list = &revoke->hash_table[i];
while (!list_empty(hash_list)) {
record = (struct jbd_revoke_record_s*) hash_list->next;
list_del(&record->hash);
kmem_cache_free(revoke_record_cache, record);
}
}
}
/*
* linux/fs/transaction.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Generic filesystem transaction handling code; part of the ext2fs
* journaling system.
*
* This file manages transactions (compound commits managed by the
* journaling code) and handles (individual atomic operations by the
* filesystem).
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/smp_lock.h>
#include <linux/mm.h>
#include <linux/highmem.h>
/*
* get_transaction: obtain a new transaction_t object.
*
* Simply allocate and initialise a new transaction. Create it in
* RUNNING state and add it to the current journal (which should not
* have an existing running transaction: we only make a new transaction
* once we have started to commit the old one).
*
* Preconditions:
* The journal MUST be locked. We don't perform atomic mallocs on the
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
* Called under j_state_lock
*/
static transaction_t *
get_transaction(journal_t *journal, transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = transaction->t_expires;
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
journal->j_running_transaction = transaction;
return transaction;
}
/*
* Handle management.
*
* A handle_t is an object which represents a single atomic update to a
* filesystem, and which tracks all of the modifications which form part
* of that one update.
*/
/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
* to begin. Attach the handle to a transaction and set up the
* transaction's buffer credits.
*/
static int start_this_handle(journal_t *journal, handle_t *handle)
{
transaction_t *transaction;
int needed;
int nblocks = handle->h_buffer_credits;
transaction_t *new_transaction = NULL;
int ret = 0;
if (nblocks > journal->j_max_transaction_buffers) {
printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
current->comm, nblocks,
journal->j_max_transaction_buffers);
ret = -ENOSPC;
goto out;
}
alloc_transaction:
if (!journal->j_running_transaction) {
new_transaction = jbd_kmalloc(sizeof(*new_transaction),
GFP_NOFS);
if (!new_transaction) {
ret = -ENOMEM;
goto out;
}
memset(new_transaction, 0, sizeof(*new_transaction));
}
jbd_debug(3, "New handle %p going live.\n", handle);
repeat:
/*
* We need to hold j_state_lock until t_updates has been incremented,
* for proper journal barrier handling
*/
spin_lock(&journal->j_state_lock);
repeat_locked:
if (is_journal_aborted(journal) ||
(journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
spin_unlock(&journal->j_state_lock);
ret = -EROFS;
goto out;
}
/* Wait on the journal's transaction barrier if necessary */
if (journal->j_barrier_count) {
spin_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_transaction_locked,
journal->j_barrier_count == 0);
goto repeat;
}
if (!journal->j_running_transaction) {
if (!new_transaction) {
spin_unlock(&journal->j_state_lock);
goto alloc_transaction;
}
get_transaction(journal, new_transaction);
new_transaction = NULL;
}
transaction = journal->j_running_transaction;
/*
* If the current transaction is locked down for commit, wait for the
* lock to be released.
*/
if (transaction->t_state == T_LOCKED) {
DEFINE_WAIT(wait);
prepare_to_wait(&journal->j_wait_transaction_locked,
&wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&journal->j_state_lock);
schedule();
finish_wait(&journal->j_wait_transaction_locked, &wait);
goto repeat;
}
/*
* If there is not enough space left in the log to write all potential
* buffers requested by this operation, we need to stall pending a log
* checkpoint to free some more log space.
*/
spin_lock(&transaction->t_handle_lock);
needed = transaction->t_outstanding_credits + nblocks;
if (needed > journal->j_max_transaction_buffers) {
/*
* If the current transaction is already too large, then start
* to commit it: we can then go back and attach this handle to
* a new transaction.
*/
DEFINE_WAIT(wait);
jbd_debug(2, "Handle %p starting new commit...\n", handle);
spin_unlock(&transaction->t_handle_lock);
prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
schedule();
finish_wait(&journal->j_wait_transaction_locked, &wait);
goto repeat;
}
/*
* The commit code assumes that it can get enough log space
* without forcing a checkpoint. This is *critical* for
* correctness: a checkpoint of a buffer which is also
* associated with a committing transaction creates a deadlock,
* so commit simply cannot force through checkpoints.
*
* We must therefore ensure the necessary space in the journal
* *before* starting to dirty potentially checkpointed buffers
* in the new transaction.
*
* The worst part is, any transaction currently committing can
* reduce the free space arbitrarily. Be careful to account for
* those buffers when checkpointing.
*/
/*
* @@@ AKPM: This seems rather over-defensive. We're giving commit
* a _lot_ of headroom: 1/4 of the journal plus the size of
* the committing transaction. Really, we only need to give it
* committing_transaction->t_outstanding_credits plus "enough" for
* the log control blocks.
 * Also, this test is inconsistent with the matching one in
* journal_extend().
*/
if (__log_space_left(journal) < jbd_space_needed(journal)) {
jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
spin_unlock(&transaction->t_handle_lock);
__log_wait_for_space(journal);
goto repeat_locked;
}
/* OK, account for the buffers that this operation expects to
* use and add the handle to the running transaction. */
handle->h_transaction = transaction;
transaction->t_outstanding_credits += nblocks;
transaction->t_updates++;
transaction->t_handle_count++;
jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
handle, nblocks, transaction->t_outstanding_credits,
__log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
return ret;
}
/* Allocate a new handle. This should probably be in a slab... */
static handle_t *new_handle(int nblocks)
{
handle_t *handle = jbd_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
return handle;
}
/**
* handle_t *journal_start() - Obtain a new handle.
* @journal: Journal to start transaction on.
 * @nblocks: number of block buffers we might modify
*
* We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee
* that much space.
*
* This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked.
*
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value on failure
*/
handle_t *journal_start(journal_t *journal, int nblocks)
{
handle_t *handle = journal_current_handle();
int err;
if (!journal)
return ERR_PTR(-EROFS);
if (handle) {
J_ASSERT(handle->h_transaction->t_journal == journal);
handle->h_ref++;
return handle;
}
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
current->journal_info = handle;
err = start_this_handle(journal, handle);
if (err < 0) {
jbd_free_handle(handle);
current->journal_info = NULL;
handle = ERR_PTR(err);
}
return handle;
}
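/*
 * A minimal caller sketch, for illustration: wrap an update in a
 * handle. example_update() is hypothetical; journal_start() and
 * journal_stop() are the real entry points.
 */
static int example_update(journal_t *journal)
{
handle_t *handle = journal_start(journal, 1);
int err;
if (IS_ERR(handle))
return PTR_ERR(handle);
/* ... get write access to and dirty the buffers being modified ... */
err = journal_stop(handle);
return err;
}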
/**
* int journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
*
* Some transactions, such as large extends and truncates, can be done
* atomically all at once or in several stages. The operation requests
 * a credit for a number of buffer modifications in advance, but can
* extend its credit if it needs more.
*
* journal_extend tries to give the running handle more buffer credits.
 * It does not guarantee the allocation - this is best-effort only.
* The calling process MUST be able to deal cleanly with a failure to
* extend here.
*
* Return 0 on success, non-zero on failure.
*
* return code < 0 implies an error
* return code > 0 implies normal transaction-full status.
*/
int journal_extend(handle_t *handle, int nblocks)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
int result;
int wanted;
result = -EIO;
if (is_handle_aborted(handle))
goto out;
result = 1;
spin_lock(&journal->j_state_lock);
/* Don't extend a locked-down transaction! */
if (handle->h_transaction->t_state != T_RUNNING) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
spin_lock(&transaction->t_handle_lock);
wanted = transaction->t_outstanding_credits + nblocks;
if (wanted > journal->j_max_transaction_buffers) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
goto unlock;
}
if (wanted > __log_space_left(journal)) {
jbd_debug(3, "denied handle %p %d blocks: "
"insufficient log space\n", handle, nblocks);
goto unlock;
}
handle->h_buffer_credits += nblocks;
transaction->t_outstanding_credits += nblocks;
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
unlock:
spin_unlock(&transaction->t_handle_lock);
error_out:
spin_unlock(&journal->j_state_lock);
out:
return result;
}
/**
 * int journal_restart() - restart a handle.
* @handle: handle to restart
* @nblocks: nr credits requested
*
* Restart a handle for a multi-transaction filesystem
* operation.
*
* If the journal_extend() call above fails to grant new buffer credits
* to a running handle, a call to journal_restart will commit the
* handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
* credits.
*/
int journal_restart(handle_t *handle, int nblocks)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
int ret;
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
/*
* First unlink the handle from its current transaction, and start the
* commit on that.
*/
J_ASSERT(transaction->t_updates > 0);
J_ASSERT(journal_current_handle() == handle);
spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
transaction->t_outstanding_credits -= handle->h_buffer_credits;
transaction->t_updates--;
if (!transaction->t_updates)
wake_up(&journal->j_wait_updates);
spin_unlock(&transaction->t_handle_lock);
jbd_debug(2, "restarting handle %p\n", handle);
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
}
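/*
 * The usual pairing of journal_extend() and journal_restart(), as a
 * sketch (example_ensure_credits() is hypothetical): try to extend
 * the current handle; if the transaction is full, commit it and
 * reattach to a fresh one.
 */
static int example_ensure_credits(handle_t *handle, int nblocks)
{
int err = journal_extend(handle, nblocks);
if (err <= 0)
return err; /* 0: extended in place; < 0: hard error */
/* > 0: transaction-full - restart onto a new transaction */
return journal_restart(handle, nblocks);
}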
/**
* void journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
*
* This locks out any further updates from being started, and blocks
* until all existing updates have completed, returning only once the
* journal is in a quiescent state with no updates running.
*
* The journal lock should not be held on entry.
*/
void journal_lock_updates(journal_t *journal)
{
DEFINE_WAIT(wait);
spin_lock(&journal->j_state_lock);
++journal->j_barrier_count;
/* Wait until there are no running updates */
while (1) {
transaction_t *transaction = journal->j_running_transaction;
if (!transaction)
break;
spin_lock(&transaction->t_handle_lock);
if (!transaction->t_updates) {
spin_unlock(&transaction->t_handle_lock);
break;
}
prepare_to_wait(&journal->j_wait_updates, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
schedule();
finish_wait(&journal->j_wait_updates, &wait);
spin_lock(&journal->j_state_lock);
}
spin_unlock(&journal->j_state_lock);
/*
* We have now established a barrier against other normal updates, but
* we also need to barrier against other journal_lock_updates() calls
* to make sure that we serialise special journal-locked operations
* too.
*/
mutex_lock(&journal->j_barrier);
}
/**
* void journal_unlock_updates (journal_t* journal) - release barrier
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with journal_lock_updates().
*
* Should be called without the journal lock held.
*/
void journal_unlock_updates (journal_t *journal)
{
J_ASSERT(journal->j_barrier_count != 0);
mutex_unlock(&journal->j_barrier);
spin_lock(&journal->j_state_lock);
--journal->j_barrier_count;
spin_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_transaction_locked);
}
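/*
 * Sketch of the intended pairing (illustrative; ext3 uses this pattern
 * where it needs a quiescent, fully-flushed journal, e.g. on read-only
 * remount):
 *
 *	journal_lock_updates(journal);
 *	err = journal_flush(journal);
 *	journal_unlock_updates(journal);
 */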
/*
* Report any unexpected dirty buffers which turn up. Normally those
* indicate an error, but they can occur if the user is running (say)
* tune2fs to modify the live filesystem, so we need the option of
 * continuing as gracefully as possible.
*
* The caller should already hold the journal lock and
* j_list_lock spinlock: most callers will need those anyway
* in order to probe the buffer's journaling state safely.
*/
static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
{
int jlist;
/* If this buffer is one which might reasonably be dirty
* --- ie. data, or not part of this journal --- then
* we're OK to leave it alone, but otherwise we need to
* move the dirty bit to the journal's own internal
* JBDDirty bit. */
jlist = jh->b_jlist;
if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
jlist == BJ_Shadow || jlist == BJ_Forget) {
struct buffer_head *bh = jh2bh(jh);
if (test_clear_buffer_dirty(bh))
set_buffer_jbddirty(bh);
}
}
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
* transaction which we are still committing to disk, then we need to
* make sure that we do not overwrite the old copy: we do copy-out to
* preserve the copy going to disk. We also account the buffer against
* the handle's metadata buffer credits (unless the buffer is already
* part of the transaction, that is).
*
*/
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
int force_copy)
{
struct buffer_head *bh;
transaction_t *transaction;
journal_t *journal;
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
if (is_handle_aborted(handle))
return -EROFS;
transaction = handle->h_transaction;
journal = transaction->t_journal;
jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
JBUFFER_TRACE(jh, "entry");
repeat:
bh = jh2bh(jh);
/* @@@ Need to check for errors here at some point. */
lock_buffer(bh);
jbd_lock_bh_state(bh);
/* We now hold the buffer lock so it is safe to query the buffer
* state. Is the buffer dirty?
*
* If so, there are two possibilities. The buffer may be
* non-journaled, and undergoing a quite legitimate writeback.
* Otherwise, it is journaled, and we don't expect dirty buffers
* in that state (the buffers should be marked JBD_Dirty
* instead.) So either the IO is being done under our own
* control and this is a bug, or it's a third party IO such as
* dump(8) (which may leave the buffer scheduled for read ---
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
if (buffer_dirty(bh)) {
/*
* First question: is this buffer already part of the current
* transaction or the existing committing transaction?
*/
if (jh->b_transaction) {
J_ASSERT_JH(jh,
jh->b_transaction == transaction ||
jh->b_transaction ==
journal->j_committing_transaction);
if (jh->b_next_transaction)
J_ASSERT_JH(jh, jh->b_next_transaction ==
transaction);
}
/*
* In any case we need to clean the dirty flag and we must
* do it under the buffer lock to be sure we don't race
* with running write-out.
*/
JBUFFER_TRACE(jh, "Unexpected dirty buffer");
jbd_unexpected_dirty_buffer(jh);
}
unlock_buffer(bh);
error = -EROFS;
if (is_handle_aborted(handle)) {
jbd_unlock_bh_state(bh);
goto out;
}
error = 0;
/*
* The buffer is already part of this transaction if b_transaction or
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
jh->b_next_transaction == transaction)
goto done;
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
*/
if (jh->b_frozen_data) {
JBUFFER_TRACE(jh, "has frozen data");
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
jh->b_next_transaction = transaction;
goto done;
}
/* Is there data here we need to preserve? */
if (jh->b_transaction && jh->b_transaction != transaction) {
JBUFFER_TRACE(jh, "owned by older transaction");
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
J_ASSERT_JH(jh, jh->b_transaction ==
journal->j_committing_transaction);
/* There is one case we have to be very careful about.
* If the committing transaction is currently writing
* this buffer out to disk and has NOT made a copy-out,
* then we cannot modify the buffer contents at all
* right now. The essence of copy-out is that it is the
* extra copy, not the primary copy, which gets
* journaled. If the primary copy is already going to
* disk then we cannot do copy-out here. */
if (jh->b_jlist == BJ_Shadow) {
DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
wait_queue_head_t *wqh;
wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh);
/* commit wakes up all shadow buffers after IO */
for ( ; ; ) {
prepare_to_wait(wqh, &wait.wait,
TASK_UNINTERRUPTIBLE);
if (jh->b_jlist != BJ_Shadow)
break;
schedule();
}
finish_wait(wqh, &wait.wait);
goto repeat;
}
/* Only do the copy if the currently-owning transaction
* still needs it. If it is on the Forget list, the
* committing transaction is past that stage. The
* buffer had better remain locked during the kmalloc,
* but that should be true --- we hold the journal lock
* still and the buffer is already on the BUF_JOURNAL
* list so won't be flushed.
*
* Subtle point, though: if this is a get_undo_access,
* then we will be relying on the frozen_data to contain
* the new value of the committed_data record after the
* transaction, so we HAVE to force the frozen_data copy
* in that case. */
if (jh->b_jlist != BJ_Forget || force_copy) {
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
frozen_buffer =
jbd_slab_alloc(jh2bh(jh)->b_size,
GFP_NOFS);
if (!frozen_buffer) {
printk(KERN_EMERG
"%s: OOM for frozen_buffer\n",
__FUNCTION__);
JBUFFER_TRACE(jh, "oom!");
error = -ENOMEM;
jbd_lock_bh_state(bh);
goto done;
}
goto repeat;
}
jh->b_frozen_data = frozen_buffer;
frozen_buffer = NULL;
need_copy = 1;
}
jh->b_next_transaction = transaction;
}
/*
* Finally, if the buffer is not journaled right now, we need to make
* sure it doesn't get written to disk before the caller actually
* commits the new data
*/
if (!jh->b_transaction) {
JBUFFER_TRACE(jh, "no transaction");
J_ASSERT_JH(jh, !jh->b_next_transaction);
jh->b_transaction = transaction;
JBUFFER_TRACE(jh, "file as BJ_Reserved");
spin_lock(&journal->j_list_lock);
__journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
}
done:
if (need_copy) {
struct page *page;
int offset;
char *source;
J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
"Possible IO failure.\n");
page = jh2bh(jh)->b_page;
offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
source = kmap_atomic(page, KM_USER0);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
kunmap_atomic(source, KM_USER0);
}
jbd_unlock_bh_state(bh);
/*
* If we are about to journal a buffer, then any revoke pending on it is
* no longer valid
*/
journal_cancel_revoke(handle, jh);
out:
if (unlikely(frozen_buffer)) /* It's usually NULL */
jbd_slab_free(frozen_buffer, bh->b_size);
JBUFFER_TRACE(jh, "exit");
return error;
}
/**
* int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
*
* Returns an error code or 0 on success.
*
* In full data journalling mode the buffer may be of type BJ_AsyncData,
* because we're write()ing a buffer which is also part of a shared mapping.
*/
int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
struct journal_head *jh = journal_add_journal_head(bh);
int rc;
/* We do not want to get caught playing with fields which the
* log thread also manipulates. Make sure that the buffer
* completes any outstanding IO before proceeding. */
rc = do_get_write_access(handle, jh, 0);
journal_put_journal_head(jh);
return rc;
}
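/*
 * The canonical metadata-update sequence built on the call above, as a
 * sketch (example_modify() and the one-byte edit are hypothetical):
 */
static int example_modify(handle_t *handle, struct buffer_head *bh)
{
int err = journal_get_write_access(handle, bh);
if (err)
return err;
bh->b_data[0] = 0; /* ... modify the buffer contents ... */
return journal_dirty_metadata(handle, bh);
}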
/*
* When the user wants to journal a newly created buffer_head
* (ie. getblk() returned a new buffer and we are going to populate it
* manually rather than reading off disk), then we need to keep the
* buffer_head locked until it has been completely filled with new
* data. In this case, we should be able to make the assertion that
* the bh is not already part of an existing transaction.
*
* The buffer should already be locked by the caller by this point.
* There is no lock ranking violation: it was a newly created,
* unlocked buffer beforehand. */
/**
* int journal_get_create_access () - notify intent to use newly created bh
 * @handle: transaction to add the new buffer to
* @bh: new buffer.
*
* Call this if you create a new bh.
*/
int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
struct journal_head *jh = journal_add_journal_head(bh);
int err;
jbd_debug(5, "journal_head %p\n", jh);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
err = 0;
JBUFFER_TRACE(jh, "entry");
/*
* The buffer may already belong to this transaction due to pre-zeroing
* in the filesystem's new_block code. It may also be on the previous,
* committing transaction's lists, but it HAS to be in Forget state in
* that case: the transaction must have deleted the buffer for it to be
* reused here.
*/
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
jh->b_transaction == NULL ||
(jh->b_transaction == journal->j_committing_transaction &&
jh->b_jlist == BJ_Forget)));
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
if (jh->b_transaction == NULL) {
jh->b_transaction = transaction;
JBUFFER_TRACE(jh, "file as BJ_Reserved");
__journal_file_buffer(jh, transaction, BJ_Reserved);
} else if (jh->b_transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "set next transaction");
jh->b_next_transaction = transaction;
}
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
/*
* akpm: I added this. ext3_alloc_branch can pick up new indirect
* blocks which contain freed but then revoked metadata. We need
* to cancel the revoke in case we end up freeing it yet again
 * and then reallocating it as data - this would cause a second revoke,
* which hits an assertion error.
*/
JBUFFER_TRACE(jh, "cancelling revoke");
journal_cancel_revoke(handle, jh);
journal_put_journal_head(jh);
out:
return err;
}
/**
* int journal_get_undo_access() - Notify intent to modify metadata with
* non-rewindable consequences
* @handle: transaction
* @bh: buffer to undo
*
* Sometimes there is a need to distinguish between metadata which has
* been committed to disk and that which has not. The ext3fs code uses
* this for freeing and allocating space, we have to make sure that we
* do not reuse freed space until the deallocation has been committed,
* since if we overwrote that space we would make the delete
* un-rewindable in case of a crash.
*
* To deal with that, journal_get_undo_access requests write access to a
* buffer for parts of non-rewindable operations such as delete
* operations on the bitmaps. The journaling code must keep a copy of
* the buffer's contents prior to the undo_access call until such time
* as we know that the buffer has definitely been committed to disk.
*
* We never need to know which transaction the committed data is part
* of, buffers touched here are guaranteed to be dirtied later and so
* will be committed to a new transaction in due course, at which point
* we can discard the old committed data pointer.
*
* Returns error number or 0 on success.
*/
int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
int err;
struct journal_head *jh = journal_add_journal_head(bh);
char *committed_data = NULL;
JBUFFER_TRACE(jh, "entry");
/*
* Do this first --- it can drop the journal lock, so we want to
* make sure that obtaining the committed_data is done
* atomically wrt. completion of any outstanding commits.
*/
err = do_get_write_access(handle, jh, 1);
if (err)
goto out;
repeat:
if (!jh->b_committed_data) {
committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
printk(KERN_EMERG "%s: No memory for committed data\n",
__FUNCTION__);
err = -ENOMEM;
goto out;
}
}
jbd_lock_bh_state(bh);
if (!jh->b_committed_data) {
/* Copy out the current buffer contents into the
* preserved, committed copy. */
JBUFFER_TRACE(jh, "generate b_committed data");
if (!committed_data) {
jbd_unlock_bh_state(bh);
goto repeat;
}
jh->b_committed_data = committed_data;
committed_data = NULL;
memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
}
jbd_unlock_bh_state(bh);
out:
journal_put_journal_head(jh);
if (unlikely(committed_data))
jbd_slab_free(committed_data, bh->b_size);
return err;
}
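/*
 * Sketch of the bitmap case described above (example_free_block() is
 * hypothetical; ext3's block-freeing path does the equivalent): take
 * undo access before clearing an allocation bit so the committed copy
 * of the bitmap is preserved until the deallocation commits.
 */
static int example_free_block(handle_t *handle,
struct buffer_head *bitmap_bh, int bit)
{
int err = journal_get_undo_access(handle, bitmap_bh);
if (err)
return err;
clear_bit(bit, (unsigned long *)bitmap_bh->b_data);
return journal_dirty_metadata(handle, bitmap_bh);
}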
/**
* int journal_dirty_data() - mark a buffer as containing dirty data which
* needs to be flushed before we can commit the
* current transaction.
* @handle: transaction
* @bh: bufferhead to mark
*
* The buffer is placed on the transaction's data list and is marked as
* belonging to the transaction.
*
* Returns error number or 0 on success.
*
* journal_dirty_data() can be called via page_launder->ext3_writepage
* by kswapd.
*/
int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
if (is_handle_aborted(handle))
return 0;
jh = journal_add_journal_head(bh);
JBUFFER_TRACE(jh, "entry");
/*
* The buffer could *already* be dirty. Writeout can start
* at any time.
*/
jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
/*
* What if the buffer is already part of a running transaction?
*
* There are two cases:
* 1) It is part of the current running transaction. Refile it,
* just in case we have allocated it as metadata, deallocated
* it, then reallocated it as data.
* 2) It is part of the previous, still-committing transaction.
* If all we want to do is to guarantee that the buffer will be
* written to disk before this new transaction commits, then
* being sure that the *previous* transaction has this same
* property is sufficient for us! Just leave it on its old
* transaction.
*
* In case (2), the buffer must not already exist as metadata
* --- that would violate write ordering (a transaction is free
* to write its data at any point, even before the previous
* committing transaction has committed). The caller must
* never, ever allow this to happen: there's nothing we can do
* about it in this layer.
*/
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
if (jh->b_transaction) {
JBUFFER_TRACE(jh, "has transaction");
if (jh->b_transaction != handle->h_transaction) {
JBUFFER_TRACE(jh, "belongs to older transaction");
J_ASSERT_JH(jh, jh->b_transaction ==
journal->j_committing_transaction);
/* @@@ IS THIS TRUE ? */
/*
* Not any more. Scenario: someone does a write()
* in data=journal mode. The buffer's transaction has
* moved into commit. Then someone does another
* write() to the file. We do the frozen data copyout
* and set b_next_transaction to point to j_running_t.
* And while we're in that state, someone does a
* writepage() in an attempt to pageout the same area
* of the file via a shared mapping. At present that
* calls journal_dirty_data(), and we get right here.
* It may be too late to journal the data. Simply
* falling through to the next test will suffice: the
 * data will be dirty and will be checkpointed. The
* ordering comments in the next comment block still
* apply.
*/
//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
/*
* If we're journalling data, and this buffer was
* subject to a write(), it could be metadata, forget
* or shadow against the committing transaction. Now,
* someone has dirtied the same darn page via a mapping
* and it is being writepage()'d.
* We *could* just steal the page from commit, with some
* fancy locking there. Instead, we just skip it -
* don't tie the page's buffers to the new transaction
* at all.
* Implication: if we crash before the writepage() data
* is written into the filesystem, recovery will replay
* the write() data.
*/
if (jh->b_jlist != BJ_None &&
jh->b_jlist != BJ_SyncData &&
jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "Not stealing");
goto no_journal;
}
/*
* This buffer may be undergoing writeout in commit. We
* can't return from here and let the caller dirty it
* again because that can cause the write-out loop in
* commit to never terminate.
*/
if (buffer_dirty(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
need_brelse = 1;
sync_dirty_buffer(bh);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
/* The buffer may become locked again at any
time if it is redirtied */
}
/* journal_clean_data_list() may have got there first */
if (jh->b_transaction != NULL) {
JBUFFER_TRACE(jh, "unfile from commit");
__journal_temp_unlink_buffer(jh);
/* It still points to the committing
* transaction; move it to this one so
* that the refile assert checks are
* happy. */
jh->b_transaction = handle->h_transaction;
}
/* The buffer will be refiled below */
}
/*
* Special case --- the buffer might actually have been
* allocated and then immediately deallocated in the previous,
* committing transaction, so might still be left on that
* transaction's metadata lists.
*/
if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "not on correct data list: unfile");
J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
__journal_temp_unlink_buffer(jh);
jh->b_transaction = handle->h_transaction;
JBUFFER_TRACE(jh, "file as data");
__journal_file_buffer(jh, handle->h_transaction,
BJ_SyncData);
}
} else {
JBUFFER_TRACE(jh, "not on a transaction");
__journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
}
no_journal:
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
if (need_brelse) {
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
JBUFFER_TRACE(jh, "exit");
journal_put_journal_head(jh);
return 0;
}
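/*
 * Usage sketch (illustrative): an ordered-mode write path calls this
 * once per data buffer of the page being written, before the buffers
 * go to writeback:
 *
 *	err = journal_dirty_data(handle, bh);
 */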
/**
* int journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
*
* mark dirty metadata which needs to be journaled as part of the current
* transaction.
*
* The buffer is placed on the transaction's metadata list and is marked
* as belonging to the transaction.
*
* Returns error number or 0 on success.
*
* Special care needs to be taken if the buffer already belongs to the
* current committing transaction (in which case we should have frozen
* data present for that commit). In that case, we don't relink the
* buffer: that only gets done when the old transaction finally
* completes its commit.
*/
int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
struct journal_head *jh = bh2jh(bh);
jbd_debug(5, "journal_head %p\n", jh);
JBUFFER_TRACE(jh, "entry");
if (is_handle_aborted(handle))
goto out;
jbd_lock_bh_state(bh);
if (jh->b_modified == 0) {
/*
 * The buffer has just been modified and is becoming part
 * of the transaction. This accounting needs to be done
 * only once per transaction. -bzzz
 */
jh->b_modified = 1;
J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
handle->h_buffer_credits--;
}
/*
* fastpath, to avoid expensive locking. If this buffer is already
* on the running transaction's metadata list there is nothing to do.
* Nobody can take it off again because there is a handle open.
* I _think_ we're OK here with SMP barriers - a mistaken decision will
* result in this test being false, so we go in and take the locks.
*/
if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
JBUFFER_TRACE(jh, "fastpath");
J_ASSERT_JH(jh, jh->b_transaction ==
journal->j_running_transaction);
goto out_unlock_bh;
}
set_buffer_jbddirty(bh);
/*
* Metadata already on the current transaction list doesn't
* need to be filed. Metadata on another transaction's list must
* be committing, and will be refiled once the commit completes:
* leave it alone for now.
*/
if (jh->b_transaction != transaction) {
JBUFFER_TRACE(jh, "already on other transaction");
J_ASSERT_JH(jh, jh->b_transaction ==
journal->j_committing_transaction);
J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
/* And this case is illegal: we can't reuse another
* transaction's data buffer, ever. */
goto out_unlock_bh;
}
/* That test should have eliminated the following case: */
J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
out:
JBUFFER_TRACE(jh, "exit");
return 0;
}
/*
* journal_release_buffer: undo a get_write_access without any buffer
* updates, if the update decided in the end that it didn't need access.
*
*/
void
journal_release_buffer(handle_t *handle, struct buffer_head *bh)
{
BUFFER_TRACE(bh, "entry");
}
/**
* void journal_forget() - bforget() for potentially-journaled buffers.
* @handle: transaction handle
* @bh: bh to 'forget'
*
* We can only do the bforget if there are no commits pending against the
* buffer. If the buffer is dirty in the current running transaction we
* can safely unlink it.
*
* bh may not be a journalled buffer at all - it may be a non-JBD
* buffer which came off the hashtable. Check for this.
*
* Decrements bh->b_count by one.
*
* Allow this call even if the handle has aborted --- it may be part of
* the caller's cleanup after an abort.
*/
int journal_forget (handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
BUFFER_TRACE(bh, "entry");
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
if (!buffer_jbd(bh))
goto not_jbd;
jh = bh2jh(bh);
/* Critical error: attempting to delete a bitmap buffer, maybe?
* Don't do any jbd operations, and return an error. */
if (!J_EXPECT_JH(jh, !jh->b_committed_data,
"inconsistent data on disk")) {
err = -EIO;
goto not_jbd;
}
/*
* The buffer's going from the transaction, we must drop
* all references -bzzz
*/
jh->b_modified = 0;
if (jh->b_transaction == handle->h_transaction) {
J_ASSERT_JH(jh, !jh->b_frozen_data);
/* If we are forgetting a buffer which is already part
* of this transaction, then we can just drop it from
* the transaction immediately. */
clear_buffer_dirty(bh);
clear_buffer_jbddirty(bh);
JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
drop_reserve = 1;
/*
* We are no longer going to journal this buffer.
* However, the commit of this transaction is still
* important to the buffer: the delete that we are now
* processing might obsolete an old log entry, so by
* committing, we can satisfy the buffer's checkpoint.
*
* So, if we have a checkpoint on the buffer, we should
* now refile the buffer on our BJ_Forget list so that
* we know to remove the checkpoint after we commit.
*/
if (jh->b_cp_transaction) {
__journal_temp_unlink_buffer(jh);
__journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__journal_unfile_buffer(jh);
journal_remove_journal_head(bh);
__brelse(bh);
if (!buffer_jbd(bh)) {
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
__bforget(bh);
goto drop;
}
}
} else if (jh->b_transaction) {
J_ASSERT_JH(jh, (jh->b_transaction ==
journal->j_committing_transaction));
/* However, if the buffer is still owned by a prior
* (committing) transaction, we can't drop it yet... */
JBUFFER_TRACE(jh, "belongs to older transaction");
/* ... but we CAN drop it from the new transaction if we
* have also modified it since the original commit. */
if (jh->b_next_transaction) {
J_ASSERT(jh->b_next_transaction == transaction);
jh->b_next_transaction = NULL;
drop_reserve = 1;
}
}
not_jbd:
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
__brelse(bh);
drop:
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
handle->h_buffer_credits++;
}
return err;
}
/**
* int journal_stop() - complete a transaction
 * @handle: transaction to complete.
*
* All done for a particular handle.
*
* There is not much action needed here. We just return any remaining
* buffer credits to the transaction and remove the handle. The only
* complication is that we need to start a commit operation if the
* filesystem is marked for synchronous update.
*
* journal_stop itself will not usually return an error, but it may
* do so in unusual circumstances. In particular, expect it to
* return -EIO if a journal_abort has been executed since the
* transaction began.
*/
int journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
int old_handle_count, err;
pid_t pid;
J_ASSERT(transaction->t_updates > 0);
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
else
err = 0;
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
handle->h_ref);
return err;
}
jbd_debug(4, "Handle %p going down\n", handle);
/*
* Implement synchronous transaction batching. If the handle
* was synchronous, don't force a commit immediately. Let's
* yield and let another thread piggyback onto this transaction.
* Keep doing that while new threads continue to arrive.
* It doesn't cost much - we're about to run a commit and sleep
* on IO anyway. Speeds up many-threaded, many-dir operations
* by 30x or more...
*
* But don't do this if this process was the most recent one to
* perform a synchronous write. We do this to detect the case where a
* single process is doing a stream of sync writes. No point in waiting
* for joiners in that case.
*/
pid = current->pid;
if (handle->h_sync && journal->j_last_sync_writer != pid) {
journal->j_last_sync_writer = pid;
do {
old_handle_count = transaction->t_handle_count;
schedule_timeout_uninterruptible(1);
} while (old_handle_count != transaction->t_handle_count);
}
current->journal_info = NULL;
spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
transaction->t_outstanding_credits -= handle->h_buffer_credits;
transaction->t_updates--;
if (!transaction->t_updates) {
wake_up(&journal->j_wait_updates);
if (journal->j_barrier_count)
wake_up(&journal->j_wait_transaction_locked);
}
/*
* If the handle is marked SYNC, we need to set another commit
* going! We also want to force a commit if the current
* transaction is occupying too much of the log, or if the
* transaction is too old now.
*/
if (handle->h_sync ||
transaction->t_outstanding_credits >
journal->j_max_transaction_buffers ||
time_after_eq(jiffies, transaction->t_expires)) {
/* Do this even for aborted journals: an abort still
* completes the commit thread, it just doesn't write
* anything to disk. */
tid_t tid = transaction->t_tid;
spin_unlock(&transaction->t_handle_lock);
jbd_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
/*
* Special case: JFS_SYNC synchronous updates require us
* to wait for the commit to complete.
*/
if (handle->h_sync && !(current->flags & PF_MEMALLOC))
err = log_wait_commit(journal, tid);
} else {
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
}
jbd_free_handle(handle);
return err;
}
/**
* int journal_force_commit() - force any uncommitted transactions
* @journal: journal to force
*
* For synchronous operations: force any uncommitted transactions
* to disk. May seem kludgy, but it reuses all the handle batching
* code in a very simple manner.
*/
int journal_force_commit(journal_t *journal)
{
handle_t *handle;
int ret;
handle = journal_start(journal, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
} else {
handle->h_sync = 1;
ret = journal_stop(handle);
}
return ret;
}
/*
*
* List management code snippets: various functions for manipulating the
* transaction buffer lists.
*
*/
/*
* Append a buffer to a transaction list, given the transaction's list head
* pointer.
*
* j_list_lock is held.
*
* jbd_lock_bh_state(jh2bh(jh)) is held.
*/
static inline void
__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
{
if (!*list) {
jh->b_tnext = jh->b_tprev = jh;
*list = jh;
} else {
/* Insert at the tail of the list to preserve order */
struct journal_head *first = *list, *last = first->b_tprev;
jh->b_tprev = last;
jh->b_tnext = first;
last->b_tnext = first->b_tprev = jh;
}
}
/*
* Remove a buffer from a transaction list, given the transaction's list
* head pointer.
*
* Called with j_list_lock held, and the journal may not be locked.
*
* jbd_lock_bh_state(jh2bh(jh)) is held.
*/
static inline void
__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
{
if (*list == jh) {
*list = jh->b_tnext;
if (*list == jh)
*list = NULL;
}
jh->b_tprev->b_tnext = jh->b_tnext;
jh->b_tnext->b_tprev = jh->b_tprev;
}
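/*
* Illustrative aside (not from the original source): these two helpers
* keep each transaction list as a circular, doubly-linked ring addressed
* by a single head pointer. Adding A, B, C in turn yields head -> A with
* A->b_tnext == B, B->b_tnext == C and C->b_tnext == A, so traversal from
* the head preserves filing order. Deleting the head simply advances the
* list pointer, and deleting the last remaining element resets it to NULL.
*/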
/*
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
* bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
* t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
* is holding onto a copy of one of these pointers, it could go bad.
* Generally the caller needs to re-read the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
void __journal_temp_unlink_buffer(struct journal_head *jh)
{
struct journal_head **list = NULL;
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
transaction = jh->b_transaction;
if (transaction)
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
if (jh->b_jlist != BJ_None)
J_ASSERT_JH(jh, transaction != 0);
switch (jh->b_jlist) {
case BJ_None:
return;
case BJ_SyncData:
list = &transaction->t_sync_datalist;
break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
list = &transaction->t_buffers;
break;
case BJ_Forget:
list = &transaction->t_forget;
break;
case BJ_IO:
list = &transaction->t_iobuf_list;
break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
case BJ_LogCtl:
list = &transaction->t_log_list;
break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
case BJ_Locked:
list = &transaction->t_locked_list;
break;
}
__blist_del_buffer(list, jh);
jh->b_jlist = BJ_None;
if (test_clear_buffer_jbddirty(bh))
mark_buffer_dirty(bh); /* Expose it to the VM */
}
void __journal_unfile_buffer(struct journal_head *jh)
{
__journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
}
void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
jbd_lock_bh_state(jh2bh(jh));
spin_lock(&journal->j_list_lock);
__journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(jh2bh(jh));
}
/*
* Called from journal_try_to_free_buffers().
*
* Called under jbd_lock_bh_state(bh)
*/
static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
{
struct journal_head *jh;
jh = bh2jh(bh);
if (buffer_locked(bh) || buffer_dirty(bh))
goto out;
if (jh->b_next_transaction != 0)
goto out;
spin_lock(&journal->j_list_lock);
if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
__journal_unfile_buffer(jh);
journal_remove_journal_head(bh);
__brelse(bh);
}
} else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
__journal_remove_checkpoint(jh);
journal_remove_journal_head(bh);
__brelse(bh);
}
}
spin_unlock(&journal->j_list_lock);
out:
return;
}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
* @unused_gfp_mask: unused
*
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
* so try_to_free_buffers() can reap them.
*
* This function returns non-zero if we wish try_to_free_buffers()
* to be called. We do this if the page is releasable by try_to_free_buffers().
* We also do it if the page has locked or dirty buffers and the caller wants
* us to perform sync or async writeout.
*
* This complicates JBD locking somewhat. We aren't protected by the
* BKL here. We wish to remove the buffer from its committing or
* running transaction's ->t_sync_datalist via __journal_unfile_buffer.
*
* This may *change* the value of transaction_t->t_sync_datalist, so anyone
* who looks at t_sync_datalist needs to lock against this function.
*
* Even worse, someone may be doing a journal_dirty_data on this
* buffer. So we need to lock against that. journal_dirty_data()
* will come out of the lock with the buffer dirty, which makes it
* ineligible for release here.
*
* Who else is affected by this? hmm... Really the only contender
* is do_get_write_access() - it could be looking at the buffer while
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
*/
int journal_try_to_free_buffers(journal_t *journal,
struct page *page, gfp_t unused_gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
int ret = 0;
J_ASSERT(PageLocked(page));
head = page_buffers(page);
bh = head;
do {
struct journal_head *jh;
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
* journal_remove_journal_head() and journal_put_journal_head().
*/
jh = journal_grab_journal_head(bh);
if (!jh)
continue;
jbd_lock_bh_state(bh);
__journal_try_to_free_buffer(journal, bh);
journal_put_journal_head(jh);
jbd_unlock_bh_state(bh);
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
ret = try_to_free_buffers(page);
busy:
return ret;
}
/*
* This buffer is no longer needed. If it is on an older transaction's
* checkpoint list we need to record it on this transaction's forget list
* to pin this buffer (and hence its checkpointing transaction) down until
* this transaction commits. If the buffer isn't on a checkpoint list, we
* release it.
* Returns non-zero if JBD no longer has an interest in the buffer.
*
* Called under j_list_lock.
*
* Called under jbd_lock_bh_state(bh).
*/
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
int may_free = 1;
struct buffer_head *bh = jh2bh(jh);
__journal_unfile_buffer(jh);
if (jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "on running+cp transaction");
__journal_file_buffer(jh, transaction, BJ_Forget);
clear_buffer_jbddirty(bh);
may_free = 0;
} else {
JBUFFER_TRACE(jh, "on running transaction");
journal_remove_journal_head(bh);
__brelse(bh);
}
return may_free;
}
/*
* journal_invalidatepage
*
* This code is tricky. It has a number of cases to deal with.
*
* There are two invariants which this code relies on:
*
* i_size must be updated on disk before we start calling invalidatepage on the
* data.
*
* This is done in ext3 by defining an ext3_setattr method which
* updates i_size before truncate gets going. By maintaining this
* invariant, we can be sure that it is safe to throw away any buffers
* attached to the current transaction: once the transaction commits,
* we know that the data will not be needed.
*
* Note however that we can *not* throw away data belonging to the
* previous, committing transaction!
*
* Any disk blocks which *are* part of the previous, committing
* transaction (and which therefore cannot be discarded immediately) are
* not going to be reused in the new running transaction.
*
* The bitmap committed_data images guarantee this: any block which is
* allocated in one transaction and removed in the next will be marked
* as in-use in the committed_data bitmap, so cannot be reused until
* the next transaction to delete the block commits. This means that
* leaving committing buffers dirty is quite safe: the disk blocks
* cannot be reallocated to a different file and so buffer aliasing is
* not possible.
*
*
* The above applies mainly to ordered data mode. In writeback mode we
* don't make guarantees about the order in which data hits disk --- in
* particular we don't guarantee that new dirty data is flushed before
* transaction commit --- so it is always safe just to discard data
* immediately in that mode. --sct
*/
/*
* The journal_unmap_buffer helper function returns zero if the buffer
* concerned remains pinned as an anonymous buffer belonging to an older
* transaction.
*
* We're outside-transaction here. Either or both of j_running_transaction
* and j_committing_transaction may be NULL.
*/
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
{
transaction_t *transaction;
struct journal_head *jh;
int may_free = 1;
int ret;
BUFFER_TRACE(bh, "entry");
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
* holding the page lock. --sct
*/
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
jh = journal_grab_journal_head(bh);
if (!jh)
goto zap_buffer_no_jh;
transaction = jh->b_transaction;
if (transaction == NULL) {
/* First case: not on any transaction. If it
* has no checkpoint link, then we can zap it:
* it's a writeback-mode buffer so we don't care
* if it hits disk safely. */
if (!jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "not on any transaction: zap");
goto zap_buffer;
}
if (!buffer_dirty(bh)) {
/* bdflush has written it. We can drop it now */
goto zap_buffer;
}
/* OK, it must be in the journal but still not
* written fully to disk: it's metadata or
* journaled data... */
if (journal->j_running_transaction) {
/* ... and once the current transaction has
* committed, the buffer won't be needed any
* longer. */
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
ret = __dispose_buffer(jh,
journal->j_running_transaction);
journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
spin_unlock(&journal->j_state_lock);
return ret;
} else {
/* There is no currently-running transaction. So the
* orphan record which we wrote for this file must have
* passed into commit. We must attach this buffer to
* the committing transaction, if it exists. */
if (journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "give to committing trans");
ret = __dispose_buffer(jh,
journal->j_committing_transaction);
journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
spin_unlock(&journal->j_state_lock);
return ret;
} else {
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
clear_buffer_jbddirty(bh);
goto zap_buffer;
}
}
} else if (transaction == journal->j_committing_transaction) {
if (jh->b_jlist == BJ_Locked) {
/*
* The buffer is on the committing transaction's locked
* list. We have the buffer locked, so I/O has
* completed. So we can nail the buffer now.
*/
may_free = __dispose_buffer(jh, transaction);
goto zap_buffer;
}
/*
* If it is committing, we simply cannot touch it. We
* can remove its next_transaction pointer from the
* running transaction if that is set, but nothing
* else. */
JBUFFER_TRACE(jh, "on committing transaction");
set_buffer_freed(bh);
if (jh->b_next_transaction) {
J_ASSERT(jh->b_next_transaction ==
journal->j_running_transaction);
jh->b_next_transaction = NULL;
}
journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
spin_unlock(&journal->j_state_lock);
return 0;
} else {
/* Good, the buffer belongs to the running transaction.
* We are writing our own transaction's data, not any
* previous one's, so it is safe to throw it away
* (remember that we expect the filesystem to have set
* i_size already for this truncate so recovery will not
* expose the disk blocks we are discarding here.) */
J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
may_free = __dispose_buffer(jh, transaction);
}
zap_buffer:
journal_put_journal_head(jh);
zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
spin_unlock(&journal->j_state_lock);
zap_buffer_unlocked:
clear_buffer_dirty(bh);
J_ASSERT_BH(bh, !buffer_jbddirty(bh));
clear_buffer_mapped(bh);
clear_buffer_req(bh);
clear_buffer_new(bh);
bh->b_bdev = NULL;
return may_free;
}
/**
* void journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
* @offset: start of the range to invalidate.
*
* Reap page buffers containing data after offset in page.
*
*/
void journal_invalidatepage(journal_t *journal,
struct page *page,
unsigned long offset)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
int may_free = 1;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
return;
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
head = bh = page_buffers(page);
do {
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
may_free &= journal_unmap_buffer(journal, bh);
unlock_buffer(bh);
}
curr_off = next_off;
bh = next;
} while (bh != head);
if (!offset) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
}
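/*
* Illustrative sketch (not part of the original source): a client
* filesystem typically forwards its address_space invalidatepage
* operation straight here, modeled loosely on how ext3 wires this up.
* The function name below is invented, and EXT4_JOURNAL() is the lookup
* macro defined in the ext4_jbd.h header later in this patch.
*/
static void example_invalidatepage(struct page *page, unsigned long offset)
{
	journal_t *journal = EXT4_JOURNAL(page->mapping->host);

	journal_invalidatepage(journal, page, offset);
}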
/*
* File a buffer on the given transaction list.
*/
void __journal_file_buffer(struct journal_head *jh,
transaction_t *transaction, int jlist)
{
struct journal_head **list = NULL;
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
jh->b_transaction == 0);
if (jh->b_transaction && jh->b_jlist == jlist)
return;
/* The following list of buffer states needs to be consistent
* with __jbd_unexpected_dirty_buffer()'s handling of dirty
* state. */
if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
jlist == BJ_Shadow || jlist == BJ_Forget) {
if (test_clear_buffer_dirty(bh) ||
test_clear_buffer_jbddirty(bh))
was_dirty = 1;
}
if (jh->b_transaction)
__journal_temp_unlink_buffer(jh);
jh->b_transaction = transaction;
switch (jlist) {
case BJ_None:
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
case BJ_SyncData:
list = &transaction->t_sync_datalist;
break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
break;
case BJ_Forget:
list = &transaction->t_forget;
break;
case BJ_IO:
list = &transaction->t_iobuf_list;
break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
case BJ_LogCtl:
list = &transaction->t_log_list;
break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
case BJ_Locked:
list = &transaction->t_locked_list;
break;
}
__blist_add_buffer(list, jh);
jh->b_jlist = jlist;
if (was_dirty)
set_buffer_jbddirty(bh);
}
void journal_file_buffer(struct journal_head *jh,
transaction_t *transaction, int jlist)
{
jbd_lock_bh_state(jh2bh(jh));
spin_lock(&transaction->t_journal->j_list_lock);
__journal_file_buffer(jh, transaction, jlist);
spin_unlock(&transaction->t_journal->j_list_lock);
jbd_unlock_bh_state(jh2bh(jh));
}
/*
* Remove a buffer from its current buffer list in preparation for
* dropping it from its current transaction entirely. If the buffer has
* already started to be used by a subsequent transaction, refile the
* buffer on that transaction's metadata list.
*
* Called under journal->j_list_lock
*
* Called under jbd_lock_bh_state(jh2bh(jh))
*/
void __journal_refile_buffer(struct journal_head *jh)
{
int was_dirty;
struct buffer_head *bh = jh2bh(jh);
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
if (jh->b_transaction)
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
/* If the buffer is now unused, just drop it. */
if (jh->b_next_transaction == NULL) {
__journal_unfile_buffer(jh);
return;
}
/*
* It has been modified by a later transaction: add it to the new
* transaction's metadata list.
*/
was_dirty = test_clear_buffer_jbddirty(bh);
__journal_temp_unlink_buffer(jh);
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
__journal_file_buffer(jh, jh->b_transaction,
was_dirty ? BJ_Metadata : BJ_Reserved);
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
if (was_dirty)
set_buffer_jbddirty(bh);
}
/*
* For the unlocked version of this call, also make sure that any
* hanging journal_head is cleaned up if necessary.
*
* __journal_refile_buffer is usually called as part of a single locked
* operation on a buffer_head, in which the caller is probably going to
* be hooking the journal_head onto other lists. In that case it is up
* to the caller to remove the journal_head if necessary. For the
* unlocked journal_refile_buffer call, the caller isn't going to be
* doing anything else to the buffer so we need to do the cleanup
* ourselves to avoid a jh leak.
*
* *** The journal_head may be freed by this call! ***
*/
void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
/*
* linux/include/linux/ext4_jbd.h
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1998--1999 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Ext4-specific journaling extensions.
*/
#ifndef _LINUX_EXT4_JBD_H
#define _LINUX_EXT4_JBD_H
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/ext4_fs.h>
#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
/* Define the number of blocks we need to account to a transaction to
* modify one block of data.
*
* We may have to touch one inode, one bitmap buffer, up to three
* indirection blocks, the group and superblock summaries, and the data
* block to complete the transaction. */
#define EXT4_SINGLEDATA_TRANS_BLOCKS 8U
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
* and the superblock, which are already accounted for. */
#define EXT4_XATTR_TRANS_BLOCKS 6U
/* Define the minimum size for a transaction which modifies data. This
* needs to take into account the fact that we may end up modifying two
* quota files too (one for the group, one for the user quota). The
* superblock only gets updated once, of course, so don't bother
* counting that again for the quota updates. */
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
2*EXT4_QUOTA_TRANS_BLOCKS(sb))
/* Delete operations potentially hit one directory's namespace plus an
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
* generous. We can grow the delete transaction later if necessary. */
#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
/* Define an arbitrary limit for the amount of data we will anticipate
* writing to any given transaction. For unbounded transactions such as
* write(2) and truncate(2) we can write more than this, but we always
* start off at the maximum transaction size and grow the transaction
* optimistically as we go. */
#define EXT4_MAX_TRANS_DATA 64U
/* We break up a large truncate or write transaction once the handle's
* buffer credits gets this low, we need either to extend the
* transaction or to start a new one. Reserve enough space here for
* inode, bitmap, superblock, group and indirection updates for at least
* one block, plus two quota updates. Quota allocations are not
* needed. */
#define EXT4_RESERVE_TRANS_BLOCKS 12U
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only inode+data */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
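/*
* Worked example (illustrative, not part of the original header): with
* quota enabled, EXT4_QUOTA_TRANS_BLOCKS(sb) is 2, so
* EXT4_DATA_TRANS_BLOCKS(sb) = 8 + 6 - 2 + 2*2 = 16 blocks, and
* EXT4_DELETE_TRANS_BLOCKS(sb) = 2*16 + 64 = 96. Without quota the
* figures drop to 12 and 88 respectively. Callers usually feed one of
* these macros straight to the handle-start path, e.g.
*
*	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(sb));
*/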
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc);
/*
* On success, We end up with an outstanding reference count against
* iloc->bh. This _must_ be cleaned up later.
*/
int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc);
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
* Wrapper functions with which ext4 calls into JBD. The intent here is
* to allow these to be turned into appropriate stubs so ext4 can control
* ext2 filesystems, so that ext2+ext4 systems need only one fs. This work hasn't
* been done yet.
*/
void ext4_journal_abort_handle(const char *caller, const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
static inline int
__ext4_journal_get_undo_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
int err = journal_get_undo_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
return err;
}
static inline int
__ext4_journal_get_write_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
int err = journal_get_write_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
return err;
}
static inline void
ext4_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
{
journal_release_buffer(handle, bh);
}
static inline int
__ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh)
{
int err = journal_forget(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
return err;
}
static inline int
__ext4_journal_revoke(const char *where, handle_t *handle,
unsigned long blocknr, struct buffer_head *bh)
{
int err = journal_revoke(handle, blocknr, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
return err;
}
static inline int
__ext4_journal_get_create_access(const char *where,
handle_t *handle, struct buffer_head *bh)
{
int err = journal_get_create_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
return err;
}
static inline int
__ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh)
{
int err = journal_dirty_metadata(handle, bh);
if (err)
ext4_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
return err;
}
#define ext4_journal_get_undo_access(handle, bh) \
__ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
__ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
__ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
__ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
__ext4_journal_forget(__FUNCTION__, (handle), (bh))
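/*
* Illustrative sketch (not part of the original header): the wrappers
* above funnel every JBD failure through ext4_journal_abort_handle()
* with the calling function's name attached. A typical
* modify-one-metadata-buffer sequence looks like the hypothetical
* helper below; the buffer is assumed to be held by the caller.
*/
static inline int example_modify_metadata(handle_t *handle,
					  struct buffer_head *bh)
{
	int err = ext4_journal_get_write_access(handle, bh);

	if (err)
		return err;
	/* ... modify bh->b_data under the handle here ... */
	return ext4_journal_dirty_metadata(handle, bh);
}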
int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
{
return ext4_journal_start_sb(inode->i_sb, nblocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__FUNCTION__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
}
static inline int ext4_journal_extend(handle_t *handle, int nblocks)
{
return journal_extend(handle, nblocks);
}
static inline int ext4_journal_restart(handle_t *handle, int nblocks)
{
return journal_restart(handle, nblocks);
}
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
return journal_blocks_per_page(inode);
}
static inline int ext4_journal_force_commit(journal_t *journal)
{
return journal_force_commit(journal);
}
/* super.c */
int ext4_force_commit(struct super_block *sb);
static inline int ext4_should_journal_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 1;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
return 1;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
return 1;
return 0;
}
static inline int ext4_should_order_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 0;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return 1;
return 0;
}
static inline int ext4_should_writeback_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
return 0;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return 1;
return 0;
}
#endif /* _LINUX_EXT4_JBD_H */
/*
* linux/include/linux/jbd.h
*
* Written by Stephen C. Tweedie <sct@redhat.com>
*
* Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Definitions for transaction data structures for the buffer cache
* filesystem journaling support.
*/
#ifndef _LINUX_JBD_H
#define _LINUX_JBD_H
/* Allow this file to be included directly into e2fsprogs */
#ifndef __KERNEL__
#include "jfs_compat.h"
#define JFS_DEBUG
#define jfs_debug jbd_debug
#else
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/journal-head.h>
#include <linux/stddef.h>
#include <linux/bit_spinlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <asm/semaphore.h>
#endif
#define journal_oom_retry 1
/*
* Define JBD_PARANOID_IOFAIL to cause a kernel BUG() if ext3 finds
* certain classes of error which can occur due to failed IOs. Under
* normal use we want ext3 to continue after such errors, because
* hardware _can_ fail, but for debugging purposes when running tests on
* known-good hardware we may want to trap these errors.
*/
#undef JBD_PARANOID_IOFAIL
/*
* The default maximum commit age, in seconds.
*/
#define JBD_DEFAULT_MAX_COMMIT_AGE 5
#ifdef CONFIG_JBD_DEBUG
/*
* Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
* consistency checks. By default we don't do this unless
* CONFIG_JBD_DEBUG is on.
*/
#define JBD_EXPENSIVE_CHECKING
extern int journal_enable_debug;
#define jbd_debug(n, f, a...) \
do { \
if ((n) <= journal_enable_debug) { \
printk (KERN_DEBUG "(%s, %d): %s: ", \
__FILE__, __LINE__, __FUNCTION__); \
printk (f, ## a); \
} \
} while (0)
#else
#define jbd_debug(f, a...) /**/
#endif
extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry);
extern void * jbd_slab_alloc(size_t size, gfp_t flags);
extern void jbd_slab_free(void *ptr, size_t size);
#define jbd_kmalloc(size, flags) \
__jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
#define jbd_rep_kmalloc(size, flags) \
__jbd_kmalloc(__FUNCTION__, (size), (flags), 1)
#define JFS_MIN_JOURNAL_BLOCKS 1024
#ifdef __KERNEL__
/**
* typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
*
* All filesystem modifications made by the process go
* through this handle. Recursive operations (such as quota operations)
* are gathered into a single update.
*
* The buffer credits field is used to account for journaled buffers
* being modified by the running process. To ensure that there is
* enough log space for all outstanding operations, we need to limit the
* number of outstanding buffers possible at any time. When the
* operation completes, any buffer credits not used are credited back to
* the transaction, so that at all times we know how many buffers the
* outstanding updates on a transaction might possibly touch.
*
* This is an opaque datatype.
**/
typedef struct handle_s handle_t; /* Atomic operation type */
/**
* typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
*
* journal_t is linked to from the fs superblock structure.
*
* We use the journal_t to keep track of all outstanding transaction
* activity on the filesystem, and to manage the state of the log
* writing process.
*
* This is an opaque datatype.
**/
typedef struct journal_s journal_t; /* Journal control structure */
#endif
/*
* Internal structures used by the logging mechanism:
*/
#define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */
/*
* On-disk structures
*/
/*
* Descriptor block types:
*/
#define JFS_DESCRIPTOR_BLOCK 1
#define JFS_COMMIT_BLOCK 2
#define JFS_SUPERBLOCK_V1 3
#define JFS_SUPERBLOCK_V2 4
#define JFS_REVOKE_BLOCK 5
/*
* Standard header for all descriptor blocks:
*/
typedef struct journal_header_s
{
__be32 h_magic;
__be32 h_blocktype;
__be32 h_sequence;
} journal_header_t;
/*
* The block tag: used to describe a single buffer in the journal
*/
typedef struct journal_block_tag_s
{
__be32 t_blocknr; /* The on-disk block number */
__be32 t_flags; /* See below */
} journal_block_tag_t;
/*
* The revoke descriptor: used on disk to describe a series of blocks to
* be revoked from the log
*/
typedef struct journal_revoke_header_s
{
journal_header_t r_header;
__be32 r_count; /* Count of bytes used in the block */
} journal_revoke_header_t;
/* Definitions for the journal tag flags word: */
#define JFS_FLAG_ESCAPE 1 /* on-disk block is escaped */
#define JFS_FLAG_SAME_UUID 2 /* block has same uuid as previous */
#define JFS_FLAG_DELETED 4 /* block deleted by this transaction */
#define JFS_FLAG_LAST_TAG 8 /* last tag in this descriptor block */
/*
* The journal superblock. All fields are in big-endian byte order.
*/
typedef struct journal_superblock_s
{
/* 0x0000 */
journal_header_t s_header;
/* 0x000C */
/* Static information describing the journal */
__be32 s_blocksize; /* journal device blocksize */
__be32 s_maxlen; /* total blocks in journal file */
__be32 s_first; /* first block of log information */
/* 0x0018 */
/* Dynamic information describing the current state of the log */
__be32 s_sequence; /* first commit ID expected in log */
__be32 s_start; /* blocknr of start of log */
/* 0x0020 */
/* Error value, as set by journal_abort(). */
__be32 s_errno;
/* 0x0024 */
/* Remaining fields are only valid in a version-2 superblock */
__be32 s_feature_compat; /* compatible feature set */
__be32 s_feature_incompat; /* incompatible feature set */
__be32 s_feature_ro_compat; /* readonly-compatible feature set */
/* 0x0030 */
__u8 s_uuid[16]; /* 128-bit uuid for journal */
/* 0x0040 */
__be32 s_nr_users; /* Nr of filesystems sharing log */
__be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/
/* 0x0048 */
__be32 s_max_transaction; /* Limit of journal blocks per trans.*/
__be32 s_max_trans_data; /* Limit of data blocks per trans. */
/* 0x0050 */
__u32 s_padding[44];
/* 0x0100 */
__u8 s_users[16*48]; /* ids of all fs'es sharing the log */
/* 0x0400 */
} journal_superblock_t;
#define JFS_HAS_COMPAT_FEATURE(j,mask) \
((j)->j_format_version >= 2 && \
((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
#define JFS_HAS_RO_COMPAT_FEATURE(j,mask) \
((j)->j_format_version >= 2 && \
((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
#define JFS_HAS_INCOMPAT_FEATURE(j,mask) \
((j)->j_format_version >= 2 && \
((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
/* Features known to this kernel version: */
#define JFS_KNOWN_COMPAT_FEATURES 0
#define JFS_KNOWN_ROCOMPAT_FEATURES 0
#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
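/*
* Usage note (illustrative, not from the original header): recovery
* gates revoke-record processing on the only incompatible feature this
* kernel knows about, along the lines of:
*
*	if (JFS_HAS_INCOMPAT_FEATURE(journal, JFS_FEATURE_INCOMPAT_REVOKE))
*		... scan and apply the revoke records ...
*/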
#ifdef __KERNEL__
#include <linux/fs.h>
#include <linux/sched.h>
#define JBD_ASSERTIONS
#ifdef JBD_ASSERTIONS
#define J_ASSERT(assert) \
do { \
if (!(assert)) { \
printk (KERN_EMERG \
"Assertion failure in %s() at %s:%d: \"%s\"\n", \
__FUNCTION__, __FILE__, __LINE__, # assert); \
BUG(); \
} \
} while (0)
#if defined(CONFIG_BUFFER_DEBUG)
void buffer_assertion_failure(struct buffer_head *bh);
#define J_ASSERT_BH(bh, expr) \
do { \
if (!(expr)) \
buffer_assertion_failure(bh); \
J_ASSERT(expr); \
} while (0)
#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr)
#else
#define J_ASSERT_BH(bh, expr) J_ASSERT(expr)
#define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
#endif
#else
#define J_ASSERT(assert) do { } while (0)
#endif /* JBD_ASSERTIONS */
#if defined(JBD_PARANOID_IOFAIL)
#define J_EXPECT(expr, why...) J_ASSERT(expr)
#define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr)
#define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr)
#else
#define __journal_expect(expr, why...) \
({ \
int val = (expr); \
if (!val) { \
printk(KERN_ERR \
"EXT3-fs unexpected failure: %s;\n",# expr); \
printk(KERN_ERR why "\n"); \
} \
val; \
})
#define J_EXPECT(expr, why...) __journal_expect(expr, ## why)
#define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why)
#define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why)
#endif
enum jbd_state_bits {
BH_JBD /* Has an attached ext3 journal_head */
= BH_PrivateStart,
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
BH_Freed, /* Has been freed (truncated) */
BH_Revoked, /* Has been revoked from the log */
BH_RevokeValid, /* Revoked flag is valid */
BH_JBDDirty, /* Is dirty but journaled */
BH_State, /* Pins most journal_head state */
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
};
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JWrite, jwrite)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
BUFFER_FNS(Revoked, revoked)
TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
static inline struct buffer_head *jh2bh(struct journal_head *jh)
{
return jh->b_bh;
}
static inline struct journal_head *bh2jh(struct buffer_head *bh)
{
return bh->b_private;
}
static inline void jbd_lock_bh_state(struct buffer_head *bh)
{
bit_spin_lock(BH_State, &bh->b_state);
}
static inline int jbd_trylock_bh_state(struct buffer_head *bh)
{
return bit_spin_trylock(BH_State, &bh->b_state);
}
static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
{
return bit_spin_is_locked(BH_State, &bh->b_state);
}
static inline void jbd_unlock_bh_state(struct buffer_head *bh)
{
bit_spin_unlock(BH_State, &bh->b_state);
}
static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
{
bit_spin_lock(BH_JournalHead, &bh->b_state);
}
static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
{
bit_spin_unlock(BH_JournalHead, &bh->b_state);
}
struct jbd_revoke_table_s;
/**
* struct handle_s - The handle_s type is the concrete type associated with
* handle_t.
* @h_transaction: Which compound transaction is this update a part of?
* @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
* @h_ref: Reference count on this handle
* @h_err: Field for caller's use to track errors through large fs operations
* @h_sync: flag for sync-on-close
* @h_jdata: flag to force data journaling
* @h_aborted: flag indicating fatal error on handle
**/
/* DocBook can't yet cope with the bit fields, but we will leave the
* documentation in so it can be fixed later.
*/
struct handle_s
{
/* Which compound transaction is this update a part of? */
transaction_t *h_transaction;
/* Number of remaining buffers we are allowed to dirty: */
int h_buffer_credits;
/* Reference count on this handle */
int h_ref;
/* Field for caller's use to track errors through large fs */
/* operations */
int h_err;
/* Flags [no locking] */
unsigned int h_sync: 1; /* sync-on-close */
unsigned int h_jdata: 1; /* force data journaling */
unsigned int h_aborted: 1; /* fatal error on handle */
};
/* The transaction_t type is the guts of the journaling mechanism. It
* tracks a compound transaction through its various states:
*
* RUNNING: accepting new updates
* LOCKED: Updates still running but we don't accept new ones
* RUNDOWN: Updates are tidying up but have finished requesting
* new buffers to modify (state not used for now)
* FLUSH: All updates complete, but we are still writing to disk
* COMMIT: All data on disk, writing commit record
* FINISHED: We still have to keep the transaction for checkpointing.
*
* The transaction keeps track of all of the buffers modified by a
* running transaction, and all of the buffers committed but not yet
* flushed to home for finished transactions.
*/
/*
* Lock ranking:
*
* j_list_lock
* ->jbd_lock_bh_journal_head() (This is "innermost")
*
* j_state_lock
* ->jbd_lock_bh_state()
*
* jbd_lock_bh_state()
* ->j_list_lock
*
* j_state_lock
* ->t_handle_lock
*
* j_state_lock
* ->j_list_lock (journal_unmap_buffer)
*
*/
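/*
* Illustrative condensation (not part of the original header): the
* journal_unmap_buffer() path in transaction.c follows this ranking
* literally. Reduced to just the lock choreography:
*
*	spin_lock(&journal->j_state_lock);
*	jbd_lock_bh_state(bh);
*	spin_lock(&journal->j_list_lock);
*	... inspect or refile the journal_head ...
*	spin_unlock(&journal->j_list_lock);
*	jbd_unlock_bh_state(bh);
*	spin_unlock(&journal->j_state_lock);
*/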
struct transaction_s
{
/* Pointer to the journal for this transaction. [no locking] */
journal_t *t_journal;
/* Sequence number for this transaction [no locking] */
tid_t t_tid;
/*
* Transaction's current state
* [no locking - only kjournald alters this]
* FIXME: needs barriers
* KLUDGE: [use j_state_lock]
*/
enum {
T_RUNNING,
T_LOCKED,
T_RUNDOWN,
T_FLUSH,
T_COMMIT,
T_FINISHED
} t_state;
/*
* Where in the log does this transaction's commit start? [no locking]
*/
unsigned long t_log_start;
/* Number of buffers on the t_buffers list [j_list_lock] */
int t_nr_buffers;
/*
* Doubly-linked circular list of all buffers reserved but not yet
* modified by this transaction [j_list_lock]
*/
struct journal_head *t_reserved_list;
/*
* Doubly-linked circular list of all buffers under writeout during
* commit [j_list_lock]
*/
struct journal_head *t_locked_list;
/*
* Doubly-linked circular list of all metadata buffers owned by this
* transaction [j_list_lock]
*/
struct journal_head *t_buffers;
/*
* Doubly-linked circular list of all data buffers still to be
* flushed before this transaction can be committed [j_list_lock]
*/
struct journal_head *t_sync_datalist;
/*
* Doubly-linked circular list of all forget buffers (superseded
* buffers which we can un-checkpoint once this transaction commits)
* [j_list_lock]
*/
struct journal_head *t_forget;
/*
* Doubly-linked circular list of all buffers still to be flushed before
* this transaction can be checkpointed. [j_list_lock]
*/
struct journal_head *t_checkpoint_list;
/*
* Doubly-linked circular list of all buffers submitted for IO while
* checkpointing. [j_list_lock]
*/
struct journal_head *t_checkpoint_io_list;
/*
* Doubly-linked circular list of temporary buffers currently undergoing
* IO in the log [j_list_lock]
*/
struct journal_head *t_iobuf_list;
/*
* Doubly-linked circular list of metadata buffers being shadowed by log
* IO. The IO buffers on the iobuf list and the shadow buffers on this
* list match each other one for one at all times. [j_list_lock]
*/
struct journal_head *t_shadow_list;
/*
* Doubly-linked circular list of control buffers being written to the
* log. [j_list_lock]
*/
struct journal_head *t_log_list;
/*
* Protects info related to handles
*/
spinlock_t t_handle_lock;
/*
* Number of outstanding updates running on this transaction
* [t_handle_lock]
*/
int t_updates;
/*
* Number of buffers reserved for use by all handles in this
* transaction but not yet modified. [t_handle_lock]
*/
int t_outstanding_credits;
/*
* Forward and backward links for the circular list of all transactions
* awaiting checkpoint. [j_list_lock]
*/
transaction_t *t_cpnext, *t_cpprev;
/*
* When will the transaction expire (become due for commit), in jiffies?
* [no locking]
*/
unsigned long t_expires;
/*
* How many handles used this transaction? [t_handle_lock]
*/
int t_handle_count;
};
/**
* struct journal_s - The journal_s type is the concrete type associated with
* journal_t.
* @j_flags: General journaling state flags
* @j_errno: Is there an outstanding uncleared error on the journal (from a
* prior abort)?
* @j_sb_buffer: buffer_head holding the on-disk journal superblock
* @j_superblock: pointer into j_sb_buffer's data: the superblock itself
* @j_format_version: Version of the superblock format
* @j_state_lock: Protect the various scalars in the journal
* @j_barrier_count: Number of processes waiting to create a barrier lock
* @j_barrier: The barrier lock itself
* @j_running_transaction: The current running transaction.
* @j_committing_transaction: the transaction we are pushing to disk
* @j_checkpoint_transactions: a linked circular list of all transactions
* waiting for checkpointing
* @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
* to start committing, or for a barrier lock to be released
* @j_wait_logspace: Wait queue for waiting for checkpointing to complete
* @j_wait_done_commit: Wait queue for waiting for commit to complete
* @j_wait_checkpoint: Wait queue to trigger checkpointing
* @j_wait_commit: Wait queue to trigger commit
* @j_wait_updates: Wait queue to wait for updates to complete
* @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
* @j_head: Journal head - identifies the first unused block in the journal
* @j_tail: Journal tail - identifies the oldest still-used block in the
* journal.
* @j_free: Journal free - how many free blocks are there in the journal?
* @j_first: The block number of the first usable block
* @j_last: The block number one beyond the last usable block
* @j_dev: Device where we store the journal
* @j_blocksize: blocksize for the location where we store the journal.
* @j_blk_offset: starting block offset into the device where we store the
* journal
* @j_fs_dev: Device which holds the client fs. For internal journal this will
* be equal to j_dev
* @j_maxlen: Total maximum capacity of the journal region on disk.
* @j_list_lock: Protects the buffer lists and internal buffer state.
* @j_inode: Optional inode where we store the journal. If present, all journal
* block numbers are mapped into this inode via bmap().
* @j_tail_sequence: Sequence number of the oldest transaction in the log
* @j_transaction_sequence: Sequence number of the next transaction to grant
* @j_commit_sequence: Sequence number of the most recently committed
* transaction
* @j_commit_request: Sequence number of the most recent transaction wanting
* commit
* @j_uuid: Uuid of client object.
* @j_task: Pointer to the current commit thread for this journal
* @j_max_transaction_buffers: Maximum number of metadata buffers to allow in a
* single compound commit transaction
* @j_commit_interval: What is the maximum transaction lifetime before we begin
* a commit?
* @j_commit_timer: The timer used to wakeup the commit thread
* @j_revoke_lock: Protect the revoke table
* @j_revoke: The revoke table - maintains the list of revoked blocks in the
* current transaction.
* @j_revoke_table: alternate revoke tables for j_revoke
* @j_wbuf: array of buffer_heads for journal_commit_transaction
* @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
* number that will fit in j_blocksize
* @j_last_sync_writer: most recent pid which did a synchronous write
* @j_private: An opaque pointer to fs-private information.
*/
struct journal_s
{
/* General journaling state flags [j_state_lock] */
unsigned long j_flags;
/*
* Is there an outstanding uncleared error on the journal (from a prior
* abort)? [j_state_lock]
*/
int j_errno;
/* The superblock buffer */
struct buffer_head *j_sb_buffer;
journal_superblock_t *j_superblock;
/* Version of the superblock format */
int j_format_version;
/*
* Protect the various scalars in the journal
*/
spinlock_t j_state_lock;
/*
* Number of processes waiting to create a barrier lock [j_state_lock]
*/
int j_barrier_count;
/* The barrier lock itself */
struct mutex j_barrier;
/*
* Transactions: The current running transaction...
* [j_state_lock] [caller holding open handle]
*/
transaction_t *j_running_transaction;
/*
* the transaction we are pushing to disk
* [j_state_lock] [caller holding open handle]
*/
transaction_t *j_committing_transaction;
/*
* ... and a linked circular list of all transactions waiting for
* checkpointing. [j_list_lock]
*/
transaction_t *j_checkpoint_transactions;
/*
* Wait queue for waiting for a locked transaction to start committing,
* or for a barrier lock to be released
*/
wait_queue_head_t j_wait_transaction_locked;
/* Wait queue for waiting for checkpointing to complete */
wait_queue_head_t j_wait_logspace;
/* Wait queue for waiting for commit to complete */
wait_queue_head_t j_wait_done_commit;
/* Wait queue to trigger checkpointing */
wait_queue_head_t j_wait_checkpoint;
/* Wait queue to trigger commit */
wait_queue_head_t j_wait_commit;
/* Wait queue to wait for updates to complete */
wait_queue_head_t j_wait_updates;
/* Mutex for locking against concurrent checkpoints */
struct mutex j_checkpoint_mutex;
/*
* Journal head: identifies the first unused block in the journal.
* [j_state_lock]
*/
unsigned long j_head;
/*
* Journal tail: identifies the oldest still-used block in the journal.
* [j_state_lock]
*/
unsigned long j_tail;
/*
* Journal free: how many free blocks are there in the journal?
* [j_state_lock]
*/
unsigned long j_free;
/*
* Journal start and end: the block numbers of the first usable block
* and one beyond the last usable block in the journal. [j_state_lock]
*/
unsigned long j_first;
unsigned long j_last;
/*
* Device, blocksize and starting block offset for the location where we
* store the journal.
*/
struct block_device *j_dev;
int j_blocksize;
unsigned long j_blk_offset;
/*
* Device which holds the client fs. For internal journal this will be
* equal to j_dev.
*/
struct block_device *j_fs_dev;
/* Total maximum capacity of the journal region on disk. */
unsigned int j_maxlen;
/*
* Protects the buffer lists and internal buffer state.
*/
spinlock_t j_list_lock;
/* Optional inode where we store the journal. If present, all */
/* journal block numbers are mapped into this inode via */
/* bmap(). */
struct inode *j_inode;
/*
* Sequence number of the oldest transaction in the log [j_state_lock]
*/
tid_t j_tail_sequence;
/*
* Sequence number of the next transaction to grant [j_state_lock]
*/
tid_t j_transaction_sequence;
/*
* Sequence number of the most recently committed transaction
* [j_state_lock].
*/
tid_t j_commit_sequence;
/*
* Sequence number of the most recent transaction wanting commit
* [j_state_lock]
*/
tid_t j_commit_request;
/*
* Journal uuid: identifies the object (filesystem, LVM volume etc)
* backed by this journal. This will eventually be replaced by an array
* of uuids, allowing us to index multiple devices within a single
* journal and to perform atomic updates across them.
*/
__u8 j_uuid[16];
/* Pointer to the current commit thread for this journal */
struct task_struct *j_task;
/*
* Maximum number of metadata buffers to allow in a single compound
* commit transaction
*/
int j_max_transaction_buffers;
/*
* What is the maximum transaction lifetime before we begin a commit?
*/
unsigned long j_commit_interval;
/* The timer used to wakeup the commit thread: */
struct timer_list j_commit_timer;
/*
* The revoke table: maintains the list of revoked blocks in the
* current transaction. [j_revoke_lock]
*/
spinlock_t j_revoke_lock;
struct jbd_revoke_table_s *j_revoke;
struct jbd_revoke_table_s *j_revoke_table[2];
/*
* array of bhs for journal_commit_transaction
*/
struct buffer_head **j_wbuf;
int j_wbufsize;
pid_t j_last_sync_writer;
/*
* An opaque pointer to fs-private information. ext3 puts its
* superblock pointer here
*/
void *j_private;
};
/*
* Journal flag definitions
*/
#define JFS_UNMOUNT 0x001 /* Journal thread is being destroyed */
#define JFS_ABORT 0x002 /* Journaling has been aborted for errors. */
#define JFS_ACK_ERR 0x004 /* The errno in the sb has been acked */
#define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */
#define JFS_LOADED 0x010 /* The journal superblock has been loaded */
#define JFS_BARRIER 0x020 /* Use IDE barriers */
/*
* Function declarations for the journaling transaction and buffer
* management
*/
/* Filing buffers */
extern void __journal_temp_unlink_buffer(struct journal_head *jh);
extern void journal_unfile_buffer(journal_t *, struct journal_head *);
extern void __journal_unfile_buffer(struct journal_head *);
extern void __journal_refile_buffer(struct journal_head *);
extern void journal_refile_buffer(journal_t *, struct journal_head *);
extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void __journal_free_buffer(struct journal_head *bh);
extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void __journal_clean_data_list(transaction_t *transaction);
/* Log buffer allocation */
extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
int journal_next_log_block(journal_t *, unsigned long *);
/* Commit management */
extern void journal_commit_transaction(journal_t *);
/* Checkpoint list management */
int __journal_clean_checkpoint_list(journal_t *journal);
int __journal_remove_checkpoint(struct journal_head *);
void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
/* Buffer IO */
extern int
journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
struct journal_head **jh_out,
unsigned long blocknr);
/* Transaction locking */
extern void __wait_on_journal (journal_t *);
/*
* Journal locking.
*
* We need to lock the journal during transaction state changes so that nobody
* ever tries to take a handle on the running transaction while we are in the
* middle of moving it to the commit phase. j_state_lock does this.
*
* Note that the locking is completely interrupt unsafe. We never touch
* journal structures from interrupts.
*/
static inline handle_t *journal_current_handle(void)
{
return current->journal_info;
}
/* The journaling code user interface:
*
* Create and destroy handles
* Register buffer modifications against the current transaction.
*/
extern handle_t *journal_start(journal_t *, int nblocks);
extern int journal_restart (handle_t *, int nblocks);
extern int journal_extend (handle_t *, int nblocks);
extern int journal_get_write_access(handle_t *, struct buffer_head *);
extern int journal_get_create_access (handle_t *, struct buffer_head *);
extern int journal_get_undo_access(handle_t *, struct buffer_head *);
extern int journal_dirty_data (handle_t *, struct buffer_head *);
extern int journal_dirty_metadata (handle_t *, struct buffer_head *);
extern void journal_release_buffer (handle_t *, struct buffer_head *);
extern int journal_forget (handle_t *, struct buffer_head *);
extern void journal_sync_buffer (struct buffer_head *);
extern void journal_invalidatepage(journal_t *,
struct page *, unsigned long);
extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
extern int journal_stop(handle_t *);
extern int journal_flush (journal_t *);
extern void journal_lock_updates (journal_t *);
extern void journal_unlock_updates (journal_t *);
extern journal_t * journal_init_dev(struct block_device *bdev,
struct block_device *fs_dev,
int start, int len, int bsize);
extern journal_t * journal_init_inode (struct inode *);
extern int journal_update_format (journal_t *);
extern int journal_check_used_features
(journal_t *, unsigned long, unsigned long, unsigned long);
extern int journal_check_available_features
(journal_t *, unsigned long, unsigned long, unsigned long);
extern int journal_set_features
(journal_t *, unsigned long, unsigned long, unsigned long);
extern int journal_create (journal_t *);
extern int journal_load (journal_t *journal);
extern void journal_destroy (journal_t *);
extern int journal_recover (journal_t *journal);
extern int journal_wipe (journal_t *, int);
extern int journal_skip_recovery (journal_t *);
extern void journal_update_superblock (journal_t *, int);
extern void __journal_abort_hard (journal_t *);
extern void journal_abort (journal_t *, int);
extern int journal_errno (journal_t *);
extern void journal_ack_err (journal_t *);
extern int journal_clear_err (journal_t *);
extern int journal_bmap(journal_t *, unsigned long, unsigned long *);
extern int journal_force_commit(journal_t *);
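/*
* Illustrative sketch (not part of the original header): the canonical
* update cycle built from the interface above, for one metadata buffer.
* The helper name is invented and the buffer is assumed to be held by
* the caller.
*/
static inline int example_update_one_buffer(journal_t *journal,
					    struct buffer_head *bh)
{
	handle_t *handle;
	int err, err2;

	handle = journal_start(journal, 1);	/* reserve one buffer credit */
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	err = journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data under the handle ... */
		err = journal_dirty_metadata(handle, bh);
	}
	err2 = journal_stop(handle);		/* may report -EIO after abort */
	return err ? err : err2;
}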
/*
* journal_head management
*/
struct journal_head *journal_add_journal_head(struct buffer_head *bh);
struct journal_head *journal_grab_journal_head(struct buffer_head *bh);
void journal_remove_journal_head(struct buffer_head *bh);
void journal_put_journal_head(struct journal_head *jh);
/*
* handle management
*/
extern kmem_cache_t *jbd_handle_cache;
static inline handle_t *jbd_alloc_handle(gfp_t gfp_flags)
{
return kmem_cache_alloc(jbd_handle_cache, gfp_flags);
}
static inline void jbd_free_handle(handle_t *handle)
{
kmem_cache_free(jbd_handle_cache, handle);
}
/* Primary revoke support */
#define JOURNAL_REVOKE_DEFAULT_HASH 256
extern int journal_init_revoke(journal_t *, int);
extern void journal_destroy_revoke_caches(void);
extern int journal_init_revoke_caches(void);
extern void journal_destroy_revoke(journal_t *);
extern int journal_revoke (handle_t *,
unsigned long, struct buffer_head *);
extern int journal_cancel_revoke(handle_t *, struct journal_head *);
extern void journal_write_revoke_records(journal_t *, transaction_t *);
/* Recovery revoke support */
extern int journal_set_revoke(journal_t *, unsigned long, tid_t);
extern int journal_test_revoke(journal_t *, unsigned long, tid_t);
extern void journal_clear_revoke(journal_t *);
extern void journal_switch_revoke_table(journal_t *journal);
/*
* The log thread user interface:
*
* Request space in the current transaction, and force transaction commit
* transitions on demand.
*/
int __log_space_left(journal_t *); /* Called with journal locked */
int log_start_commit(journal_t *journal, tid_t tid);
int __log_start_commit(journal_t *journal, tid_t tid);
int journal_start_commit(journal_t *journal, tid_t *tid);
int journal_force_commit_nested(journal_t *journal);
int log_wait_commit(journal_t *journal, tid_t tid);
int log_do_checkpoint(journal_t *journal);
void __log_wait_for_space(journal_t *journal);
extern void __journal_drop_transaction(journal_t *, transaction_t *);
extern int cleanup_journal_tail(journal_t *);
/* Debugging code only: */
#define jbd_ENOSYS() \
do { \
printk (KERN_ERR "JBD unimplemented function %s\n", __FUNCTION__); \
current->state = TASK_UNINTERRUPTIBLE; \
schedule(); \
} while (1)
/*
* is_journal_abort
*
* Simple wrapper to test the JFS_ABORT state flag. This bit, when set,
* indicates that we have had a fatal error somewhere, either inside the
* journaling layer or indicated to us by the client (e.g. ext3), and that
* we should not commit any further transactions.
*/
static inline int is_journal_aborted(journal_t *journal)
{
return journal->j_flags & JFS_ABORT;
}
static inline int is_handle_aborted(handle_t *handle)
{
if (handle->h_aborted)
return 1;
return is_journal_aborted(handle->h_transaction->t_journal);
}
static inline void journal_abort_handle(handle_t *handle)
{
handle->h_aborted = 1;
}
#endif /* __KERNEL__ */
/* Comparison functions for transaction IDs: perform comparisons using
* modulo arithmetic so that they work over sequence number wraps. */
static inline int tid_gt(tid_t x, tid_t y)
{
int difference = (x - y);
return (difference > 0);
}
static inline int tid_geq(tid_t x, tid_t y)
{
int difference = (x - y);
return (difference >= 0);
}
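/*
* Worked example (illustrative, not from the original header): the
* signed reinterpretation is what keeps these comparisons correct
* across a 32-bit sequence wrap. With x = 5 and y = 0xfffffffd,
* x - y is 8 as an unsigned value, which is positive as an int, so
* tid_gt(5, 0xfffffffd) correctly treats tid 5 as eight transactions
* "after" the wrapped tid; the reverse call yields a negative
* difference and returns false.
*/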
extern int journal_blocks_per_page(struct inode *inode);
/*
* Return the minimum number of blocks which must be free in the journal
* before a new transaction may be started. Must be called under j_state_lock.
*/
static inline int jbd_space_needed(journal_t *journal)
{
int nblocks = journal->j_max_transaction_buffers;
if (journal->j_committing_transaction)
nblocks += journal->j_committing_transaction->
t_outstanding_credits;
return nblocks;
}
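/*
* Usage sketch (illustrative, not from the original header): the
* handle-start path compares this value against the space actually left
* and stalls new handles until checkpointing frees room, roughly:
*
*	if (__log_space_left(journal) < jbd_space_needed(journal))
*		__log_wait_for_space(journal);
*/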
/*
* Definitions which augment the buffer_head layer
*/
/* journaling buffer types */
#define BJ_None 0 /* Not journaled */
#define BJ_SyncData 1 /* Normal data: flush before commit */
#define BJ_Metadata 2 /* Normal journaled metadata */
#define BJ_Forget 3 /* Buffer superseded by this transaction */
#define BJ_IO 4 /* Buffer is for temporary IO use */
#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
#define BJ_LogCtl 6 /* Buffer contains log descriptors */
#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
#define BJ_Locked 8 /* Locked for I/O during commit */
#define BJ_Types 9
extern int jbd_blocks_per_page(struct inode *inode);
#ifdef __KERNEL__
#define buffer_trace_init(bh) do {} while (0)
#define print_buffer_fields(bh) do {} while (0)
#define print_buffer_trace(bh) do {} while (0)
#define BUFFER_TRACE(bh, info) do {} while (0)
#define BUFFER_TRACE2(bh, bh2, info) do {} while (0)
#define JBUFFER_TRACE(jh, info) do {} while (0)
#endif /* __KERNEL__ */
#endif /* _LINUX_JBD_H */