[PATCH] OCFS2: The Second Oracle Cluster Filesystem

The OCFS2 file system module. Signed-off-by: N Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: N Kurt Hackel <kurt.hackel@oracle.com>

[PATCH] OCFS2: The Second Oracle Cluster Filesystem
The OCFS2 file system module. Signed-off-by: N Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: N Kurt Hackel <kurt.hackel@oracle.com>
ccd979bd · Mark Fasheh · Joel Becker · 8df08c89 · ccd979bd · ccd979bd
55 changed file
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -36,6 +36,8 @@ ntfs.txt
 	- info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
 	- info on Linux's /proc filesystem.
+ocfs2.txt
+	- info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
 	- Description of the ROMFS filesystem.
 smbfs.txt

--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+All code copyright 2005 Oracle except when otherwise noted.
+CREDITS:
+Lots of code taken from ext3 and other projects.
+Authors in alphabetical order:
+Joel Becker   <joel.becker@oracle.com>
+Zach Brown    <zach.brown@oracle.com>
+Mark Fasheh   <mark.fasheh@oracle.com>
+Kurt Hackel   <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh  <manish.singh@oracle.com>
+Caveats
+=======
+Features which OCFS2 does not support yet:
+	- sparse files
+	- extended attributes
+	- shared writeable mmap
+	- loopback is supported, but data written will not
+	  be cluster coherent.
+	- quotas
+	- cluster aware flock
+	- Directory change notification (F_NOTIFY)
+	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	- POSIX ACLs
+	- readpages / writepages (not user visible)
+Mount options
+=============
+OCFS2 supports the following mount options:
+(*) == default
+barrier=1		This enables/disables barriers. barrier=0 disables it,
+			barrier=1 enables it.
+errors=remount-ro(*)	Remount the filesystem read-only on an error.
+errors=panic		Panic and halt the machine if an error occurs.
+intr		(*)	Allow signals to interrupt cluster operations.
+nointr			Do not allow signals to interrupt cluster
+			operations.
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1905,6 +1905,15 @@ M:	ajoshi@shell.unixbox.com
 L:	linux-nvidia@lists.surfsouth.com
 S:	Maintained
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P:	Mark Fasheh
+M:	mark.fasheh@oracle.com
+P:	Kurt Hackel
+M:	kurt.hackel@oracle.com
+L:	ocfs2-devel@oss.oracle.com
+W:	http://oss.oracle.com/projects/ocfs2/
+S:	Supported	
 OLYMPIC NETWORK DRIVER
 P:	Peter De Shrijver
 M:	p2@ace.ulyssis.student.kuleuven.ac.be

--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
+EXTRA_CFLAGS += -Ifs/ocfs2
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+ocfs2-objs := \
+	alloc.o 		\
+	aops.o 			\
+	buffer_head_io.o	\
+	dcache.o 		\
+	dir.o 			\
+	dlmglue.o 		\
+	export.o 		\
+	extent_map.o 		\
+	file.o 			\
+	heartbeat.o 		\
+	inode.o 		\
+	journal.o 		\
+	localalloc.o 		\
+	mmap.o 			\
+	namei.o 		\
+	slot_map.o 		\
+	suballoc.o 		\
+	super.o 		\
+	symlink.o 		\
+	sysfile.o 		\
+	uptodate.o		\
+	ver.o 			\
+	vote.o
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_ALLOC_H
+#define OCFS2_ALLOC_H
+struct ocfs2_alloc_context;
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 blkno,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac);
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe);
+/* how many new metadata chunks would an allocation need at maximum? */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+{
+	/*
+	 * Rather than do all the work of determining how much we need
+	 * (involves a ton of reads and locks), just ask for the
+	 * maximal limit.  That's a tree depth shift.  So, one block for
+	 * level of the tree (current l_tree_depth), one block for the
+	 * new tree_depth==0 extent_block, and one block at the new
+	 * top-of-the tree.
+	 */
+	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+}
+int ocfs2_truncate_log_init(struct ocfs2_super *osb);
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel);
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy);
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy);
+struct ocfs2_truncate_context {
+	struct inode *tc_ext_alloc_inode;
+	struct buffer_head *tc_ext_alloc_bh;
+	int tc_ext_alloc_locked; /* is it cluster locked? */
+	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
+	struct buffer_head *tc_last_eb_bh;
+};
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc);
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc);
+#endif /* OCFS2_ALLOC_H */
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "symlink.h"
+#include "buffer_head_io.h"
+static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int status;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *buffer_cache_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	void *kaddr;
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
+	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
+		     (unsigned long long)iblock);
+		goto bail;
+	}
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  &bh, OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     fe->i_blkno, 7, fe->i_signature);
+		goto bail;
+	}
+	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+						    le32_to_cpu(fe->i_clusters))) {
+		mlog(ML_ERROR, "block offset is outside the allocated size: "
+		     "%llu\n", (unsigned long long)iblock);
+		goto bail;
+	}
+	/* We don't use the page cache to create symlink data, so if
+	 * need be, copy it over from the buffer cache. */
+	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
+		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
+			    iblock;
+		buffer_cache_bh = sb_getblk(osb->sb, blkno);
+		if (!buffer_cache_bh) {
+			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
+			goto bail;
+		}
+		/* we haven't locked out transactions, so a commit
+		 * could've happened. Since we've got a reference on
+		 * the bh, even if it commits while we're doing the
+		 * copy, the data is still good. */
+		if (buffer_jbd(buffer_cache_bh)
+		    && ocfs2_inode_is_new(inode)) {
+			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+			if (!kaddr) {
+				mlog(ML_ERROR, "couldn't kmap!\n");
+				goto bail;
+			}
+			memcpy(kaddr + (bh_result->b_size * iblock),
+			       buffer_cache_bh->b_data,
+			       bh_result->b_size);
+			kunmap_atomic(kaddr, KM_USER0);
+			set_buffer_uptodate(bh_result);
+		}
+		brelse(buffer_cache_bh);
+	}
+	map_bh(bh_result, inode->i_sb,
+	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
+	err = 0;
+bail:
+	if (bh)
+		brelse(bh);
+	mlog_exit(err);
+	return err;
+}
+static int ocfs2_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh_result, int create)
+{
+	int err = 0;
+	u64 p_blkno, past_eof;
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
+		     inode, inode->i_ino);
+	if (S_ISLNK(inode->i_mode)) {
+		/* this always does I/O for some reason. */
+		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
+		goto bail;
+	}
+	/* this can happen if another node truncs after our extend! */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+					       OCFS2_I(inode)->ip_clusters))
+		err = -EIO;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (err)
+		goto bail;
+	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  NULL);
+	if (err) {
+		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
+		     "%"MLFu64", NULL)\n", err, inode,
+		     (unsigned long long)iblock, p_blkno);
+		goto bail;
+	}
+	map_bh(bh_result, inode->i_sb, p_blkno);
+	if (bh_result->b_blocknr == 0) {
+		err = -EIO;
+		mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
+		     "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
+		     p_blkno, OCFS2_I(inode)->ip_blkno);
+	}
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+bail:
+	if (err < 0)
+		err = -EIO;
+	mlog_exit(err);
+	return err;
+}
+static int ocfs2_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int ret, unlock = 1;
+	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out;
+	}
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	/*
+	 * i_size might have just been updated as we grabed the meta lock.  We
+	 * might now be discovering a truncate that hit on another node.
+	 * block_read_full_page->get_block freaks out if it is asked to read
+	 * beyond the end of a file, so we check here.  Callers
+	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * and notice that the page they just read isn't needed.
+	 *
+	 * XXX sys_readahead() seems to get that wrong?
+	 */
+	if (start >= i_size_read(inode)) {
+		char *addr = kmap(page);
+		memset(addr, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		kunmap(page);
+		SetPageUptodate(page);
+		ret = 0;
+		goto out_alloc;
+	}
+	ret = ocfs2_data_lock_with_page(inode, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out_alloc;
+	}
+	ret = block_read_full_page(page, ocfs2_get_block);
+	unlock = 0;
+	ocfs2_data_unlock(inode, 0);
+out_alloc:
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 0);
+out:
+	if (unlock)
+		unlock_page(page);
+	mlog_exit(ret);
+	return ret;
+}
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * to during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+	mlog_entry("(0x%p)\n", page);
+	ret = block_write_full_page(page, ocfs2_get_block, wbc);
+	mlog_exit(ret);
+	return ret;
+}
+/*
+ * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
+ * from loopback.  It must be able to perform its own locking around
+ * ocfs2_get_block().
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 0);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret = 0;
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle->k_handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0) 
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (handle)
+			ocfs2_commit_trans(handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+static int ocfs2_commit_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = page->mapping->host;
+	struct ocfs2_journal_handle *handle = NULL;
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 *    nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 *    already to check for extending writes, hence insuring
+	 *    that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
+	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_data_lock_with_page(inode, 1, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+	if (extending) {
+		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock_data;
+		}
+		/* Mark our buffer early. We'd rather catch this error up here
+		 * as opposed to after a successful commit_write which would
+		 * require us to set back inode->i_size. */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+	/* might update i_size */
+	ret = generic_commit_write(file, page, from, to);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+	if (extending) {
+		loff_t size = (u64) i_size_read(inode);
+		struct ocfs2_dinode *di =
+			(struct ocfs2_dinode *)di_bh->b_data;
+		/* ocfs2_mark_inode_dirty is too heavy to use here. */
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		di->i_size = cpu_to_le64(size);
+		di->i_ctime = di->i_mtime = 
+				cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = di->i_mtime_nsec = 
+				cpu_to_le32(inode->i_mtime.tv_nsec);
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+	BUG_ON(extending && (i_size_read(inode) != new_i_size));
+out_commit:
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+	ocfs2_meta_unlock(inode, locklevel);
+out:
+	if (di_bh)
+		brelse(di_bh);
+	mlog_exit(ret);
+	return ret;
+}
+static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t status;
+	u64 p_blkno = 0;
+	int err = 0;
+	struct inode *inode = mapping->host;
+	mlog_entry("(block = %llu)\n", (unsigned long long)block);
+	/* We don't need to lock journal system files, since they aren't
+	 * accessed concurrently from multiple nodes.
+	 */
+	if (!INODE_JOURNAL(inode)) {
+		err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+		if (err) {
+			if (err != -ENOENT)
+				mlog_errno(err);
+			goto bail;
+		}
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	}
+	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+					  NULL);
+	if (!INODE_JOURNAL(inode)) {
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		ocfs2_meta_unlock(inode, 0);
+	}
+	if (err) {
+		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
+		     (unsigned long long)block);
+		mlog_errno(err);
+		goto bail;
+	}
+bail:
+	status = err ? 0 : p_blkno;
+	mlog_exit((int)status);
+	return status;
+}
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+				     unsigned long max_blocks,
+				     struct buffer_head *bh_result, int create)
+{
+	int ret;
+	u64 vbo_max; /* file offset, max_blocks from iblock */
+	u64 p_blkno;
+	int contig_blocks;
+	unsigned char blocksize_bits;
+	if (!inode || !bh_result) {
+		mlog(ML_ERROR, "inode or bh_result is null\n");
+		return -EIO;
+	}
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+	/* This function won't even be called if the request isn't all
+	 * nicely aligned and of the right size, so there's no need
+	 * for us to check any of that. */
+	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if ((iblock + max_blocks) >
+	    ocfs2_clusters_to_blocks(inode->i_sb,
+				     OCFS2_I(inode)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		ret = -EIO;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	/* This figures out the size of the next contiguous block, and
+	 * our logical offset */
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  &contig_blocks);
+	if (ret) {
+		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+		     (unsigned long long)iblock);
+		ret = -EIO;
+		goto bail;
+	}
+	map_bh(bh_result, inode->i_sb, p_blkno);
+	/* make sure we don't map more than max_blocks blocks here as
+	   that's all the kernel will handle at this point. */
+	if (max_blocks < contig_blocks)
+		contig_blocks = max_blocks;
+	bh_result->b_size = contig_blocks << blocksize_bits;
+bail:
+	return ret;
+}
+/* 
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  Like the core uses
+ * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
+ * truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+	ocfs2_iocb_clear_rw_locked(iocb);
+	up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, 0);
+}
+static ssize_t ocfs2_direct_IO(int rw,
+			       struct kiocb *iocb,
+			       const struct iovec *iov,
+			       loff_t offset,
+			       unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int ret;
+	mlog_entry_void();
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+					    inode->i_sb->s_bdev, iov, offset,
+					    nr_segs, 
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io);
+	mlog_exit(ret);
+	return ret;
+}
+struct address_space_operations ocfs2_aops = {
+	.readpage	= ocfs2_readpage,
+	.writepage	= ocfs2_writepage,
+	.prepare_write	= ocfs2_prepare_write,
+	.commit_write	= ocfs2_commit_write,
+	.bmap		= ocfs2_bmap,
+	.sync_page	= block_sync_page,
+	.direct_IO	= ocfs2_direct_IO
+};
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to);
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to);
+/* all ocfs2_dio_end_io()'s fault */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+	test_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_rw_locked(iocb) \
+	set_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+	clear_bit(0, (unsigned long *)&iocb->private)
+#endif /* OCFS2_FILE_H */
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+		      struct inode *inode)
+{
+	int ret = 0;
+	mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
+		   (unsigned long long)bh->b_blocknr, inode);
+	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+	BUG_ON(buffer_jbd(bh));
+	/* No need to check for a soft readonly file system here. non
+	 * journalled writes are only ever done on system files which
+	 * can get modified during recovery even if read-only. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto out;
+	}
+	down(&OCFS2_I(inode)->ip_io_sem);
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+	get_bh(bh); /* for end_buffer_write_sync() */                   
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		ocfs2_set_buffer_uptodate(inode, bh);
+	} else {
+		/* We don't need to remove the clustered uptodate
+		 * information for this bh as it's not marked locally
+		 * uptodate. */
+		ret = -EIO;
+		brelse(bh);
+	}
+	up(&OCFS2_I(inode)->ip_io_sem);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb;
+	int i, ignore_cache = 0;
+	struct buffer_head *bh;
+	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
+		   block, nr, flags, inode);
+	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	if (nr < 0) {
+		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	if (nr == 0) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		status = 0;
+		goto bail;
+	}
+	sb = osb->sb;
+	if (flags & OCFS2_BH_CACHED && !inode)
+		flags &= ~OCFS2_BH_CACHED;
+	if (inode)
+		down(&OCFS2_I(inode)->ip_io_sem);
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(sb, block++);
+			if (bhs[i] == NULL) {
+				if (inode)
+					up(&OCFS2_I(inode)->ip_io_sem);
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+		ignore_cache = 0;
+		if (flags & OCFS2_BH_CACHED &&
+		    !ocfs2_buffer_uptodate(inode, bh)) {
+			mlog(ML_UPTODATE,
+			     "bh (%llu), inode %"MLFu64" not uptodate\n",
+			     (unsigned long long)bh->b_blocknr,
+			     OCFS2_I(inode)->ip_blkno);
+			ignore_cache = 1;
+		}
+		/* XXX: Can we ever get this and *not* have the cached
+		 * flag set? */
+		if (buffer_jbd(bh)) {
+			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+				mlog(ML_BH_IO, "trying to sync read a jbd "
+					       "managed bh (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+			if (buffer_dirty(bh)) {
+				/* This should probably be a BUG, or
+				 * at least return an error. */
+				mlog(ML_BH_IO, "asking me to sync read a dirty "
+					       "buffer! (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+				continue;
+			}
+			lock_buffer(bh);
+			if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
+				mlog(ML_ERROR, "block %llu had the JBD bit set "
+					       "while I was in lock_buffer!",
+				     (unsigned long long)bh->b_blocknr);
+				BUG();
+#else
+				unlock_buffer(bh);
+				continue;
+#endif
+			}
+			clear_buffer_uptodate(bh);
+			get_bh(bh); /* for end_buffer_read_sync() */
+			bh->b_end_io = end_buffer_read_sync;
+			if (flags & OCFS2_BH_READAHEAD)
+				submit_bh(READA, bh);
+			else
+				submit_bh(READ, bh);
+			continue;
+		}
+	}
+	status = 0;
+	for (i = (nr - 1); i >= 0; i--) {
+		bh = bhs[i];
+		/* We know this can't have changed as we hold the
+		 * inode sem. Avoid doing any work on the bh if the
+		 * journal has it. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. Don't need to
+			 * remove the clustered uptodate information
+			 * for this bh as it's not marked locally
+			 * uptodate. */
+			status = -EIO;
+			brelse(bh);
+			bhs[i] = NULL;
+			continue;
+		}
+		if (inode)
+			ocfs2_set_buffer_uptodate(inode, bh);
+	}
+	if (inode)
+		up(&OCFS2_I(inode)->ip_io_sem);
+	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
+	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+bail:
+	mlog_exit(status);
+	return status;
+}
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_buffer_head.h
+ *
+ * Buffer cache handling functions defined
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_BUFFER_HEAD_IO_H
+#define OCFS2_BUFFER_HEAD_IO_H
+#include <linux/buffer_head.h>
+void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
+			     int uptodate);
+static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+				   u64                  off,
+				   struct buffer_head **bh,
+				   int                  flags,
+				   struct inode        *inode);
+int ocfs2_write_block(struct ocfs2_super          *osb,
+		      struct buffer_head  *bh,
+		      struct inode        *inode);
+int ocfs2_read_blocks(struct ocfs2_super          *osb,
+		      u64                  block,
+		      int                  nr,
+		      struct buffer_head  *bhs[],
+		      int                  flags,
+		      struct inode        *inode);
+#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_READAHEAD         8	/* use this to pass READA down to submit_bh */
+static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
+				   struct buffer_head **bh, int flags,
+				   struct inode *inode)
+{
+	int status = 0;
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+	status = ocfs2_read_blocks(osb, off, 1, bh,
+				   flags, inode);
+bail:
+	return status;
+}
+#endif /* OCFS2_BUFFER_HEAD_IO_H */
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.c
+ *
+ * dentry cache handling code
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#define MLOG_MASK_PREFIX ML_DCACHE
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dcache.h"
+#include "file.h"
+#include "inode.h"
+static int ocfs2_dentry_revalidate(struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	int ret = 0;    /* if all else fails, just return false */
+	struct ocfs2_super *osb;
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+	/* Never trust a negative dentry - force a new lookup. */
+	if (inode == NULL) {
+		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
+		     dentry->d_name.name);
+		goto bail;
+	}
+	osb = OCFS2_SB(inode->i_sb);
+	BUG_ON(!osb);
+	if (inode != osb->root_inode) {
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		/* did we or someone else delete this inode? */
+		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			mlog(0, "inode (%"MLFu64") deleted, returning false\n",
+			     OCFS2_I(inode)->ip_blkno);
+			goto bail;
+		}
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		if (!inode->i_nlink) {
+			mlog(0, "Inode %"MLFu64" orphaned, returning false "
+			     "dir = %d\n", OCFS2_I(inode)->ip_blkno,
+			     S_ISDIR(inode->i_mode));
+			goto bail;
+		}
+	}
+	ret = 1;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+struct dentry_operations ocfs2_dentry_ops = {
+	.d_revalidate		= ocfs2_dentry_revalidate,
+};
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_DCACHE_H
+#define OCFS2_DCACHE_H
+extern struct dentry_operations ocfs2_dentry_ops;
+#endif /* OCFS2_DCACHE_H */
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c
+ *
+ * Creates, reads, walks and deletes directory-nodes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linux Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+static unsigned char ocfs2_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh);
+/*
+ * ocfs2_readdir()
+ *
+ */
+int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	int error = 0;
+	unsigned long offset, blk;
+	int i, num, stored;
+	struct buffer_head * bh, * tmp;
+	struct ocfs2_dir_entry * de;
+	int err;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block * sb = inode->i_sb;
+	int have_disk_lock = 0;
+	mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+	stored = 0;
+	bh = NULL;
+	error = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (error < 0) {
+		if (error != -ENOENT)
+			mlog_errno(error);
+		/* we haven't got any yet, so propagate the error. */
+		stored = error;
+		goto bail;
+	}
+	have_disk_lock = 1;
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+	while (!error && !stored && filp->f_pos < i_size_read(inode)) {
+		blk = (filp->f_pos) >> sb->s_blocksize_bits;
+		bh = ocfs2_bread(inode, blk, &err, 0);
+		if (!bh) {
+			mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
+				       "at offset %lld\n",
+			     OCFS2_I(inode)->ip_blkno,
+			     filp->f_pos);
+			filp->f_pos += sb->s_blocksize - offset;
+			continue;
+		}
+		/*
+		 * Do the readahead (8k)
+		 */
+		if (!offset) {
+			for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+			     i > 0; i--) {
+				tmp = ocfs2_bread(inode, ++blk, &err, 1);
+				if (tmp)
+					brelse(tmp);
+			}
+		}
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			offset = i;
+			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+				| offset;
+			filp->f_version = inode->i_version;
+		}
+		while (!error && filp->f_pos < i_size_read(inode)
+		       && offset < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
+			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+				/* On error, skip the f_pos to the
+				   next block. */
+				filp->f_pos = (filp->f_pos |
+					       (sb->s_blocksize - 1)) + 1;
+				brelse(bh);
+				goto bail;
+			}
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out.  So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				unsigned long version = filp->f_version;
+				unsigned char d_type = DT_UNKNOWN;
+				if (de->file_type < OCFS2_FT_MAX)
+					d_type = ocfs2_filetype_table[de->file_type];
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						ino_from_blkno(sb, le64_to_cpu(de->inode)),
+						d_type);
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored ++;
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		offset = 0;
+		brelse(bh);
+	}
+	stored = 0;
+bail:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(inode, 0);
+	mlog_exit(stored);
+	return stored;
+}
+/*
+ * NOTE: this should always be called with parent dir i_sem taken.
+ */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent)
+{
+	int status = -ENOENT;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
+		   "inode=%p)\n",
+		   osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
+	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
+	if (!*dirent_bh || !*dirent) {
+		status = -ENOENT;
+		goto leave;
+	}
+	*blkno = le64_to_cpu((*dirent)->inode);
+	status = 0;
+leave:
+	if (status < 0) {
+		*dirent = NULL;
+		if (*dirent_bh) {
+			brelse(*dirent_bh);
+			*dirent_bh = NULL;
+		}
+	}
+	mlog_exit(status);
+	return status;
+}
+/* Check for a name within a directory.
+ *
+ * Return 0 if the name does not exist
+ * Return -EEXIST if the directory contains the name
+ *
+ * Callers should have i_sem + a cluster lock on dir
+ */
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen)
+{
+	int ret;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+	mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
+		   namelen, name);
+	ret = -EEXIST;
+	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
+	if (dirent_bh)
+		goto bail;
+	ret = 0;
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+	mlog_exit(ret);
+	return ret;
+}
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int ocfs2_empty_dir(struct inode *inode)
+{
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ocfs2_dir_entry * de, * de1;
+	struct super_block * sb;
+	int err;
+	sb = inode->i_sb;
+	if ((i_size_read(inode) <
+	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
+	    !(bh = ocfs2_bread(inode, 0, &err, 0))) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no data block\n",
+		     OCFS2_I(inode)->ip_blkno);
+		return 1;
+	}
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	de1 = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
+			!le64_to_cpu(de1->inode) ||
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no `.' or `..'\n",
+		     OCFS2_I(inode)->ip_blkno);
+		brelse(bh);
+		return 1;
+	}
+	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+	de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
+	while (offset < i_size_read(inode) ) {
+		if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
+			brelse(bh);
+			bh = ocfs2_bread(inode,
+					 offset >> sb->s_blocksize_bits, &err, 0);
+			if (!bh) {
+				mlog(ML_ERROR, "directory #%"MLFu64" contains "
+					       "a hole at offset %lu\n",
+				     OCFS2_I(inode)->ip_blkno, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+			brelse(bh);
+			return 1;
+		}
+		if (le64_to_cpu(de->inode)) {
+			brelse(bh);
+			return 0;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	}
+	brelse(bh);
+	return 1;
+}
+/* returns a bh of the 1st new block in the allocation. */
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh)
+{
+	int status;
+	int extend;
+	u64 p_blkno;
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+	if (extend) {
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+						    parent_fe_bh, handle,
+						    data_ac, meta_ac, NULL);
+		BUG_ON(status == -EAGAIN);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
+						   (sb->s_blocksize_bits - 9)),
+					     1, &p_blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	*new_bh = sb_getblk(sb, p_blkno);
+	if (!*new_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+/* assumes you already have a cluster lock on the directory. */
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh)
+{
+	int status = 0;
+	int credits, num_free_extents;
+	loff_t dir_i_size;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry * de;
+	struct super_block *sb = osb->sb;
+	mlog_entry_void();
+	dir_i_size = i_size_read(dir);
+	mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
+	     OCFS2_I(dir)->ip_blkno, dir_i_size);
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* dir->i_size is always block aligned. */
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		if (num_free_extents < 0) {
+			status = num_free_extents;
+			mlog_errno(status);
+			goto bail;
+		}
+		if (!num_free_extents) {
+			status = ocfs2_reserve_new_metadata(osb, handle,
+							    fe, &meta_ac);
+			if (status < 0) {
+				if (status != -ENOSPC)
+					mlog_errno(status);
+				goto bail;
+			}
+		}
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+	} else {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+	}
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+				     data_ac, meta_ac, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(dir, new_bh);
+	status = ocfs2_journal_access(handle, dir, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, sb->s_blocksize);
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = 0;
+	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	dir_i_size += dir->i_sb->s_blocksize;
+	i_size_write(dir, dir_i_size);
+	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	*new_de_bh = new_bh;
+	get_bh(*new_de_bh);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	if (new_bh)
+		brelse(new_bh);
+	mlog_exit(status);
+	return status;
+}
+/*
+ * Search the dir for a good spot, extending it if necessary. The
+ * block containing an appropriate record is returned in ret_de_bh.
+ */
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head * bh = NULL;
+	unsigned short rec_len;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb;
+	int status;
+	mlog_entry_void();
+	mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
+	     namelen, OCFS2_I(dir)->ip_blkno);
+	BUG_ON(!S_ISDIR(dir->i_mode));
+	fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
+	sb = dir->i_sb;
+	if (!namelen) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	bh = ocfs2_bread(dir, 0, &status, 0);
+	if (!bh) {
+		mlog_errno(status);
+		goto bail;
+	}
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (1) {
+		if ((char *)de >= sb->s_blocksize + bh->b_data) {
+			brelse(bh);
+			bh = NULL;
+			if (i_size_read(dir) <= offset) {
+				status = ocfs2_extend_dir(osb,
+							  dir,
+							  parent_fe_bh,
+							  &bh);
+				if (status < 0) {
+					mlog_errno(status);
+					goto bail;
+				}
+				BUG_ON(!bh);
+				*ret_de_bh = bh;
+				get_bh(*ret_de_bh);
+				goto bail;
+			}
+			bh = ocfs2_bread(dir,
+					 offset >> sb->s_blocksize_bits,
+					 &status,
+					 0);
+			if (!bh) {
+				mlog_errno(status);
+				goto bail;
+			}
+			/* move to next block */
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+			status = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			status = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = bh;
+			get_bh(*ret_de_bh);
+			status = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
+	}
+	status = 0;
+bail:
+	if (bh)
+		brelse(bh);
+	mlog_exit(status);
+	return status;
+}
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_DIR_H
+#define OCFS2_DIR_H
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen);
+int ocfs2_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent);
+int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh);
+struct ocfs2_alloc_context;
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh);
+#endif /* OCFS2_DIR_H */
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * description here
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+#define OCFS2_LVB_VERSION 2
+struct ocfs2_meta_lvb {
+	__be32       lvb_version;
+	__be32       lvb_iclusters;
+	__be32       lvb_iuid;
+	__be32       lvb_igid;
+	__be64       lvb_iatime_packed;
+	__be64       lvb_ictime_packed;
+	__be64       lvb_imtime_packed;
+	__be64       lvb_isize;
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_reserved[3];
+};
+/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* don't block waiting for the vote thread, instead return -EAGAIN */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
+int ocfs2_dlm_init(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode);
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags);
+#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page);
+void ocfs2_data_unlock(struct inode *inode,
+		       int write);
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags);
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex);
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex);
+int ocfs2_rename_lock(struct ocfs2_super *osb);
+void ocfs2_rename_unlock(struct ocfs2_super *osb);
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+/* for the vote thread */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
+/* aids in debugging and tracking lvbs */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres);
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+#endif	/* DLMGLUE_H */
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_ENDIAN_H
+#define OCFS2_ENDIAN_H
+static inline void le16_add_cpu(__le16 *var, u16 val)
+{
+	*var = cpu_to_le16(le16_to_cpu(*var) + val);
+}
+static inline void le32_add_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) + val);
+}
+static inline void le32_and_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) & val);
+}
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
+#endif /* OCFS2_ENDIAN_H */
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.c
+ *
+ * Functions to facilitate NFS exporting
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#define MLOG_MASK_PREFIX ML_EXPORT
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "inode.h"
+#include "buffer_head_io.h"
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;
+	u32 ih_generation;
+};
+static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+{
+	struct ocfs2_inode_handle *handle = vobjp;
+	struct inode *inode;
+	struct dentry *result;
+	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+	if (handle->ih_blkno == 0) {
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		return (void *)inode;
+	}
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+	result = d_alloc_anon(inode);
+	if (!result) {
+		iput(inode);
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+	mlog_exit_ptr(result);
+	return result;
+}
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *inode;
+	struct inode *dir = child->d_inode;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent;
+	mlog_entry("(0x%p, '%.*s')\n", child,
+		   child->d_name.len, child->d_name.name);
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		parent = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+	if (dirent_bh)
+		brelse(dirent_bh);
+bail:
+	mlog_exit_ptr(parent);
+	return parent;
+}
+static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name,
+		   fh, len, connectable);
+	if (len < 3 || (connectable && len < 6)) {
+		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+		type = 255;
+		goto bail;
+	}
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     blkno, generation);
+	len = 3;
+	fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	fh[2] = cpu_to_le32(generation);
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+		spin_lock(&dentry->d_lock);
+		parent = dentry->d_parent->d_inode;
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+		fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		fh[5] = cpu_to_le32(generation);
+		spin_unlock(&dentry->d_lock);
+		len = 6;
+		type = 2;
+		mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
+		     blkno, generation);
+	}
+	*max_len = len;
+bail:
+	mlog_exit(type);
+	return type;
+}
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+				      int fh_len, int fileid_type,
+				      int (*acceptable)(void *context,
+						        struct dentry *de),
+				      void *context)
+{
+	struct ocfs2_inode_handle handle, parent;
+	struct dentry *ret = NULL;
+	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
+		   sb, fh, fh_len, fileid_type, acceptable, context);
+	if (fh_len < 3 || fileid_type > 2)
+		goto bail;
+	if (fileid_type == 2) {
+		if (fh_len < 6)
+			goto bail;
+		parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
+		parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
+		parent.ih_generation = le32_to_cpu(fh[5]);
+		mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
+		     parent.ih_blkno, parent.ih_generation);
+	}
+	handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
+	handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
+	handle.ih_generation = le32_to_cpu(fh[2]);
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     handle.ih_blkno, handle.ih_generation);
+	ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
+						    acceptable, context);
+bail:
+	mlog_exit_ptr(ret);
+	return ret;
+}
+struct export_operations ocfs2_export_ops = {
+	.decode_fh	= ocfs2_decode_fh,
+	.encode_fh	= ocfs2_encode_fh,
+	.get_parent	= ocfs2_get_parent,
+	.get_dentry	= ocfs2_get_dentry,
+};
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_EXPORT_H
+#define OCFS2_EXPORT_H
+extern struct export_operations ocfs2_export_ops;
+#endif /* OCFS2_EXPORT_H */
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.h
+ *
+ * In-memory file extent mappings for OCFS2.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef _EXTENT_MAP_H
+#define _EXTENT_MAP_H
+int init_ocfs2_extent_maps(void);
+void exit_ocfs2_extent_maps(void);
+/*
+ * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
+ * to be held.  The allocation cannot change at all while the map is
+ * in the process of being updated.
+ */
+int ocfs2_extent_map_init(struct inode *inode);
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters);
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count);
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+#endif  /* _EXTENT_MAP_H */
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_FILE_H
+#define OCFS2_FILE_H
+extern struct file_operations ocfs2_fops;
+extern struct file_operations ocfs2_dops;
+extern struct inode_operations ocfs2_file_iops;
+extern struct inode_operations ocfs2_special_file_iops;
+struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
+int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size);
+#endif /* OCFS2_FILE_H */
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Register ourselves with the heartbaet service, keep our node maps
+ * up to date, and fire off recovery when needed.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kmod.h>
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <dlm/dlmapi.h>
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "vote.h"
+#include "buffer_head_io.h"
+#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
+#define OCFS2_HB_NODE_UP_PRI	   OCFS2_HB_NODE_DOWN_PRI
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit);
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit);
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+void ocfs2_init_node_maps(struct ocfs2_super *osb)
+{
+	spin_lock_init(&osb->node_map_lock);
+	ocfs2_node_map_init(&osb->mounted_map);
+	ocfs2_node_map_init(&osb->recovery_map);
+	ocfs2_node_map_init(&osb->umount_map);
+}
+static void ocfs2_do_node_down(int node_num,
+			       struct ocfs2_super *osb)
+{
+	BUG_ON(osb->node_num == node_num);
+	mlog(0, "ocfs2: node down event for %d\n", node_num);
+	if (!osb->dlm) {
+		/*
+		 * No DLM means we're not even ready to participate yet.
+		 * We check the slots after the DLM comes up, so we will
+		 * notice the node death then.  We can safely ignore it
+		 * here.
+		 */
+		return;
+	}
+	if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
+		/* If a node is in the umount map, then we've been
+		 * expecting him to go down and we know ahead of time
+		 * that recovery is not necessary. */
+		ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+		return;
+	}
+	ocfs2_recovery_thread(osb, node_num);
+	ocfs2_remove_node_from_vote_queues(osb, node_num);
+}
+static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
+				  int node_num,
+				  void *data)
+{
+	ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
+}
+/* Called from the dlm when it's about to evict a node. We may also
+ * get a heartbeat callback later. */
+static void ocfs2_dlm_eviction_cb(int node_num,
+				  void *data)
+{
+	struct ocfs2_super *osb = (struct ocfs2_super *) data;
+	struct super_block *sb = osb->sb;
+	mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
+	     MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
+	ocfs2_do_node_down(node_num, osb);
+}
+static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
+				int node_num,
+				void *data)
+{
+	struct ocfs2_super *osb = data;
+	BUG_ON(osb->node_num == node_num);
+	mlog(0, "node up event for %d\n", node_num);
+	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
+{
+	o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
+			    ocfs2_hb_node_down_cb, osb,
+			    OCFS2_HB_NODE_DOWN_PRI);
+	o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
+			    ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
+	/* Not exactly a heartbeat callback, but leads to essentially
+	 * the same path so we set it up here. */
+	dlm_setup_eviction_cb(&osb->osb_eviction_cb,
+			      ocfs2_dlm_eviction_cb,
+			      osb);
+}
+/* Most functions here are just stubs for now... */
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+	status = o2hb_register_callback(&osb->osb_hb_down);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = o2hb_register_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	return status;
+}
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+	status = o2hb_unregister_callback(&osb->osb_hb_down);
+	if (status < 0)
+		mlog_errno(status);
+	status = o2hb_unregister_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+}
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
+{
+	int ret;
+	char *argv[5], *envp[3];
+	if (!osb->uuid_str) {
+		/* This can happen if we don't get far enough in mount... */
+		mlog(0, "No UUID with which to stop heartbeat!\n\n");
+		return;
+	}
+	argv[0] = (char *)o2nm_get_hb_ctl_path();
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = osb->uuid_str;
+	argv[4] = NULL;
+	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+}
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!!  */
+void ocfs2_node_map_init(struct ocfs2_node_map *map)
+{
+	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+	       sizeof(unsigned long));
+}
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit)
+{
+	set_bit(bit, map->map);
+}
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_set_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit)
+{
+	clear_bit(bit, map->map);
+}
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_clear_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	int ret;
+	if (bit >= map->num_nodes) {
+		mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+		BUG();
+	}
+	spin_lock(&osb->node_map_lock);
+	ret = test_bit(bit, map->map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
+{
+	int bit;
+	bit = find_next_bit(map->map, map->num_nodes, 0);
+	if (bit < map->num_nodes)
+		return 0;
+	return 1;
+}
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map)
+{
+	int ret;
+	BUG_ON(map->num_nodes == 0);
+	spin_lock(&osb->node_map_lock);
+	ret = __ocfs2_node_map_is_empty(map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	BUG_ON(from->num_nodes == 0);
+	ocfs2_node_map_init(target);
+	__ocfs2_node_map_set(target, from);
+}
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit)
+{
+	struct ocfs2_node_map temp;
+	int ret;
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_dup(&temp, target);
+	__ocfs2_node_map_clear_bit(&temp, bit);
+	ret = __ocfs2_node_map_is_empty(&temp);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	int num_longs, i;
+	BUG_ON(target->num_nodes != from->num_nodes);
+	BUG_ON(target->num_nodes == 0);
+	num_longs = BITS_TO_LONGS(target->num_nodes);
+	for (i = 0; i < num_longs; i++)
+		target->map[i] = from->map[i];
+}
+/* Returns whether the recovery bit was actually set - it may not be
+ * if a node is still marked as needing recovery */
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num)
+{
+	int set = 0;
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
+	if (!test_bit(num, osb->recovery_map.map)) {
+	    __ocfs2_node_map_set_bit(&osb->recovery_map, num);
+	    set = 1;
+	}
+	spin_unlock(&osb->node_map_lock);
+	return set;
+}
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
+}
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx)
+{
+	int i = idx;
+	idx = O2NM_INVALID_NODE_NUM;
+	spin_lock(&osb->node_map_lock);
+	if ((i != O2NM_INVALID_NODE_NUM) &&
+	    (i >= 0) &&
+	    (i < map->num_nodes)) {
+		while(i < map->num_nodes) {
+			if (test_bit(i, map->map)) {
+				idx = i;
+				break;
+			}
+			i++;
+		}
+	}
+	spin_unlock(&osb->node_map_lock);
+	return idx;
+}
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef OCFS2_HEARTBEAT_H
+#define OCFS2_HEARTBEAT_H
+void ocfs2_init_node_maps(struct ocfs2_super *osb);
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs2_node_map_init(struct ocfs2_node_map *map);
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map);
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit);
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx);
+static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
+					       struct ocfs2_node_map *map)
+{
+	return ocfs2_node_map_iterate(osb, map, 0);
+}
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num);
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num);
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit);
+#endif /* OCFS2_HEARTBEAT_H */
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
--- a/fs/ocfs2/mmap.h
+++ b/fs/ocfs2/mmap.h
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
--- a/fs/ocfs2/ocfs1_fs_compat.h
+++ b/fs/ocfs2/ocfs1_fs_compat.h
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
--- a/fs/ocfs2/sysfile.h
+++ b/fs/ocfs2/sysfile.h
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
--- a/fs/ocfs2/ver.h
+++ b/fs/ocfs2/ver.h
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h