Orangefs: kernel client part 2

Signed-off-by: N Mike Marshall <hubcap@omnibond.com>

Orangefs: kernel client part 2
Signed-off-by: N Mike Marshall <hubcap@omnibond.com>
5db11c21 · Mike Marshall · f7ab093f · 5db11c21 · 5db11c21 · 5db11c21
6 changed files
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/fs_struct.h>
+
+/*
+ * Read the POSIX ACL of the requested type from an inode's extended
+ * attributes.
+ *
+ * @inode: inode whose ACL is wanted.
+ * @type:  ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT.
+ *
+ * Returns the in-memory ACL built by posix_acl_from_xattr() when the
+ * attribute exists, NULL when the lookup reports -ENODATA or -ENOSYS,
+ * and an ERR_PTR() for an unknown type (-EINVAL), allocation failure
+ * (-ENOMEM) or any other getxattr result.
+ */
+struct posix_acl *pvfs2_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *result;
+	char *xattr_key;
+	char *buf;
+	int nbytes;
+
+	if (type == ACL_TYPE_ACCESS) {
+		xattr_key = PVFS2_XATTR_NAME_ACL_ACCESS;
+	} else if (type == ACL_TYPE_DEFAULT) {
+		xattr_key = PVFS2_XATTR_NAME_ACL_DEFAULT;
+	} else {
+		gossip_err("pvfs2_get_acl: bogus value of type %d\n", type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * A maximum-sized buffer is allocated up front so that no extra
+	 * getxattr round trip is needed just to learn the value length.
+	 */
+	buf = kmalloc(PVFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "inode %pU, key %s, type %d\n",
+		     get_khandle_from_ino(inode), xattr_key, type);
+
+	nbytes = pvfs2_inode_getxattr(inode, "", xattr_key, buf,
+				      PVFS_MAX_XATTR_VALUELEN);
+
+	if (nbytes > 0) {
+		/* attribute present: convert it to the in-memory form */
+		result = posix_acl_from_xattr(&init_user_ns, buf, nbytes);
+	} else if (nbytes == -ENODATA || nbytes == -ENOSYS) {
+		/* no such attribute: not an error, there simply is no ACL */
+		result = NULL;
+	} else {
+		gossip_err("inode %pU retrieving acl's failed with error %d\n",
+			   get_khandle_from_ino(inode), nbytes);
+		result = ERR_PTR(nbytes);
+	}
+
+	kfree(buf);
+	return result;
+}
+
+/*
+ * Store a POSIX ACL of the given type on an inode as an extended
+ * attribute.
+ *
+ * @inode: inode to modify.
+ * @acl:   ACL to store, or NULL.
+ * @type:  ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT.
+ *
+ * For access ACLs the inode mode is first updated from
+ * posix_acl_equiv_mode(); when that call returns 0 the ACL is treated
+ * as NULL from then on.  A NULL ACL results in a setxattr call with a
+ * NULL value and zero size.  On success the ACL is also stored via
+ * set_cached_acl().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int pvfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	const char *xattr_key = NULL;
+	void *xattr_buf = NULL;
+	size_t xattr_len = 0;
+	umode_t new_mode;
+	int rc = 0;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xattr_key = PVFS2_XATTR_NAME_ACL_ACCESS;
+		if (!acl)
+			break;
+
+		/* try to express the ACL through the plain mode bits */
+		new_mode = inode->i_mode;
+		rc = posix_acl_equiv_mode(acl, &new_mode);
+		if (rc < 0) {
+			gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+				   __func__,
+				   rc);
+			return rc;
+		}
+
+		if (inode->i_mode != new_mode)
+			SetModeFlag(pvfs2_inode);
+		inode->i_mode = new_mode;
+		mark_inode_dirty_sync(inode);
+		/* a zero result means the ACL itself is dropped from here on */
+		if (rc == 0)
+			acl = NULL;
+		break;
+	case ACL_TYPE_DEFAULT:
+		xattr_key = PVFS2_XATTR_NAME_ACL_DEFAULT;
+		break;
+	default:
+		gossip_err("%s: invalid type %d!\n", __func__, type);
+		return -EINVAL;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: inode %pU, key %s type %d\n",
+		     __func__, get_khandle_from_ino(inode),
+		     xattr_key,
+		     type);
+
+	if (acl) {
+		xattr_len = posix_acl_xattr_size(acl->a_count);
+		xattr_buf = kmalloc(xattr_len, GFP_KERNEL);
+		if (!xattr_buf)
+			return -ENOMEM;
+
+		rc = posix_acl_to_xattr(&init_user_ns, acl,
+					xattr_buf, xattr_len);
+		if (rc < 0)
+			goto out;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: name %s, value %p, size %zd, acl %p\n",
+		     __func__, xattr_key, xattr_buf, xattr_len, acl);
+
+	/*
+	 * With a NULL acl the value is NULL and the size is 0, which is
+	 * meant to translate into a removexattr; flags are 0 so that a
+	 * missing attribute is not reported as an error.
+	 */
+	rc = pvfs2_inode_setxattr(inode, "", xattr_key,
+				  xattr_buf, xattr_len, 0);
+
+out:
+	kfree(xattr_buf);
+	if (!rc)
+		set_cached_acl(inode, type, acl);
+	return rc;
+}
+
+/*
+ * Set up the ACLs of a newly created inode from its parent directory.
+ *
+ * @inode: the new inode.
+ * @dir:   the directory the inode is created in.
+ *
+ * posix_acl_create() supplies the default and access ACLs plus a
+ * possibly adjusted mode; both ACLs are stored with pvfs2_set_acl()
+ * and released.  If the resulting mode differs from inode->i_mode the
+ * inode mode is updated and flushed.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int pvfs2_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	struct posix_acl *dflt_acl;
+	struct posix_acl *access_acl;
+	umode_t new_mode = inode->i_mode;
+	int rc;
+
+	ClearModeFlag(pvfs2_inode);
+
+	rc = posix_acl_create(dir, &new_mode, &dflt_acl, &access_acl);
+	if (rc)
+		return rc;
+
+	if (dflt_acl) {
+		rc = pvfs2_set_acl(inode, dflt_acl, ACL_TYPE_DEFAULT);
+		posix_acl_release(dflt_acl);
+	}
+
+	if (access_acl) {
+		/* skip the store if the default ACL already failed */
+		if (rc == 0)
+			rc = pvfs2_set_acl(inode, access_acl, ACL_TYPE_ACCESS);
+		posix_acl_release(access_acl);
+	}
+
+	/* a changed mode has to be pushed out with a forced setattr */
+	if (new_mode != inode->i_mode) {
+		SetModeFlag(pvfs2_inode);
+		inode->i_mode = new_mode;
+		pvfs2_flush_inode(inode);
+	}
+
+	return rc;
+}
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Implementation of dentry (directory cache) functions.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+
+/*
+ * Re-issue a lookup for the dentry's name in its parent and compare
+ * the returned handle with the one cached in the dentry's inode.
+ *
+ * Returns 1 if the dentry can still be trusted, else 0.  The dentry is
+ * dropped (d_drop) when the lookup operation fails, reports a non-zero
+ * status, or yields a different handle; it is left alone when the
+ * operation structure cannot be allocated.
+ */
+static int pvfs2_revalidate_lookup(struct dentry *dentry)
+{
+	struct dentry *parent_dentry = dget_parent(dentry);
+	struct inode *parent_inode = parent_dentry->d_inode;
+	struct pvfs2_inode_s *parent = PVFS2_I(parent_inode);
+	struct inode *inode = dentry->d_inode;
+	struct pvfs2_kernel_op_s *new_op;
+	int valid = 0;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
+
+	new_op = op_alloc(PVFS2_VFS_OP_LOOKUP);
+	if (!new_op)
+		goto put_parent;
+
+	new_op->upcall.req.lookup.sym_follow = PVFS2_LOOKUP_LINK_NO_FOLLOW;
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+	strncpy(new_op->upcall.req.lookup.d_name,
+		dentry->d_name.name,
+		PVFS2_NAME_LEN);
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s:%s:%d interrupt flag [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     get_interruptible_flag(parent_inode));
+
+	if (service_operation(new_op, "pvfs2_lookup",
+			      get_interruptible_flag(parent_inode)) != 0)
+		goto drop_dentry;
+
+	if (new_op->downcall.status == 0 &&
+	    match_handle(new_op->downcall.resp.lookup.refn.khandle, inode)) {
+		valid = 1;
+		goto release_op;
+	}
+
+	/* lookup reported an error or resolved to a different object */
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		"%s:%s:%d "
+		"lookup failure |%s| or no match |%s|.\n",
+		__FILE__,
+		__func__,
+		__LINE__,
+		new_op->downcall.status ? "true" : "false",
+		match_handle(new_op->downcall.resp.lookup.refn.khandle,
+				inode) ? "false" : "true");
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s:%s:%d revalidate failed\n",
+		     __FILE__, __func__, __LINE__);
+
+drop_dentry:
+	d_drop(dentry);
+release_op:
+	op_release(new_op);
+put_parent:
+	dput(parent_dentry);
+	return valid;
+}
+
+/*
+ * ->d_revalidate: decide whether a cached dentry may still be used.
+ *
+ * Returns -ECHILD when called with LOOKUP_RCU, 1 if the dentry can
+ * still be trusted, and 0 otherwise (negative dentry, failed lookup
+ * revalidation, or failed getattr).
+ */
+static int pvfs2_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *inode;
+	int getattr_rc;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
+		     __func__, dentry);
+
+	inode = dentry->d_inode;
+	if (!inode) {
+		gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
+			     __func__);
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: inode valid.\n", __func__);
+
+	/*
+	 * Step one: a fresh lookup confirms the object still exists and
+	 * still lives under this name.  The root handle is exempt.
+	 */
+	if (is_root_handle(inode)) {
+		gossip_debug(GOSSIP_DCACHE_DEBUG,
+			     "%s: root handle, lookup skipped.\n",
+			     __func__);
+	} else if (!pvfs2_revalidate_lookup(dentry)) {
+		return 0;
+	}
+
+	/* Step two: refresh the attributes. */
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s: doing getattr: inode: %p, handle: %pU\n",
+		     __func__,
+		     inode,
+		     get_khandle_from_ino(inode));
+	getattr_rc = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s: getattr %s (ret = %d), returning %s for dentry i_count=%d\n",
+		     __func__,
+		     (getattr_rc == 0 ? "succeeded" : "failed"),
+		     getattr_rc,
+		     (getattr_rc == 0 ? "valid" : "INVALID"),
+		     atomic_read(&inode->i_count));
+
+	return getattr_rc == 0 ? 1 : 0;
+}
+
+/* Dentry operations table; only ->d_revalidate is implemented. */
+const struct dentry_operations pvfs2_dentry_operations = {
+	.d_revalidate = pvfs2_d_revalidate,
+};
--- a/fs/orangefs/devpvfs2-req.c
+++ b/fs/orangefs/devpvfs2-req.c
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * Per-call state for one readdir request: the decoded response plus
+ * the resources released again by readdir_handle_dtor().
+ */
+struct readdir_handle_s {
+	/* index handed back via readdir_index_put(); -1 when not held */
+	int buffer_index;
+	/* decoded response; dirent_array is kmalloc'ed by decode_dirents() */
+	struct pvfs2_readdir_response_s readdir_response;
+	/* raw buffer the response was decoded from; vfree'd by the dtor */
+	void *dents_buf;
+};
+
+/*
+ * Decode the readdir response found in the shared buffer at @ptr into
+ * @readdir.
+ *
+ * The token and entry count are copied from the start of the buffer,
+ * an array of that many dirents is allocated, and each entry is decoded
+ * as a string (via dec_string()) followed by a 16 byte khandle.
+ *
+ * The array is allocated with kmalloc_array() so that an entry count
+ * large enough to overflow "count * sizeof(entry)" results in -ENOMEM
+ * rather than an undersized allocation.
+ *
+ * NOTE(review): the decoding loop does not know the size of the buffer
+ * and therefore cannot bound-check against it; the caller compares the
+ * returned byte count with the trailer size only afterwards.
+ *
+ * Returns the number of bytes consumed from @ptr, or -ENOMEM.  On
+ * success the caller owns readdir->dirent_array and must kfree() it.
+ */
+static long decode_dirents(char *ptr, struct pvfs2_readdir_response_s *readdir)
+{
+	int i;
+	struct pvfs2_readdir_response_s *rd =
+		(struct pvfs2_readdir_response_s *) ptr;
+	char *buf = ptr;
+	char **pptr = &buf;
+
+	readdir->token = rd->token;
+	readdir->pvfs_dirent_outcount = rd->pvfs_dirent_outcount;
+	readdir->dirent_array = kmalloc_array(readdir->pvfs_dirent_outcount,
+					      sizeof(*readdir->dirent_array),
+					      GFP_KERNEL);
+	if (readdir->dirent_array == NULL)
+		return -ENOMEM;
+
+	/* the encoded entries start where dirent_array sits in the struct */
+	*pptr += offsetof(struct pvfs2_readdir_response_s, dirent_array);
+	for (i = 0; i < readdir->pvfs_dirent_outcount; i++) {
+		/* presumably advances *pptr past the name -- confirm */
+		dec_string(pptr, &readdir->dirent_array[i].d_name,
+			   &readdir->dirent_array[i].d_length);
+		readdir->dirent_array[i].khandle =
+			*(struct pvfs2_khandle *) *pptr;
+		/* step over the khandle that follows each name */
+		*pptr += 16;
+	}
+	return (unsigned long)*pptr - (unsigned long)ptr;
+}
+
+/*
+ * Initialise @rhandle from a readdir trailer buffer.
+ *
+ * @rhandle:      handle to fill in.
+ * @buf:          buffer holding the encoded readdir response.
+ * @buffer_index: index associated with this request.
+ *
+ * Returns the number of bytes decoded, -ENOMEM for a NULL @buf,
+ * -EINVAL for a negative @buffer_index, or the error returned by
+ * decode_dirents().  When decoding fails @buf is vfree'd here and the
+ * handle is reset so that it no longer refers to the buffer or index.
+ */
+static long readdir_handle_ctor(struct readdir_handle_s *rhandle, void *buf,
+				int buffer_index)
+{
+	long decoded;
+
+	if (!buf) {
+		gossip_err("Invalid NULL buffer specified in readdir_handle_ctor\n");
+		return -ENOMEM;
+	}
+	if (buffer_index < 0) {
+		gossip_err("Invalid buffer index specified in readdir_handle_ctor\n");
+		return -EINVAL;
+	}
+
+	rhandle->buffer_index = buffer_index;
+	rhandle->dents_buf = buf;
+
+	decoded = decode_dirents(buf, &rhandle->readdir_response);
+	if (decoded >= 0)
+		return decoded;
+
+	/* decoding failed: undo the bookkeeping and free the buffer */
+	gossip_err("Could not decode readdir from buffer %ld\n", decoded);
+	rhandle->buffer_index = -1;
+	gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", buf);
+	vfree(buf);
+	rhandle->dents_buf = NULL;
+	return decoded;
+}
+
+/*
+ * Release everything a readdir handle holds: the decoded dirent array,
+ * the buffer index (returned through readdir_index_put()) and the raw
+ * buffer.  A NULL @rhandle is ignored, and each field is reset so that
+ * calling this twice is harmless.
+ */
+static void readdir_handle_dtor(struct pvfs2_bufmap *bufmap,
+		struct readdir_handle_s *rhandle)
+{
+	void *raw;
+
+	if (!rhandle)
+		return;
+
+	/* no NULL check needed: kfree() accepts NULL */
+	kfree(rhandle->readdir_response.dirent_array);
+	rhandle->readdir_response.dirent_array = NULL;
+
+	if (rhandle->buffer_index >= 0) {
+		readdir_index_put(bufmap, rhandle->buffer_index);
+		rhandle->buffer_index = -1;
+	}
+
+	raw = rhandle->dents_buf;
+	if (!raw)
+		return;
+
+	gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", raw);
+	vfree(raw);
+	rhandle->dents_buf = NULL;
+}
+
+/*
+ * Read directory entries from an instance of an open directory
+ * (the ->iterate file operation).
+ *
+ * One readdir upcall is issued per call, starting from the token kept
+ * in file->private_data.  The response is decoded from the operation's
+ * trailer buffer and handed to dir_emit() entry by entry, preceded by
+ * synthetic "." and ".." entries when ctx->pos is 0 or 1.
+ *
+ * \param file open directory; private_data points to the __u64 token.
+ * \param ctx  directory context carrying the position and the emit
+ *             callback.
+ *
+ * \retval <0 on error
+ * \retval 0  when ctx->pos already is PVFS_READDIR_END
+ * \retval otherwise the value left in ret by the last call made
+ *
+ * NOTE(review): every dir_emit() result below is tested with "< 0".
+ * In mainline kernels dir_emit() returns a bool that is true when the
+ * entry was accepted; if that holds for the kernel this builds against,
+ * those tests can never fire, the "buffer full" handling further down
+ * is unreachable, and a positive value is returned on success.
+ * Confirm against include/linux/fs.h before relying on these checks.
+ */
+static int pvfs2_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct pvfs2_bufmap *bufmap = NULL;
+	int ret = 0;
+	int buffer_index;
+	__u64 *ptoken = file->private_data;
+	__u64 pos = 0;
+	ino_t ino = 0;
+	struct dentry *dentry = file->f_path.dentry;
+	struct pvfs2_kernel_op_s *new_op = NULL;
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(dentry->d_inode);
+	int buffer_full = 0;
+	struct readdir_handle_s rhandle;
+	int i = 0;
+	int len = 0;
+	ino_t current_ino = 0;
+	char *current_entry = NULL;
+	long bytes_decoded;
+
+	gossip_ldebug(GOSSIP_DIR_DEBUG,
+		      "%s: ctx->pos:%lld, token = %llu\n",
+		      __func__,
+		      lld(ctx->pos),
+		      llu(*ptoken));
+
+	pos = (__u64) ctx->pos;
+
+	/* are we done? */
+	if (pos == PVFS_READDIR_END) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Skipping to termination path\n");
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "pvfs2_readdir called on %s (pos=%llu)\n",
+		     dentry->d_name.name, llu(pos));
+
+	/* start with an empty handle so the dtor is safe on any path */
+	rhandle.buffer_index = -1;
+	rhandle.dents_buf = NULL;
+	memset(&rhandle.readdir_response, 0, sizeof(rhandle.readdir_response));
+
+	new_op = op_alloc(PVFS2_VFS_OP_READDIR);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->uses_shared_memory = 1;
+	new_op->upcall.req.readdir.refn = pvfs2_inode->refn;
+	new_op->upcall.req.readdir.max_dirent_count = MAX_DIRENT_COUNT_READDIR;
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "%s: upcall.req.readdir.refn.khandle: %pU\n",
+		     __func__,
+		     &new_op->upcall.req.readdir.refn.khandle);
+
+	/*
+	 * NOTE: the position we send to the readdir upcall is out of
+	 * sync with ctx->pos since:
+	 * 1. pvfs2 doesn't include the "." and ".." entries that are
+	 *    added below.
+	 * 2. the introduction of distributed directory logic makes token no
+	 *    longer be related to f_pos and pos. Instead an independent
+	 *    variable is used inside the function and stored in the
+	 *    private_data of the file structure.
+	 */
+	new_op->upcall.req.readdir.token = *ptoken;
+
+get_new_buffer_index:
+	ret = readdir_index_get(&bufmap, &buffer_index);
+	if (ret < 0) {
+		gossip_lerr("pvfs2_readdir: readdir_index_get() failure (%d)\n",
+			    ret);
+		goto out_free_op;
+	}
+	new_op->upcall.req.readdir.buf_index = buffer_index;
+
+	ret = service_operation(new_op,
+				"pvfs2_readdir",
+				get_interruptible_flag(dentry->d_inode));
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "Readdir downcall status is %d.  ret:%d\n",
+		     new_op->downcall.status,
+		     ret);
+
+	if (ret == -EAGAIN && op_state_purged(new_op)) {
+		/*
+		 * readdir shared memory area has been wiped due to
+		 * pvfs2-client-core restarting, so we must get a new
+		 * index into the shared memory.
+		 */
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			"%s: Getting new buffer_index for retry of readdir..\n",
+			 __func__);
+		readdir_index_put(bufmap, buffer_index);
+		goto get_new_buffer_index;
+	}
+
+	if (ret == -EIO && op_state_purged(new_op)) {
+		gossip_err("%s: Client is down. Aborting readdir call.\n",
+			__func__);
+		readdir_index_put(bufmap, buffer_index);
+		goto out_free_op;
+	}
+
+	if (ret < 0 || new_op->downcall.status != 0) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Readdir request failed.  Status:%d\n",
+			     new_op->downcall.status);
+		readdir_index_put(bufmap, buffer_index);
+		/* the operation itself succeeded: report the downcall status */
+		if (ret >= 0)
+			ret = new_op->downcall.status;
+		goto out_free_op;
+	}
+
+	bytes_decoded =
+		readdir_handle_ctor(&rhandle,
+				    new_op->downcall.trailer_buf,
+				    buffer_index);
+	if (bytes_decoded < 0) {
+		/*
+		 * Report the decode error itself; ret still holds the
+		 * (successful) service_operation() result at this point.
+		 */
+		gossip_err("pvfs2_readdir: Could not decode trailer buffer into a readdir response %ld\n",
+			bytes_decoded);
+		ret = bytes_decoded;
+		/*
+		 * The ctor already reset the handle on failure, so the
+		 * index is returned here and the dtor is skipped.
+		 */
+		readdir_index_put(bufmap, buffer_index);
+		goto out_free_op;
+	}
+
+	if (bytes_decoded != new_op->downcall.trailer_size) {
+		gossip_err("pvfs2_readdir: # bytes decoded (%ld) != trailer size (%ld)\n",
+			bytes_decoded,
+			(long)new_op->downcall.trailer_size);
+		ret = -EINVAL;
+		goto out_destroy_handle;
+	}
+
+	if (pos == 0) {
+		ino = get_ino_from_khandle(dentry->d_inode);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \".\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
+		/* NOTE(review): see header comment about this test */
+		if (ret < 0)
+			goto out_destroy_handle;
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+		pos++;
+	}
+
+	if (pos == 1) {
+		ino = get_parent_ino_from_dentry(dentry);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \"..\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
+		/* NOTE(review): see header comment about this test */
+		if (ret < 0)
+			goto out_destroy_handle;
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+		pos++;
+	}
+
+	for (i = 0; i < rhandle.readdir_response.pvfs_dirent_outcount; i++) {
+		len = rhandle.readdir_response.dirent_array[i].d_length;
+		current_entry = rhandle.readdir_response.dirent_array[i].d_name;
+		current_ino = pvfs2_khandle_to_ino(
+			&(rhandle.readdir_response.dirent_array[i].khandle));
+
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "calling dir_emit for %s with len %d, pos %ld\n",
+			     current_entry,
+			     len,
+			     (unsigned long)pos);
+		ret =
+		    dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
+		/* NOTE(review): see header comment about this test */
+		if (ret < 0) {
+			gossip_debug(GOSSIP_DIR_DEBUG,
+				     "dir_emit() failed. ret:%d\n",
+				     ret);
+			if (i < 2) {
+				gossip_err("dir_emit failed on one of the first two true PVFS directory entries.\n");
+				gossip_err("Duplicate entries may appear.\n");
+			}
+			buffer_full = 1;
+			break;
+		}
+		ctx->pos++;
+		gossip_ldebug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+
+		pos++;
+	}
+
+	/* this means that all of the dir_emit calls succeeded */
+	if (i == rhandle.readdir_response.pvfs_dirent_outcount) {
+		/* update token */
+		*ptoken = rhandle.readdir_response.token;
+	} else {
+		/* this means a dir_emit call failed */
+		if (rhandle.readdir_response.token == PVFS_READDIR_END) {
+			/*
+			 * If PVFS hit end of directory, then there
+			 * is no way to do math on the token that it
+			 * returned. Instead we go by ctx->pos but
+			 * back up to account for the artificial .
+			 * and .. entries.
+			 */
+			ctx->pos -= 3;
+		} else {
+			/*
+			 * this means a dir_emit call failed. !!! need to set
+			 * back to previous ctx->pos, no middle value allowed
+			 */
+			pos -= (i - 1);
+			ctx->pos -= (i - 1);
+		}
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			"at least one dir_emit call failed. Setting ctx->pos to: %lld\n",
+			lld(ctx->pos));
+	}
+
+	/*
+	 * Did we hit the end of the directory?
+	 */
+	if (rhandle.readdir_response.token == PVFS_READDIR_END &&
+	    !buffer_full) {
+		gossip_debug(GOSSIP_DIR_DEBUG, "End of dir detected; setting ctx->pos to PVFS_READDIR_END.\n");
+		ctx->pos = PVFS_READDIR_END;
+	}
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "pos = %llu, token = %llu"
+		     ", ctx->pos should have been %lld\n",
+		     llu(pos),
+		     llu(*ptoken),
+		     lld(ctx->pos));
+
+out_destroy_handle:
+	/* frees the dirent array and buffer, returns the buffer index */
+	readdir_handle_dtor(bufmap, &rhandle);
+out_free_op:
+	op_release(new_op);
+	gossip_debug(GOSSIP_DIR_DEBUG, "pvfs2_readdir returning %d\n", ret);
+	return ret;
+}
+
+/*
+ * ->open for directories: allocate the per-open readdir token, hang it
+ * off file->private_data and set it to PVFS_READDIR_START.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int pvfs2_dir_open(struct inode *inode, struct file *file)
+{
+	__u64 *token = kmalloc(sizeof(__u64), GFP_KERNEL);
+
+	file->private_data = token;
+	if (token == NULL)
+		return -ENOMEM;
+
+	*token = PVFS_READDIR_START;
+	return 0;
+}
+
+/*
+ * ->release for directories: flush the inode, then free the readdir
+ * token allocated by pvfs2_dir_open().  Always returns 0.
+ */
+static int pvfs2_dir_release(struct inode *inode, struct file *file)
+{
+	void *token;
+
+	pvfs2_flush_inode(inode);
+
+	token = file->private_data;
+	kfree(token);
+
+	return 0;
+}
+
+/**
+ * PVFS2 implementation of VFS directory operations.
+ *
+ * open/release manage the per-open readdir token kept in
+ * file->private_data; iterate produces the directory entries.
+ */
+const struct file_operations pvfs2_dir_operations = {
+	.read = generic_read_dir,
+	.iterate = pvfs2_readdir,
+	.open = pvfs2_dir_open,
+	.release = pvfs2_dir_release,
+};
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Linux VFS inode operations.
+ */
+
+#include "protocol.h"
+#include "pvfs2-kernel.h"
+#include "pvfs2-bufmap.h"
+
+/*
+ * Fill one page cache page with file data.
+ *
+ * The page is mapped, read via pvfs2_inode_read() when its index lies
+ * within the file, and whatever was not read is zero-filled.  The page
+ * is marked up to date on success or flagged with an error on failure,
+ * and is always unlocked before returning.
+ *
+ * The block count is held in a pgoff_t and derived from a single
+ * i_size_read() snapshot: a plain int could not hold the count for
+ * very large files, and it was being compared against the unsigned
+ * page->index.
+ *
+ * Returns 0 on success or the negative value returned by
+ * pvfs2_inode_read().
+ */
+static int read_one_page(struct page *page)
+{
+	void *page_data;
+	int ret;
+	pgoff_t max_block;
+	ssize_t bytes_read = 0;
+	struct inode *inode = page->mapping->host;
+	const loff_t isize = i_size_read(inode);
+	const __u32 blocksize = PAGE_CACHE_SIZE;	/* inode->i_blksize */
+	const __u32 blockbits = PAGE_CACHE_SHIFT;	/* inode->i_blkbits */
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		    "pvfs2_readpage called with page %p\n",
+		     page);
+	page_data = pvfs2_kmap(page);
+
+	/* number of blocks needed to cover the file, rounded up */
+	max_block = (pgoff_t)(isize >> blockbits) + 1;
+
+	if (page->index < max_block) {
+		loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
+
+		bytes_read = pvfs2_inode_read(inode,
+					      page_data,
+					      blocksize,
+					      &blockptr_offset,
+					      isize);
+	}
+	/* only zero remaining unread portions of the page data */
+	if (bytes_read > 0)
+		memset(page_data + bytes_read, 0, blocksize - bytes_read);
+	else
+		memset(page_data, 0, blocksize);
+	/* takes care of potential aliasing */
+	flush_dcache_page(page);
+	if (bytes_read < 0) {
+		ret = bytes_read;
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		ret = 0;
+	}
+	pvfs2_kunmap(page);
+	/* unlock the page after the ->readpage() routine completes */
+	unlock_page(page);
+	return ret;
+}
+
+/*
+ * ->readpage: read a single page.  @file is not used; the work is done
+ * by read_one_page(), whose result is returned unchanged.
+ */
+static int pvfs2_readpage(struct file *file, struct page *page)
+{
+	int rc;
+
+	rc = read_one_page(page);
+	return rc;
+}
+
+/*
+ * ->readpages: read a batch of pages.
+ *
+ * Pages are taken from the tail of @pages one at a time.  Each page
+ * that can be inserted into the page cache is filled with
+ * read_one_page(); a page that cannot be inserted is released.  The
+ * list must be empty afterwards.  Individual read errors are only
+ * logged; the function always returns 0.
+ *
+ * The debug message in the read path used to claim a failure to add
+ * the page to the cache, although that branch is taken when
+ * add_to_page_cache() returns 0; the wording now matches the branch.
+ *
+ * NOTE(review): after a successful add_to_page_cache() the reference
+ * that came with the list entry does not appear to be dropped here.
+ * Compare with other ->readpages implementations and confirm whether a
+ * page_cache_release() is missing.
+ */
+static int pvfs2_readpages(struct file *file,
+			   struct address_space *mapping,
+			   struct list_head *pages,
+			   unsigned nr_pages)
+{
+	unsigned page_idx;
+	int ret;
+
+	gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_readpages called\n");
+
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page;
+
+		page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		if (!add_to_page_cache(page,
+				       mapping,
+				       page->index,
+				       GFP_KERNEL)) {
+			ret = read_one_page(page);
+			gossip_debug(GOSSIP_INODE_DEBUG,
+				"added page to cache, read_one_page returned: %d\n",
+				ret);
+		} else {
+			page_cache_release(page);
+		}
+	}
+	BUG_ON(!list_empty(pages));
+	return 0;
+}
+
+/*
+ * ->invalidatepage: clear the up-to-date and mapped-to-disk flags of
+ * the page.  @offset is only logged and @length is unused.
+ */
+static void pvfs2_invalidatepage(struct page *page,
+				 unsigned int offset,
+				 unsigned int length)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_invalidatepage called on page %p (offset is %u)\n",
+		     page, offset);
+
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+}
+
+/*
+ * ->releasepage: only logs the call and always returns 0; the gfp
+ * argument is ignored.
+ */
+static int pvfs2_releasepage(struct page *page, gfp_t foo)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_releasepage called on page %p\n", page);
+
+	return 0;
+}
+
+/*
+ * Having a direct_IO entry point in the address_space_operations
+ * struct causes the kernel to allow us to use O_DIRECT on
+ * open. Nothing will ever call this thing, but in the future we
+ * will need to be able to use O_DIRECT on open in order to support
+ * AIO. Modeled after NFS, which does this too.
+ */
+/*
+static ssize_t pvfs2_direct_IO(int rw,
+			struct kiocb *iocb,
+			struct iov_iter *iter,
+			loff_t offset)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_direct_IO: %s\n",
+		     iocb->ki_filp->f_path.dentry->d_name.name);
+
+	return -EINVAL;
+}
+*/
+
+/*
+ * Backing device description for pvfs2: read-ahead is disabled
+ * (ra_pages = 0) and the capability flags opt out of dirty page
+ * accounting and writeback.
+ */
+struct backing_dev_info pvfs2_backing_dev_info = {
+	.name = "pvfs2",
+	.ra_pages = 0,
+	.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+/**
+ * PVFS2 implementation of address space operations.
+ *
+ * Only the read side and page invalidation/release are provided; the
+ * direct_IO entry is currently commented out.
+ */
+const struct address_space_operations pvfs2_address_operations = {
+	.readpage = pvfs2_readpage,
+	.readpages = pvfs2_readpages,
+	.invalidatepage = pvfs2_invalidatepage,
+	.releasepage = pvfs2_releasepage,
+/*	.direct_IO = pvfs2_direct_IO */
+};
+
+/*
+ * Apply a size change to an inode.
+ *
+ * The in-kernel size is updated with truncate_setsize() first, then a
+ * truncate operation carrying the new size is sent.  If the size
+ * really changed and the caller did not ask for explicit c/mtime
+ * updates, both timestamps are added to @iattr.
+ *
+ * Returns 0 on success, -ENOMEM if no operation could be allocated, or
+ * the result of service_operation().
+ */
+static int pvfs2_setattr_size(struct inode *inode, struct iattr *iattr)
+{
+	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
+	struct pvfs2_kernel_op_s *trunc_op;
+	const loff_t size_before = i_size_read(inode);
+	int rc;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
+		     __func__,
+		     get_khandle_from_ino(inode),
+		     &pvfs2_inode->refn.khandle,
+		     pvfs2_inode->refn.fs_id,
+		     iattr->ia_size);
+
+	truncate_setsize(inode, iattr->ia_size);
+
+	trunc_op = op_alloc(PVFS2_VFS_OP_TRUNCATE);
+	if (!trunc_op)
+		return -ENOMEM;
+
+	trunc_op->upcall.req.truncate.refn = pvfs2_inode->refn;
+	trunc_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
+
+	/*
+	 * A truncate has no downcall payload; the return value alone
+	 * says whether it went through.
+	 */
+	rc = service_operation(trunc_op, __func__,
+			       get_interruptible_flag(inode));
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2: pvfs2_truncate got return value of %d\n",
+		     rc);
+
+	op_release(trunc_op);
+
+	if (rc)
+		return rc;
+
+	/*
+	 * truncate() without ATTR_CTIME/ATTR_MTIME still has to update
+	 * the times when the size changed; ftruncate() and friends set
+	 * those flags themselves when they want a timestamp update.
+	 */
+	if (size_before != i_size_read(inode) &&
+	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
+		iattr->ia_ctime = iattr->ia_mtime =
+			current_fs_time(inode->i_sb);
+		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+	}
+
+	return 0;
+}
+
+/*
+ * ->setattr: change attributes of the object referenced by @dentry.
+ *
+ * After inode_change_ok() approves the request, a size change (if any)
+ * is handled by pvfs2_setattr_size(), the attributes are copied into
+ * the inode and sent with pvfs2_inode_setattr(), and a mode change is
+ * followed up with posix_acl_chmod().
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int pvfs2_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_setattr: called on %s\n",
+		     dentry->d_name.name);
+
+	rc = inode_change_ok(inode, iattr);
+	if (rc != 0)
+		goto done;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		rc = pvfs2_setattr_size(inode, iattr);
+		if (rc != 0)
+			goto done;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	rc = pvfs2_inode_setattr(inode, iattr);
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_setattr: inode_setattr returned %d\n",
+		     rc);
+
+	/* a chmod on a file with ACLs must be reflected in the ACL too */
+	if (rc == 0 && (iattr->ia_valid & ATTR_MODE))
+		rc = posix_acl_chmod(inode, inode->i_mode);
+
+done:
+	gossip_debug(GOSSIP_INODE_DEBUG, "pvfs2_setattr: returning %d\n", rc);
+	return rc;
+}
+
+/*
+ * ->getattr: obtain the attributes of the object behind @dentry.
+ *
+ * All attributes are refreshed with pvfs2_inode_getattr() before
+ * @kstat is filled in; the reported block size is taken from the pvfs2
+ * inode.  If the refresh fails the inode is marked bad.
+ *
+ * Returns 0 on success or the error from pvfs2_inode_getattr().
+ */
+int pvfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *kstat)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_getattr: called on %s\n",
+		     dentry->d_name.name);
+
+	/*
+	 * A getattr is expected to see every field of the inode
+	 * refreshed, so all attributes are fetched.
+	 */
+	rc = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (rc != 0) {
+		/* assume an I/O error and flag inode as bad */
+		gossip_debug(GOSSIP_INODE_DEBUG,
+			     "%s:%s:%d calling make bad inode\n",
+			     __FILE__,
+			     __func__,
+			     __LINE__);
+		pvfs2_make_bad_inode(inode);
+		return rc;
+	}
+
+	generic_fillattr(inode, kstat);
+	/* override block size reported to stat */
+	kstat->blksize = PVFS2_I(inode)->blksize;
+	return rc;
+}
+
+/*
+ * PVFS2 implementation of VFS inode operations for files.
+ *
+ * ACL and attribute handling is pvfs2-specific; set/get/removexattr go
+ * through the generic xattr helpers.
+ */
+struct inode_operations pvfs2_file_inode_operations = {
+	.get_acl = pvfs2_get_acl,
+	.set_acl = pvfs2_set_acl,
+	.setattr = pvfs2_setattr,
+	.getattr = pvfs2_getattr,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.listxattr = pvfs2_listxattr,
+	.removexattr = generic_removexattr,
+};
+
+/*
+ * Install the operation tables matching the inode's file type.
+ *
+ * The address space operations are set for every inode.  Regular
+ * files, symlinks and directories then get their inode (and, where
+ * applicable, file) operations.
+ *
+ * Returns 0 on success or -EINVAL for any other file type.
+ */
+static int pvfs2_init_iops(struct inode *inode)
+{
+	const umode_t file_type = inode->i_mode & S_IFMT;
+
+	inode->i_mapping->a_ops = &pvfs2_address_operations;
+
+	if (file_type == S_IFREG) {
+		inode->i_op = &pvfs2_file_inode_operations;
+		inode->i_fop = &pvfs2_file_operations;
+		inode->i_blkbits = PAGE_CACHE_SHIFT;
+		return 0;
+	}
+
+	if (file_type == S_IFLNK) {
+		inode->i_op = &pvfs2_symlink_inode_operations;
+		return 0;
+	}
+
+	if (file_type == S_IFDIR) {
+		inode->i_op = &pvfs2_dir_inode_operations;
+		inode->i_fop = &pvfs2_dir_operations;
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "%s: unsupported mode\n",
+		     __func__);
+	return -EINVAL;
+}
+
+/*
+ * Turn a PVFS2 object reference into the ino_t used as the hash value
+ * when looking the handle up in the VFS inode hash table.  Only the
+ * khandle contributes to the result; a NULL reference hashes to 0.
+ */
+static inline ino_t pvfs2_handle_hash(struct pvfs2_object_kref *ref)
+{
+	return ref ? pvfs2_khandle_to_ino(&(ref->khandle)) : 0;
+}
+
+/*
+ * "set" callback for iget5_locked(): copy the fs_id and khandle from
+ * the object reference in @data into the pvfs2 part of @inode.
+ *
+ * Always returns 0; missing arguments are silently ignored.
+ */
+static int pvfs2_set_inode(struct inode *inode, void *data)
+{
+	struct pvfs2_object_kref *ref = (struct pvfs2_object_kref *) data;
+	struct pvfs2_inode_s *pvfs2_inode;
+
+	if (data && inode) {
+		pvfs2_inode = PVFS2_I(inode);
+		if (pvfs2_inode) {
+			pvfs2_inode->refn.fs_id = ref->fs_id;
+			pvfs2_inode->refn.khandle = ref->khandle;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * "test" callback for iget5_locked(): returns non-zero when @inode
+ * refers to the same object as the reference in @data, i.e. when both
+ * the khandle and the fs_id are equal.
+ */
+static int pvfs2_test_inode(struct inode *inode, void *data)
+{
+	struct pvfs2_object_kref *wanted = (struct pvfs2_object_kref *) data;
+	struct pvfs2_inode_s *candidate = PVFS2_I(inode);
+
+	/* a non-zero compare result means the handles differ */
+	if (PVFS_khandle_cmp(&(candidate->refn.khandle), &(wanted->khandle)))
+		return 0;
+
+	return candidate->refn.fs_id == wanted->fs_id;
+}
+
+/*
+ * Front-end to the VFS inode cache, keyed by PVFS2 file handle.
+ *
+ * @sb:  the file system super block instance.
+ * @ref: the PVFS2 object for which an inode is wanted.
+ *
+ * An inode already present in the cache is returned as is.  A new one
+ * has its attributes fetched, gets its operation tables installed and
+ * is unlocked before being returned.
+ *
+ * Returns the inode, NULL if iget5_locked() returned none, or an
+ * ERR_PTR() if fetching the attributes of a new inode failed.
+ */
+struct inode *pvfs2_iget(struct super_block *sb, struct pvfs2_object_kref *ref)
+{
+	unsigned long hash = pvfs2_handle_hash(ref);
+	struct inode *inode;
+	int rc;
+
+	inode = iget5_locked(sb, hash, pvfs2_test_inode, pvfs2_set_inode, ref);
+	if (!inode)
+		return inode;
+	/* found in the cache: nothing left to initialise */
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	rc = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (rc) {
+		iget_failed(inode);
+		return ERR_PTR(rc);
+	}
+
+	inode->i_ino = hash;	/* needed for stat etc */
+	pvfs2_init_iops(inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
+		     &ref->khandle,
+		     ref->fs_id,
+		     hash,
+		     inode->i_ino);
+
+	return inode;
+}
+
+/*
+ * Allocate an inode for a newly created object and insert it into the
+ * inode hash.
+ *
+ * @sb:   super block the inode belongs to.
+ * @dir:  parent directory, used for ACL initialisation.
+ * @mode: mode to give the new inode.
+ * @dev:  device number stored in i_rdev.
+ * @ref:  PVFS2 object reference of the new object.
+ *
+ * Returns the new inode, NULL if new_inode() fails, or an ERR_PTR()
+ * if fetching attributes or inserting into the hash fails.  The result
+ * of pvfs2_init_acl() is not checked.
+ */
+struct inode *pvfs2_new_inode(struct super_block *sb, struct inode *dir,
+		int mode, dev_t dev, struct pvfs2_object_kref *ref)
+{
+	unsigned long hash = pvfs2_handle_hash(ref);
+	struct inode *inode;
+	int rc;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "pvfs2_get_custom_inode_common: called\n"
+		     "(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
+		     sb,
+		     MAJOR(dev),
+		     MINOR(dev),
+		     mode);
+
+	inode = new_inode(sb);
+	if (inode == NULL)
+		return NULL;
+
+	pvfs2_set_inode(inode, ref);
+	inode->i_ino = hash;	/* needed for stat etc */
+
+	rc = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_ALL_NOHINT);
+	if (rc) {
+		iput(inode);
+		return ERR_PTR(rc);
+	}
+
+	pvfs2_init_iops(inode);
+
+	/* identity and timestamps of a freshly created object */
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_size = PAGE_CACHE_SIZE;
+	inode->i_rdev = dev;
+
+	rc = insert_inode_locked4(inode, hash, pvfs2_test_inode, ref);
+	if (rc < 0) {
+		iput(inode);
+		return ERR_PTR(rc);
+	}
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "Initializing ACL's for inode %pU\n",
+		     get_khandle_from_ino(inode));
+	pvfs2_init_acl(inode, dir);
+	return inode;
+}