Commit b3ae4755 authored by Mike Marshall

Orangefs: implement .write_iter

Until now, orangefs_devreq_write_iter has just been a wrapper for
the old-fashioned orangefs_devreq_writev... Linux would call
.write_iter with "struct kiocb *iocb" and "struct iov_iter *iter",
and .write_iter would just:

        return pvfs2_devreq_writev(iocb->ki_filp,
                                   iter->iov,
                                   iter->nr_segs,
                                   &iocb->ki_pos);
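
Now .write_iter consumes the iov_iter directly: it peels a small
head (version, magic, tag) off the stream with copy_from_iter(),
then the downcall, then any READDIR trailer. As a hypothetical
sketch (not part of this patch: the helper name, fd argument and
stdint types are illustrative), the userspace side of such a write
could look like:

        #include <stdint.h>
        #include <sys/types.h>
        #include <sys/uio.h>

        struct head {
                uint32_t version;  /* >= ORANGEFS_MINIMUM_USERSPACE_VERSION */
                uint32_t magic;    /* ORANGEFS_DEVREQ_MAGIC */
                uint64_t tag;      /* tag of the upcall being answered */
        };

        /* Answer an upcall: head + downcall (+ trailer for READDIR). */
        static ssize_t answer_upcall(int dev_fd, struct head *h,
                                     void *downcall, size_t downcall_size,
                                     void *trailer, size_t trailer_size)
        {
                struct iovec vec[3] = {
                        { .iov_base = h,        .iov_len = sizeof(*h) },
                        { .iov_base = downcall, .iov_len = downcall_size },
                        { .iov_base = trailer,  .iov_len = trailer_size },
                };

                /* .write_iter sees one stream, however the iovecs are cut */
                return writev(dev_fd, vec, trailer_size ? 3 : 2);
        }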
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
Parent 85096169
@@ -245,304 +245,240 @@ static ssize_t orangefs_devreq_read(struct file *file,
 }
 
 /*
- * Function for writev() callers into the device.  Readdir related
- * operations have an extra iovec containing info about objects
- * contained in directories.
+ * Function for writev() callers into the device.
+ *
+ * Userspace should have written:
+ *  - __u32 version
+ *  - __u32 magic
+ *  - __u64 tag
+ *  - struct orangefs_downcall_s
+ *  - trailer buffer (in the case of READDIR operations)
  */
-static ssize_t orangefs_devreq_writev(struct file *file,
-                                      const struct iovec *iov,
-                                      size_t count,
-                                      loff_t *offset)
-{
-        struct orangefs_kernel_op_s *op = NULL;
-        void *buffer = NULL;
-        void *ptr = NULL;
-        unsigned long i = 0;
-        int num_remaining = MAX_DEV_REQ_DOWNSIZE;
-        int ret = 0;
-        /* num elements in iovec without trailer */
-        int notrailer_count = 4;
-        /*
-         * If there's a trailer, its iov index will be equal to
-         * notrailer_count.
-         */
-        int trailer_index = notrailer_count;
-        int payload_size = 0;
-        int returned_downcall_size = 0;
-        __s32 magic = 0;
-        __s32 proto_ver = 0;
-        __u64 tag = 0;
-        ssize_t total_returned_size = 0;
-
-        /*
-         * There will always be at least notrailer_count iovecs, and
-         * when there's a trailer, one more than notrailer_count. Check
-         * count's sanity.
-         */
-        if (count != notrailer_count && count != (notrailer_count + 1)) {
-                gossip_err("%s: count:%zu: notrailer_count :%d:\n",
-                           __func__,
-                           count,
-                           notrailer_count);
-                return -EPROTO;
-        }
-
-        /* Copy the non-trailer iovec data into a device request buffer. */
-        buffer = dev_req_alloc();
-        if (!buffer) {
-                gossip_err("%s: dev_req_alloc failed.\n", __func__);
-                return -ENOMEM;
-        }
-        ptr = buffer;
-        for (i = 0; i < notrailer_count; i++) {
-                if (iov[i].iov_len > num_remaining) {
-                        gossip_err
-                            ("writev error: Freeing buffer and returning\n");
-                        dev_req_release(buffer);
-                        return -EMSGSIZE;
-                }
-                ret = copy_from_user(ptr, iov[i].iov_base, iov[i].iov_len);
-                if (ret) {
-                        gossip_err("Failed to copy data from user space\n");
-                        dev_req_release(buffer);
-                        return -EIO;
-                }
-                num_remaining -= iov[i].iov_len;
-                ptr += iov[i].iov_len;
-                payload_size += iov[i].iov_len;
-        }
-        total_returned_size = payload_size;
-
-        /* these elements are currently 8 byte aligned (8 bytes for (version +
-         * magic) 8 bytes for tag). If you add another element, either
-         * make it 8 bytes big, or use get_unaligned when asigning.
-         */
-        ptr = buffer;
-        proto_ver = *((__s32 *) ptr);   /* unused */
-        ptr += sizeof(__s32);
-
-        magic = *((__s32 *) ptr);
-        ptr += sizeof(__s32);
-
-        tag = *((__u64 *) ptr);
-        ptr += sizeof(__u64);
-
-        if (magic != ORANGEFS_DEVREQ_MAGIC) {
-                gossip_err("Error: Device magic number does not match.\n");
-                dev_req_release(buffer);
-                return -EPROTO;
-        }
-
-        op = orangefs_devreq_remove_op(tag);
-        if (op) {
-                /* Increase ref count! */
-                get_op(op);
-
-                /* calculate the size of the returned downcall. */
-                returned_downcall_size =
-                        payload_size - (2 * sizeof(__s32) + sizeof(__u64));
-
-                /* copy the passed in downcall into the op */
-                if (returned_downcall_size ==
-                        sizeof(struct orangefs_downcall_s)) {
-                        memcpy(&op->downcall,
-                               ptr,
-                               sizeof(struct orangefs_downcall_s));
-                } else {
-                        gossip_err("%s: returned downcall size:%d: \n",
-                                   __func__,
-                                   returned_downcall_size);
-                        dev_req_release(buffer);
-                        put_op(op);
-                        return -EMSGSIZE;
-                }
-
-                /* Don't tolerate an unexpected trailer iovec. */
-                if ((op->downcall.trailer_size == 0) &&
-                    (count != notrailer_count)) {
-                        gossip_err("%s: unexpected trailer iovec.\n",
-                                   __func__);
-                        dev_req_release(buffer);
-                        put_op(op);
-                        return -EPROTO;
-                }
-
-                /* Don't consider the trailer if there's a bad status. */
-                if (op->downcall.status != 0)
-                        goto no_trailer;
-
-                /* get the trailer if there is one. */
-                if (op->downcall.trailer_size == 0)
-                        goto no_trailer;
-
-                gossip_debug(GOSSIP_DEV_DEBUG,
-                             "%s: op->downcall.trailer_size %lld\n",
-                             __func__,
-                             op->downcall.trailer_size);
-
-                /*
-                 * Bail if we think think there should be a trailer, but
-                 * there's no iovec for it.
-                 */
-                if (count != (notrailer_count + 1)) {
-                        gossip_err("%s: trailer_size:%lld: count:%zu:\n",
-                                   __func__,
-                                   op->downcall.trailer_size,
-                                   count);
-                        dev_req_release(buffer);
-                        put_op(op);
-                        return -EPROTO;
-                }
-
-                /* Verify that trailer_size is accurate. */
-                if (op->downcall.trailer_size != iov[trailer_index].iov_len) {
-                        gossip_err("%s: trailer_size:%lld: != iov_len:%zd:\n",
-                                   __func__,
-                                   op->downcall.trailer_size,
-                                   iov[trailer_index].iov_len);
-                        dev_req_release(buffer);
-                        put_op(op);
-                        return -EMSGSIZE;
-                }
-
-                total_returned_size += iov[trailer_index].iov_len;
-
-                /*
-                 * Allocate a buffer, copy the trailer bytes into it and
-                 * attach it to the downcall.
-                 */
-                op->downcall.trailer_buf = vmalloc(iov[trailer_index].iov_len);
-                if (op->downcall.trailer_buf != NULL) {
-                        gossip_debug(GOSSIP_DEV_DEBUG, "vmalloc: %p\n",
-                                     op->downcall.trailer_buf);
-                        ret = copy_from_user(op->downcall.trailer_buf,
-                                             iov[trailer_index].iov_base,
-                                             iov[trailer_index].iov_len);
-                        if (ret) {
-                                gossip_err("%s: Failed to copy trailer.\n",
-                                           __func__);
-                                dev_req_release(buffer);
-                                gossip_debug(GOSSIP_DEV_DEBUG,
-                                             "vfree: %p\n",
-                                             op->downcall.trailer_buf);
-                                vfree(op->downcall.trailer_buf);
-                                op->downcall.trailer_buf = NULL;
-                                put_op(op);
-                                return -EIO;
-                        }
-                } else {
-                        gossip_err("writev: could not vmalloc for trailer!\n");
-                        dev_req_release(buffer);
-                        put_op(op);
-                        return -ENOMEM;
-                }
-
-no_trailer:
-                /* if this operation is an I/O operation we need to wait
-                 * for all data to be copied before we can return to avoid
-                 * buffer corruption and races that can pull the buffers
-                 * out from under us.
-                 *
-                 * Essentially we're synchronizing with other parts of the
-                 * vfs implicitly by not allowing the user space
-                 * application reading/writing this device to return until
-                 * the buffers are done being used.
-                 */
-                if (op->upcall.type == ORANGEFS_VFS_OP_FILE_IO) {
-                        int timed_out = 0;
-                        DEFINE_WAIT(wait_entry);
-
-                        /*
-                         * tell the vfs op waiting on a waitqueue
-                         * that this op is done
-                         */
-                        spin_lock(&op->lock);
-                        set_op_state_serviced(op);
-                        spin_unlock(&op->lock);
-
-                        wake_up_interruptible(&op->waitq);
-
-                        while (1) {
-                                spin_lock(&op->lock);
-                                prepare_to_wait_exclusive(
-                                        &op->io_completion_waitq,
-                                        &wait_entry,
-                                        TASK_INTERRUPTIBLE);
-                                if (op->io_completed) {
-                                        spin_unlock(&op->lock);
-                                        break;
-                                }
-                                spin_unlock(&op->lock);
-                                if (!signal_pending(current)) {
-                                        int timeout =
-                                            MSECS_TO_JIFFIES(1000 *
-                                                             op_timeout_secs);
-                                        if (!schedule_timeout(timeout)) {
-                                                gossip_debug(GOSSIP_DEV_DEBUG,
-                                                        "%s: timed out.\n",
-                                                        __func__);
-                                                timed_out = 1;
-                                                break;
-                                        }
-                                        continue;
-                                }
-                                gossip_debug(GOSSIP_DEV_DEBUG,
-                                        "%s: signal on I/O wait, aborting\n",
-                                        __func__);
-                                break;
-                        }
-
-                        spin_lock(&op->lock);
-                        finish_wait(&op->io_completion_waitq, &wait_entry);
-                        spin_unlock(&op->lock);
-
-                        /* NOTE: for I/O operations we handle releasing the op
-                         * object except in the case of timeout. the reason we
-                         * can't free the op in timeout cases is that the op
-                         * service logic in the vfs retries operations using
-                         * the same op ptr, thus it can't be freed.
-                         */
-                        if (!timed_out)
-                                op_release(op);
-                } else {
-                        /*
-                         * tell the vfs op waiting on a waitqueue that
-                         * this op is done
-                         */
-                        spin_lock(&op->lock);
-                        set_op_state_serviced(op);
-                        spin_unlock(&op->lock);
-                        /*
-                         * for every other operation (i.e. non-I/O), we need to
-                         * wake up the callers for downcall completion
-                         * notification
-                         */
-                        wake_up_interruptible(&op->waitq);
-                }
-        } else {
-                /* ignore downcalls that we're not interested in */
-                gossip_debug(GOSSIP_DEV_DEBUG,
-                             "WARNING: No one's waiting for tag %llu\n",
-                             llu(tag));
-        }
-        /* put_op? */
-        dev_req_release(buffer);
-
-        return total_returned_size;
-}
-
-static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
-                                      struct iov_iter *iter)
-{
-        return orangefs_devreq_writev(iocb->ki_filp,
-                                      iter->iov,
-                                      iter->nr_segs,
-                                      &iocb->ki_pos);
-}
+static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
+                                          struct iov_iter *iter)
+{
+        ssize_t ret;
+        struct orangefs_kernel_op_s *op = NULL;
+        struct {
+                __u32 version;
+                __u32 magic;
+                __u64 tag;
+        } head;
+        int total = ret = iov_iter_count(iter);
+        int n;
+        int downcall_size = sizeof(struct orangefs_downcall_s);
+        int head_size = sizeof(head);
+
+        gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n",
+                     __func__,
+                     total,
+                     ret);
+
+        if (total < MAX_DEV_REQ_DOWNSIZE) {
+                gossip_err("%s: total:%d: must be at least:%lu:\n",
+                           __func__,
+                           total,
+                           MAX_DEV_REQ_DOWNSIZE);
+                ret = -EFAULT;
+                goto out;
+        }
+        n = copy_from_iter(&head, head_size, iter);
+        if (n < head_size) {
+                gossip_err("%s: failed to copy head.\n", __func__);
+                ret = -EFAULT;
+                goto out;
+        }
+
+        if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) {
+                gossip_err("%s: userspace claims version"
+                           "%d, minimum version required: %d.\n",
+                           __func__,
+                           head.version,
+                           ORANGEFS_MINIMUM_USERSPACE_VERSION);
+                ret = -EPROTO;
+                goto out;
+        }
+
+        if (head.magic != ORANGEFS_DEVREQ_MAGIC) {
+                gossip_err("Error: Device magic number does not match.\n");
+                ret = -EPROTO;
+                goto out;
+        }
+
+        op = orangefs_devreq_remove_op(head.tag);
+        if (!op) {
+                gossip_err("WARNING: No one's waiting for tag %llu\n",
+                           llu(head.tag));
+                goto out;
+        }
+
+        get_op(op); /* increase ref count. */
+
+        n = copy_from_iter(&op->downcall, downcall_size, iter);
+        if (n != downcall_size) {
+                gossip_err("%s: failed to copy downcall.\n", __func__);
+                put_op(op);
+                ret = -EFAULT;
+                goto out;
+        }
+
+        if (op->downcall.status)
+                goto wakeup;
+
+        /*
+         * We've successfully peeled off the head and the downcall.
+         * Something has gone awry if total doesn't equal the
+         * sum of head_size, downcall_size and trailer_size.
+         */
+        if ((head_size + downcall_size + op->downcall.trailer_size) != total) {
+                gossip_err("%s: funky write, head_size:%d"
+                           ": downcall_size:%d: trailer_size:%lld"
+                           ": total size:%d:\n",
+                           __func__,
+                           head_size,
+                           downcall_size,
+                           op->downcall.trailer_size,
+                           total);
+                put_op(op);
+                ret = -EFAULT;
+                goto out;
+        }
+
+        /* Only READDIR operations should have trailers. */
+        if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) &&
+            (op->downcall.trailer_size != 0)) {
+                gossip_err("%s: %x operation with trailer.",
+                           __func__,
+                           op->downcall.type);
+                put_op(op);
+                ret = -EFAULT;
+                goto out;
+        }
+
+        /* READDIR operations should always have trailers. */
+        if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) &&
+            (op->downcall.trailer_size == 0)) {
+                gossip_err("%s: %x operation with no trailer.",
+                           __func__,
+                           op->downcall.type);
+                put_op(op);
+                ret = -EFAULT;
+                goto out;
+        }
+
+        if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
+                goto wakeup;
+
+        op->downcall.trailer_buf =
+                vmalloc(op->downcall.trailer_size);
+        if (op->downcall.trailer_buf == NULL) {
+                gossip_err("%s: failed trailer vmalloc.\n",
+                           __func__);
+                put_op(op);
+                ret = -ENOMEM;
+                goto out;
+        }
+        memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
+        n = copy_from_iter(op->downcall.trailer_buf,
+                           op->downcall.trailer_size,
+                           iter);
+        if (n != op->downcall.trailer_size) {
+                gossip_err("%s: failed to copy trailer.\n", __func__);
+                vfree(op->downcall.trailer_buf);
+                put_op(op);
+                ret = -EFAULT;
+                goto out;
+        }
+
+wakeup:
+
+        /*
+         * If this operation is an I/O operation we need to wait
+         * for all data to be copied before we can return to avoid
+         * buffer corruption and races that can pull the buffers
+         * out from under us.
+         *
+         * Essentially we're synchronizing with other parts of the
+         * vfs implicitly by not allowing the user space
+         * application reading/writing this device to return until
+         * the buffers are done being used.
+         */
+        if (op->downcall.type == ORANGEFS_VFS_OP_FILE_IO) {
+                int timed_out = 0;
+                DEFINE_WAIT(wait_entry);
+
+                /*
+                 * tell the vfs op waiting on a waitqueue
+                 * that this op is done
+                 */
+                spin_lock(&op->lock);
+                set_op_state_serviced(op);
+                spin_unlock(&op->lock);
+
+                wake_up_interruptible(&op->waitq);
+
+                while (1) {
+                        spin_lock(&op->lock);
+                        prepare_to_wait_exclusive(
+                                &op->io_completion_waitq,
+                                &wait_entry,
+                                TASK_INTERRUPTIBLE);
+                        if (op->io_completed) {
+                                spin_unlock(&op->lock);
+                                break;
+                        }
+                        spin_unlock(&op->lock);
+                        if (!signal_pending(current)) {
+                                int timeout =
+                                    MSECS_TO_JIFFIES(1000 *
+                                                     op_timeout_secs);
+                                if (!schedule_timeout(timeout)) {
+                                        gossip_debug(GOSSIP_DEV_DEBUG,
+                                                "%s: timed out.\n",
+                                                __func__);
+                                        timed_out = 1;
+                                        break;
+                                }
+                                continue;
+                        }
+                        gossip_debug(GOSSIP_DEV_DEBUG,
+                                "%s: signal on I/O wait, aborting\n",
+                                __func__);
+                        break;
+                }
+
+                spin_lock(&op->lock);
+                finish_wait(&op->io_completion_waitq, &wait_entry);
+                spin_unlock(&op->lock);
+
+                /* NOTE: for I/O operations we handle releasing the op
+                 * object except in the case of timeout. the reason we
+                 * can't free the op in timeout cases is that the op
+                 * service logic in the vfs retries operations using
+                 * the same op ptr, thus it can't be freed.
+                 */
+                if (!timed_out)
+                        op_release(op);
+        } else {
+                /*
+                 * tell the vfs op waiting on a waitqueue that
+                 * this op is done
+                 */
+                spin_lock(&op->lock);
+                set_op_state_serviced(op);
+                spin_unlock(&op->lock);
+                /*
+                 * for every other operation (i.e. non-I/O), we need to
+                 * wake up the callers for downcall completion
+                 * notification
+                 */
+                wake_up_interruptible(&op->waitq);
+        }
+out:
+        return ret;
+}
 
 /* Returns whether any FS are still pending remounted */
...
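
The rewrite leans on copy_from_iter() advancing the iterator as it
copies, so each field is consumed in order with no manual offset
arithmetic. A minimal sketch of that pattern (a standalone
illustration assuming <linux/uio.h>; "peel" is an invented name,
not a helper from this commit):

        /*
         * Copy len bytes out of the iterator; copy_from_iter()
         * returns the number of bytes copied and advances the
         * iterator past them, so a short copy means the write
         * was smaller than advertised.
         */
        static int peel(struct iov_iter *iter, void *dst, size_t len)
        {
                if (copy_from_iter(dst, len, iter) != len)
                        return -EFAULT;
                return 0;
        }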