提交 ca4ba96e 编写于 作者: L Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "There are several patches from Ilya fixing RBD allocation lifecycle
  issues, a series adding a nocephx_sign_messages option (and associated
  bug fixes/cleanups), several patches from Zheng improving the
  (directory) fsync behavior, a big improvement in IO for direct-io
  requests when striping is enabled from Caifeng, and several other
  small fixes and cleanups"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: clear msg->con in ceph_msg_release() only
  libceph: add nocephx_sign_messages option
  libceph: stop duplicating client fields in messenger
  libceph: drop authorizer check from cephx msg signing routines
  libceph: msg signing callouts don't need con argument
  libceph: evaluate osd_req_op_data() arguments only once
  ceph: make fsync() wait unsafe requests that created/modified inode
  ceph: add request to i_unsafe_dirops when getting unsafe reply
  libceph: introduce ceph_x_authorizer_cleanup()
  ceph: don't invalidate page cache when inode is no longer used
  rbd: remove duplicate calls to rbd_dev_mapping_clear()
  rbd: set device_type::release instead of device::release
  rbd: don't free rbd_dev outside of the release callback
  rbd: return -ENOMEM instead of pool id if rbd_dev_create() fails
  libceph: use local variable cursor instead of &msg->cursor
  libceph: remove con argument in handle_reply()
  ceph: combine as many iovec as possile into one OSD request
  ceph: fix message length computation
  ceph: fix a comment typo
  rbd: drop null test before destroy functions
......@@ -418,8 +418,6 @@ MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (d
static int rbd_img_request_submit(struct rbd_img_request *img_request);
static void rbd_dev_device_release(struct device *dev);
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
......@@ -3991,14 +3989,12 @@ static const struct attribute_group *rbd_attr_groups[] = {
NULL
};
static void rbd_sysfs_dev_release(struct device *dev)
{
}
static void rbd_dev_release(struct device *dev);
static struct device_type rbd_device_type = {
.name = "rbd",
.groups = rbd_attr_groups,
.release = rbd_sysfs_dev_release,
.release = rbd_dev_release,
};
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
......@@ -4041,6 +4037,25 @@ static void rbd_spec_free(struct kref *kref)
kfree(spec);
}
static void rbd_dev_release(struct device *dev)
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
bool need_put = !!rbd_dev->opts;
rbd_put_client(rbd_dev->rbd_client);
rbd_spec_put(rbd_dev->spec);
kfree(rbd_dev->opts);
kfree(rbd_dev);
/*
* This is racy, but way better than putting module outside of
* the release callback. The race window is pretty small, so
* doing something similar to dm (dm-builtin.c) is overkill.
*/
if (need_put)
module_put(THIS_MODULE);
}
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
struct rbd_spec *spec,
struct rbd_options *opts)
......@@ -4057,6 +4072,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
INIT_LIST_HEAD(&rbd_dev->node);
init_rwsem(&rbd_dev->header_rwsem);
rbd_dev->dev.bus = &rbd_bus_type;
rbd_dev->dev.type = &rbd_device_type;
rbd_dev->dev.parent = &rbd_root_dev;
device_initialize(&rbd_dev->dev);
rbd_dev->rbd_client = rbdc;
rbd_dev->spec = spec;
rbd_dev->opts = opts;
......@@ -4068,15 +4088,21 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
/*
* If this is a mapping rbd_dev (as opposed to a parent one),
* pin our module. We have a ref from do_rbd_add(), so use
* __module_get().
*/
if (rbd_dev->opts)
__module_get(THIS_MODULE);
return rbd_dev;
}
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
rbd_put_client(rbd_dev->rbd_client);
rbd_spec_put(rbd_dev->spec);
kfree(rbd_dev->opts);
kfree(rbd_dev);
if (rbd_dev)
put_device(&rbd_dev->dev);
}
/*
......@@ -4702,27 +4728,6 @@ static int rbd_dev_header_info(struct rbd_device *rbd_dev)
return rbd_dev_v2_header_info(rbd_dev);
}
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
struct device *dev;
int ret;
dev = &rbd_dev->dev;
dev->bus = &rbd_bus_type;
dev->type = &rbd_device_type;
dev->parent = &rbd_root_dev;
dev->release = rbd_dev_device_release;
dev_set_name(dev, "%d", rbd_dev->dev_id);
ret = device_register(dev);
return ret;
}
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
device_unregister(&rbd_dev->dev);
}
/*
* Get a unique rbd identifier for the given new rbd_dev, and add
* the rbd_dev to the global list.
......@@ -5225,7 +5230,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
ret = rbd_bus_add_dev(rbd_dev);
dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
ret = device_add(&rbd_dev->dev);
if (ret)
goto err_out_mapping;
......@@ -5248,8 +5254,6 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
rbd_dev_id_put(rbd_dev);
rbd_dev_mapping_clear(rbd_dev);
return ret;
}
......@@ -5397,7 +5401,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
struct rbd_spec *spec = NULL;
struct rbd_client *rbdc;
bool read_only;
int rc = -ENOMEM;
int rc;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
......@@ -5405,7 +5409,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
/* parse add command */
rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
if (rc < 0)
goto err_out_module;
goto out;
rbdc = rbd_get_client(ceph_opts);
if (IS_ERR(rbdc)) {
......@@ -5432,8 +5436,10 @@ static ssize_t do_rbd_add(struct bus_type *bus,
}
rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
if (!rbd_dev)
if (!rbd_dev) {
rc = -ENOMEM;
goto err_out_client;
}
rbdc = NULL; /* rbd_dev now owns this */
spec = NULL; /* rbd_dev now owns this */
rbd_opts = NULL; /* rbd_dev now owns this */
......@@ -5458,10 +5464,13 @@ static ssize_t do_rbd_add(struct bus_type *bus,
*/
rbd_dev_header_unwatch_sync(rbd_dev);
rbd_dev_image_release(rbd_dev);
goto err_out_module;
goto out;
}
return count;
rc = count;
out:
module_put(THIS_MODULE);
return rc;
err_out_rbd_dev:
rbd_dev_destroy(rbd_dev);
......@@ -5470,12 +5479,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
err_out_args:
rbd_spec_put(spec);
kfree(rbd_opts);
err_out_module:
module_put(THIS_MODULE);
dout("Error adding device %s\n", buf);
return (ssize_t)rc;
goto out;
}
static ssize_t rbd_add(struct bus_type *bus,
......@@ -5495,17 +5499,15 @@ static ssize_t rbd_add_single_major(struct bus_type *bus,
return do_rbd_add(bus, buf, count);
}
static void rbd_dev_device_release(struct device *dev)
static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
rbd_free_disk(rbd_dev);
clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
device_del(&rbd_dev->dev);
rbd_dev_mapping_clear(rbd_dev);
if (!single_major)
unregister_blkdev(rbd_dev->major, rbd_dev->name);
rbd_dev_id_put(rbd_dev);
rbd_dev_mapping_clear(rbd_dev);
}
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
......@@ -5590,9 +5592,8 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
* rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
* in a potential use after free of rbd_dev->disk or rbd_dev.
*/
rbd_bus_del_dev(rbd_dev);
rbd_dev_device_release(rbd_dev);
rbd_dev_image_release(rbd_dev);
module_put(THIS_MODULE);
return count;
}
......@@ -5663,10 +5664,8 @@ static int rbd_slab_init(void)
if (rbd_segment_name_cache)
return 0;
out_err:
if (rbd_obj_request_cache) {
kmem_cache_destroy(rbd_obj_request_cache);
rbd_obj_request_cache = NULL;
}
kmem_cache_destroy(rbd_obj_request_cache);
rbd_obj_request_cache = NULL;
kmem_cache_destroy(rbd_img_request_cache);
rbd_img_request_cache = NULL;
......
......@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
const struct ceph_inode_info* ci = cookie_netfs_data;
uint16_t klen;
/* use ceph virtual inode (id + snaphot) */
/* use ceph virtual inode (id + snapshot) */
klen = sizeof(ci->i_vino);
if (klen > maxbuf)
return 0;
......
......@@ -1655,9 +1655,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(file_wanted == 0 || /* no open files */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
dout("check_caps trying to invalidate on %p\n", inode);
if (try_nonblocking_invalidate(inode) < 0) {
......@@ -1971,49 +1970,46 @@ static void sync_write_wait(struct inode *inode)
}
/*
* wait for any uncommitted directory operations to commit.
* wait for any unsafe requests to complete.
*/
static int unsafe_dirop_wait(struct inode *inode)
static int unsafe_request_wait(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct list_head *head = &ci->i_unsafe_dirops;
struct ceph_mds_request *req;
u64 last_tid;
int ret = 0;
if (!S_ISDIR(inode->i_mode))
return 0;
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
if (list_empty(head))
goto out;
req = list_last_entry(head, struct ceph_mds_request,
r_unsafe_dir_item);
last_tid = req->r_tid;
do {
ceph_mdsc_get_request(req);
spin_unlock(&ci->i_unsafe_lock);
if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
req1 = list_last_entry(&ci->i_unsafe_dirops,
struct ceph_mds_request,
r_unsafe_dir_item);
ceph_mdsc_get_request(req1);
}
if (!list_empty(&ci->i_unsafe_iops)) {
req2 = list_last_entry(&ci->i_unsafe_iops,
struct ceph_mds_request,
r_unsafe_target_item);
ceph_mdsc_get_request(req2);
}
spin_unlock(&ci->i_unsafe_lock);
dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
inode, req->r_tid, last_tid);
ret = !wait_for_completion_timeout(&req->r_safe_completion,
ceph_timeout_jiffies(req->r_timeout));
dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
ceph_timeout_jiffies(req1->r_timeout));
if (ret)
ret = -EIO; /* timed out */
ceph_mdsc_put_request(req);
spin_lock(&ci->i_unsafe_lock);
if (ret || list_empty(head))
break;
req = list_first_entry(head, struct ceph_mds_request,
r_unsafe_dir_item);
} while (req->r_tid < last_tid);
out:
spin_unlock(&ci->i_unsafe_lock);
return ret;
err = -EIO;
ceph_mdsc_put_request(req1);
}
if (req2) {
ret = !wait_for_completion_timeout(&req2->r_safe_completion,
ceph_timeout_jiffies(req2->r_timeout));
if (ret)
err = -EIO;
ceph_mdsc_put_request(req2);
}
return err;
}
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
......@@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
ret = unsafe_dirop_wait(inode);
ret = unsafe_request_wait(inode);
/*
* only wait on non-file metadata writeback (the mds
......
......@@ -34,6 +34,74 @@
* need to wait for MDS acknowledgement.
*/
/*
* Calculate the length sum of direct io vectors that can
* be combined into one page vector.
*/
static size_t dio_get_pagev_size(const struct iov_iter *it)
{
const struct iovec *iov = it->iov;
const struct iovec *iovend = iov + it->nr_segs;
size_t size;
size = iov->iov_len - it->iov_offset;
/*
* An iov can be page vectored when both the current tail
* and the next base are page aligned.
*/
while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
(++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
size += iov->iov_len;
}
dout("dio_get_pagevlen len = %zu\n", size);
return size;
}
/*
* Allocate a page vector based on (@it, @nbytes).
* The return value is the tuple describing a page vector,
* that is (@pages, @page_align, @num_pages).
*/
static struct page **
dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
size_t *page_align, int *num_pages)
{
struct iov_iter tmp_it = *it;
size_t align;
struct page **pages;
int ret = 0, idx, npages;
align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
(PAGE_SIZE - 1);
npages = calc_pages_for(align, nbytes);
pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
if (!pages) {
pages = vmalloc(sizeof(*pages) * npages);
if (!pages)
return ERR_PTR(-ENOMEM);
}
for (idx = 0; idx < npages; ) {
size_t start;
ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
npages - idx, &start);
if (ret < 0)
goto fail;
iov_iter_advance(&tmp_it, ret);
nbytes -= ret;
idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
}
BUG_ON(nbytes != 0);
*num_pages = npages;
*page_align = align;
dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
return pages;
fail:
ceph_put_page_vector(pages, idx, false);
return ERR_PTR(ret);
}
/*
* Prepare an open request. Preallocate ceph_cap to avoid an
......@@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
size_t start;
ssize_t n;
n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
if (n < 0)
return n;
num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
n = dio_get_pagev_size(i);
pages = dio_get_pages_alloc(i, n, &start, &num_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, n,
pages, num_pages, checkeof,
......@@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
CEPH_OSD_FLAG_WRITE;
while (iov_iter_count(from) > 0) {
u64 len = iov_iter_single_seg_count(from);
u64 len = dio_get_pagev_size(from);
size_t start;
ssize_t n;
......@@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) {
ret = n;
n = len;
pages = dio_get_pages_alloc(from, len, &start, &num_pages);
if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
ret = PTR_ERR(pages);
break;
}
num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
......
......@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
ci->i_snap_realm = NULL;
......
......@@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
ihold(dir);
spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
spin_unlock(&ci->i_unsafe_lock);
}
}
......@@ -665,13 +660,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
if (req->r_unsafe_dir) {
if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
}
if (req->r_target_inode && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_target_item);
spin_unlock(&ci->i_unsafe_lock);
}
if (req->r_unsafe_dir) {
iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
......@@ -1430,6 +1432,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
!(oissued & CEPH_CAP_FILE_CACHE)) {
used = 0;
oissued = 0;
}
if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
......@@ -1438,7 +1447,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
/* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true);
} else {
/* try to drop referring dentries */
/* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
......@@ -1704,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
req->r_started = jiffies;
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
INIT_LIST_HEAD(&req->r_wait);
......@@ -1935,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
len = sizeof(*head) +
pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
sizeof(struct timespec);
sizeof(struct ceph_timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
......@@ -2477,6 +2487,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
req->r_got_unsafe = true;
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
if (req->r_unsafe_dir) {
struct ceph_inode_info *ci =
ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_dir_item,
&ci->i_unsafe_dirops);
spin_unlock(&ci->i_unsafe_lock);
}
}
dout("handle_reply tid %lld result %d\n", tid, result);
......@@ -2518,6 +2536,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
up_read(&mdsc->snap_rwsem);
if (realm)
ceph_put_snap_realm(mdsc, realm);
if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
spin_unlock(&ci->i_unsafe_lock);
}
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
......@@ -3917,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
static int mds_sign_message(struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
return ceph_auth_sign_message(auth, msg);
}
static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
static int mds_check_message_signature(struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
return ceph_auth_check_message_signature(auth, msg);
}
......@@ -3940,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = {
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
.alloc_msg = mds_alloc_msg,
.sign_message = sign_message,
.check_message_signature = check_message_signature,
.sign_message = mds_sign_message,
.check_message_signature = mds_check_message_signature,
};
/* eof */
......@@ -236,6 +236,9 @@ struct ceph_mds_request {
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
/* unsafe requests that modify the target inode */
struct list_head r_unsafe_target_item;
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */
......
......@@ -342,6 +342,7 @@ struct ceph_inode_info {
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
......
......@@ -29,8 +29,9 @@
#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
#define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */
#define CEPH_OPT_NOMSGAUTH (1<<4) /* don't require msg signing feat */
#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */
#define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs */
#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY)
......@@ -137,6 +138,7 @@ struct ceph_client {
#endif
};
#define from_msgr(ms) container_of(ms, struct ceph_client, msgr)
/*
......
......@@ -43,10 +43,9 @@ struct ceph_connection_operations {
struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip);
int (*sign_message) (struct ceph_connection *con, struct ceph_msg *msg);
int (*check_message_signature) (struct ceph_connection *con,
struct ceph_msg *msg);
int (*sign_message) (struct ceph_msg *msg);
int (*check_message_signature) (struct ceph_msg *msg);
};
/* use format string %s%d */
......@@ -58,8 +57,6 @@ struct ceph_messenger {
atomic_t stopping;
possible_net_t net;
bool nocrc;
bool tcp_nodelay;
/*
* the global_seq counts connections i (attempt to) initiate
......@@ -67,9 +64,6 @@ struct ceph_messenger {
*/
u32 global_seq;
spinlock_t global_seq_lock;
u64 supported_features;
u64 required_features;
};
enum ceph_msg_data_type {
......@@ -268,11 +262,7 @@ extern void ceph_msgr_exit(void);
extern void ceph_msgr_flush(void);
extern void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
u64 supported_features,
u64 required_features,
bool nocrc,
bool tcp_nodelay);
struct ceph_entity_addr *myaddr);
extern void ceph_messenger_fini(struct ceph_messenger *msgr);
extern void ceph_con_init(struct ceph_connection *con, void *private,
......
......@@ -8,6 +8,7 @@
#include <linux/ceph/decode.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
#include "crypto.h"
......@@ -279,6 +280,15 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
return -EINVAL;
}
static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au)
{
ceph_crypto_key_destroy(&au->session_key);
if (au->buf) {
ceph_buffer_put(au->buf);
au->buf = NULL;
}
}
static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th,
struct ceph_x_authorizer *au)
......@@ -297,7 +307,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
ceph_crypto_key_destroy(&au->session_key);
ret = ceph_crypto_key_clone(&au->session_key, &th->session_key);
if (ret)
return ret;
goto out_au;
maxlen = sizeof(*msg_a) + sizeof(msg_b) +
ceph_x_encrypt_buflen(ticket_blob_len);
......@@ -309,8 +319,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
if (!au->buf) {
au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
if (!au->buf) {
ceph_crypto_key_destroy(&au->session_key);
return -ENOMEM;
ret = -ENOMEM;
goto out_au;
}
}
au->service = th->service;
......@@ -340,7 +350,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
p, end - p);
if (ret < 0)
goto out_buf;
goto out_au;
p += ret;
au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce,
......@@ -348,9 +358,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
BUG_ON(au->buf->vec.iov_len > maxlen);
return 0;
out_buf:
ceph_buffer_put(au->buf);
au->buf = NULL;
out_au:
ceph_x_authorizer_cleanup(au);
return ret;
}
......@@ -624,8 +633,7 @@ static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
{
struct ceph_x_authorizer *au = (void *)a;
ceph_crypto_key_destroy(&au->session_key);
ceph_buffer_put(au->buf);
ceph_x_authorizer_cleanup(au);
kfree(au);
}
......@@ -653,8 +661,7 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
remove_ticket_handler(ac, th);
}
if (xi->auth_authorizer.buf)
ceph_buffer_put(xi->auth_authorizer.buf);
ceph_x_authorizer_cleanup(&xi->auth_authorizer);
kfree(ac->private);
ac->private = NULL;
......@@ -691,8 +698,10 @@ static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
struct ceph_msg *msg)
{
int ret;
if (!auth->authorizer)
if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
msg, &msg->footer.sig);
if (ret < 0)
......@@ -707,8 +716,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
__le64 sig_check;
int ret;
if (!auth->authorizer)
if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
msg, &sig_check);
if (ret < 0)
......
......@@ -245,6 +245,8 @@ enum {
Opt_nocrc,
Opt_cephx_require_signatures,
Opt_nocephx_require_signatures,
Opt_cephx_sign_messages,
Opt_nocephx_sign_messages,
Opt_tcp_nodelay,
Opt_notcp_nodelay,
};
......@@ -267,6 +269,8 @@ static match_table_t opt_tokens = {
{Opt_nocrc, "nocrc"},
{Opt_cephx_require_signatures, "cephx_require_signatures"},
{Opt_nocephx_require_signatures, "nocephx_require_signatures"},
{Opt_cephx_sign_messages, "cephx_sign_messages"},
{Opt_nocephx_sign_messages, "nocephx_sign_messages"},
{Opt_tcp_nodelay, "tcp_nodelay"},
{Opt_notcp_nodelay, "notcp_nodelay"},
{-1, NULL}
......@@ -491,6 +495,12 @@ ceph_parse_options(char *options, const char *dev_name,
case Opt_nocephx_require_signatures:
opt->flags |= CEPH_OPT_NOMSGAUTH;
break;
case Opt_cephx_sign_messages:
opt->flags &= ~CEPH_OPT_NOMSGSIGN;
break;
case Opt_nocephx_sign_messages:
opt->flags |= CEPH_OPT_NOMSGSIGN;
break;
case Opt_tcp_nodelay:
opt->flags |= CEPH_OPT_TCP_NODELAY;
......@@ -534,6 +544,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
seq_puts(m, "nocrc,");
if (opt->flags & CEPH_OPT_NOMSGAUTH)
seq_puts(m, "nocephx_require_signatures,");
if (opt->flags & CEPH_OPT_NOMSGSIGN)
seq_puts(m, "nocephx_sign_messages,");
if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
seq_puts(m, "notcp_nodelay,");
......@@ -596,11 +608,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
if (ceph_test_opt(client, MYIP))
myaddr = &client->options->my_addr;
ceph_messenger_init(&client->msgr, myaddr,
client->supported_features,
client->required_features,
ceph_test_opt(client, NOCRC),
ceph_test_opt(client, TCP_NODELAY));
ceph_messenger_init(&client->msgr, myaddr);
/* subsystems */
err = ceph_monc_init(&client->monc, client);
......
......@@ -16,8 +16,10 @@ struct ceph_crypto_key {
static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
if (key)
if (key) {
kfree(key->key);
key->key = NULL;
}
}
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
......
......@@ -509,7 +509,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
return ret;
}
if (con->msgr->tcp_nodelay) {
if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) {
int optval = 1;
ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
......@@ -637,9 +637,6 @@ static int con_close_socket(struct ceph_connection *con)
static void ceph_msg_remove(struct ceph_msg *msg)
{
list_del_init(&msg->list_head);
BUG_ON(msg->con == NULL);
msg->con->ops->put(msg->con);
msg->con = NULL;
ceph_msg_put(msg);
}
......@@ -662,15 +659,14 @@ static void reset_connection(struct ceph_connection *con)
if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
con->ops->put(con);
}
con->connect_seq = 0;
con->out_seq = 0;
if (con->out_msg) {
BUG_ON(con->out_msg->con != con);
ceph_msg_put(con->out_msg);
con->out_msg = NULL;
}
......@@ -1205,7 +1201,7 @@ static void prepare_write_message_footer(struct ceph_connection *con)
con->out_kvec[v].iov_base = &m->footer;
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
if (con->ops->sign_message)
con->ops->sign_message(con, m);
con->ops->sign_message(m);
else
m->footer.sig = 0;
con->out_kvec[v].iov_len = sizeof(m->footer);
......@@ -1432,7 +1428,8 @@ static int prepare_write_connect(struct ceph_connection *con)
dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
con->connect_seq, global_seq, proto);
con->out_connect.features = cpu_to_le64(con->msgr->supported_features);
con->out_connect.features =
cpu_to_le64(from_msgr(con->msgr)->supported_features);
con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
con->out_connect.global_seq = cpu_to_le32(global_seq);
......@@ -1527,7 +1524,7 @@ static int write_partial_message_data(struct ceph_connection *con)
{
struct ceph_msg *msg = con->out_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
bool do_datacrc = !con->msgr->nocrc;
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
u32 crc;
dout("%s %p msg %p\n", __func__, con, msg);
......@@ -1552,8 +1549,8 @@ static int write_partial_message_data(struct ceph_connection *con)
bool need_crc;
int ret;
page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
&last_piece);
page = ceph_msg_data_next(cursor, &page_offset, &length,
&last_piece);
ret = ceph_tcp_sendpage(con->sock, page, page_offset,
length, !last_piece);
if (ret <= 0) {
......@@ -1564,7 +1561,7 @@ static int write_partial_message_data(struct ceph_connection *con)
}
if (do_datacrc && cursor->need_crc)
crc = ceph_crc32c_page(crc, page, page_offset, length);
need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);
need_crc = ceph_msg_data_advance(cursor, (size_t)ret);
}
dout("%s %p msg %p done\n", __func__, con, msg);
......@@ -2005,8 +2002,8 @@ static int process_banner(struct ceph_connection *con)
static int process_connect(struct ceph_connection *con)
{
u64 sup_feat = con->msgr->supported_features;
u64 req_feat = con->msgr->required_features;
u64 sup_feat = from_msgr(con->msgr)->supported_features;
u64 req_feat = from_msgr(con->msgr)->required_features;
u64 server_feat = ceph_sanitize_features(
le64_to_cpu(con->in_reply.features));
int ret;
......@@ -2232,7 +2229,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
{
struct ceph_msg *msg = con->in_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
const bool do_datacrc = !con->msgr->nocrc;
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
struct page *page;
size_t page_offset;
size_t length;
......@@ -2246,8 +2243,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = con->in_data_crc;
while (cursor->resid) {
page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
NULL);
page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
if (ret <= 0) {
if (do_datacrc)
......@@ -2258,7 +2254,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = ceph_crc32c_page(crc, page, page_offset, ret);
(void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);
(void) ceph_msg_data_advance(cursor, (size_t)ret);
}
if (do_datacrc)
con->in_data_crc = crc;
......@@ -2278,7 +2274,7 @@ static int read_partial_message(struct ceph_connection *con)
int end;
int ret;
unsigned int front_len, middle_len, data_len;
bool do_datacrc = !con->msgr->nocrc;
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
u64 seq;
u32 crc;
......@@ -2423,7 +2419,7 @@ static int read_partial_message(struct ceph_connection *con)
}
if (need_sign && con->ops->check_message_signature &&
con->ops->check_message_signature(con, m)) {
con->ops->check_message_signature(m)) {
pr_err("read_partial_message %p signature check failed\n", m);
return -EBADMSG;
}
......@@ -2438,13 +2434,10 @@ static int read_partial_message(struct ceph_connection *con)
*/
static void process_message(struct ceph_connection *con)
{
struct ceph_msg *msg;
struct ceph_msg *msg = con->in_msg;
BUG_ON(con->in_msg->con != con);
con->in_msg->con = NULL;
msg = con->in_msg;
con->in_msg = NULL;
con->ops->put(con);
/* if first message, set peer_name */
if (con->peer_name.type == 0)
......@@ -2677,7 +2670,7 @@ static int try_read(struct ceph_connection *con)
if (ret <= 0) {
switch (ret) {
case -EBADMSG:
con->error_msg = "bad crc";
con->error_msg = "bad crc/signature";
/* fall through */
case -EBADE:
ret = -EIO;
......@@ -2918,10 +2911,8 @@ static void con_fault(struct ceph_connection *con)
if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
con->ops->put(con);
}
/* Requeue anything that hasn't been acked */
......@@ -2952,15 +2943,8 @@ static void con_fault(struct ceph_connection *con)
* initialize a new messenger instance
*/
void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
u64 supported_features,
u64 required_features,
bool nocrc,
bool tcp_nodelay)
struct ceph_entity_addr *myaddr)
{
msgr->supported_features = supported_features;
msgr->required_features = required_features;
spin_lock_init(&msgr->global_seq_lock);
if (myaddr)
......@@ -2970,8 +2954,6 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
msgr->inst.addr.type = 0;
get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
encode_my_addr(msgr);
msgr->nocrc = nocrc;
msgr->tcp_nodelay = tcp_nodelay;
atomic_set(&msgr->stopping, 0);
write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
......@@ -2986,6 +2968,15 @@ void ceph_messenger_fini(struct ceph_messenger *msgr)
}
EXPORT_SYMBOL(ceph_messenger_fini);
static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
{
if (msg->con)
msg->con->ops->put(msg->con);
msg->con = con ? con->ops->get(con) : NULL;
BUG_ON(msg->con != con);
}
static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
......@@ -3017,9 +3008,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
return;
}
BUG_ON(msg->con != NULL);
msg->con = con->ops->get(con);
BUG_ON(msg->con == NULL);
msg_con_set(msg, con);
BUG_ON(!list_empty(&msg->list_head));
list_add_tail(&msg->list_head, &con->out_queue);
......@@ -3047,16 +3036,15 @@ void ceph_msg_revoke(struct ceph_msg *msg)
{
struct ceph_connection *con = msg->con;
if (!con)
if (!con) {
dout("%s msg %p null con\n", __func__, msg);
return; /* Message not in our possession */
}
mutex_lock(&con->mutex);
if (!list_empty(&msg->list_head)) {
dout("%s %p msg %p - was on queue\n", __func__, con, msg);
list_del_init(&msg->list_head);
BUG_ON(msg->con == NULL);
msg->con->ops->put(msg->con);
msg->con = NULL;
msg->hdr.seq = 0;
ceph_msg_put(msg);
......@@ -3080,16 +3068,13 @@ void ceph_msg_revoke(struct ceph_msg *msg)
*/
void ceph_msg_revoke_incoming(struct ceph_msg *msg)
{
struct ceph_connection *con;
struct ceph_connection *con = msg->con;
BUG_ON(msg == NULL);
if (!msg->con) {
if (!con) {
dout("%s msg %p null con\n", __func__, msg);
return; /* Message not in our possession */
}
con = msg->con;
mutex_lock(&con->mutex);
if (con->in_msg == msg) {
unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
......@@ -3335,9 +3320,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
}
if (msg) {
BUG_ON(*skip);
msg_con_set(msg, con);
con->in_msg = msg;
con->in_msg->con = con->ops->get(con);
BUG_ON(con->in_msg->con == NULL);
} else {
/*
* Null message pointer means either we should skip
......@@ -3384,6 +3368,8 @@ static void ceph_msg_release(struct kref *kref)
dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
msg_con_set(m, NULL);
/* drop middle, data, if any */
if (m->middle) {
ceph_buffer_put(m->middle);
......
......@@ -120,11 +120,13 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
}
#endif /* CONFIG_BLOCK */
#define osd_req_op_data(oreq, whch, typ, fld) \
({ \
BUG_ON(whch >= (oreq)->r_num_ops); \
&(oreq)->r_ops[whch].typ.fld; \
})
#define osd_req_op_data(oreq, whch, typ, fld) \
({ \
struct ceph_osd_request *__oreq = (oreq); \
unsigned int __whch = (whch); \
BUG_ON(__whch >= __oreq->r_num_ops); \
&__oreq->r_ops[__whch].typ.fld; \
})
static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
......@@ -1750,8 +1752,7 @@ static void complete_request(struct ceph_osd_request *req)
* handle osd op reply. either call the callback if it is specified,
* or do the completion to wake up the waiting thread.
*/
static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
struct ceph_connection *con)
static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
void *p, *end;
struct ceph_osd_request *req;
......@@ -2807,7 +2808,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
ceph_osdc_handle_map(osdc, msg);
break;
case CEPH_MSG_OSD_OPREPLY:
handle_reply(osdc, msg, con);
handle_reply(osdc, msg);
break;
case CEPH_MSG_WATCH_NOTIFY:
handle_watch_notify(osdc, msg);
......@@ -2849,9 +2850,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
goto out;
}
if (req->r_reply->con)
dout("%s revoking msg %p from old con %p\n", __func__,
req->r_reply, req->r_reply->con);
ceph_msg_revoke_incoming(req->r_reply);
if (front_len > req->r_reply->front_alloc_len) {
......@@ -2978,17 +2976,19 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc);
}
static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
static int osd_sign_message(struct ceph_msg *msg)
{
struct ceph_osd *o = con->private;
struct ceph_osd *o = msg->con->private;
struct ceph_auth_handshake *auth = &o->o_auth;
return ceph_auth_sign_message(auth, msg);
}
static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
static int osd_check_message_signature(struct ceph_msg *msg)
{
struct ceph_osd *o = con->private;
struct ceph_osd *o = msg->con->private;
struct ceph_auth_handshake *auth = &o->o_auth;
return ceph_auth_check_message_signature(auth, msg);
}
......@@ -3000,7 +3000,7 @@ static const struct ceph_connection_operations osd_con_ops = {
.verify_authorizer_reply = verify_authorizer_reply,
.invalidate_authorizer = invalidate_authorizer,
.alloc_msg = alloc_msg,
.sign_message = sign_message,
.check_message_signature = check_message_signature,
.sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature,
.fault = osd_reset,
};
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册