提交 b2deee2d 编写于 作者: L Linus Torvalds

Merge tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "This time around we have:

   - support for rbd data-pool feature, which enables rbd images on
     erasure-coded pools (myself). CEPH_PG_MAX_SIZE has been bumped to
     allow erasure-coded profiles with k+m up to 32.

   - a patch for ceph_d_revalidate() performance regression introduced
     in 4.9, along with some cleanups in the area (Jeff Layton)

   - a set of fixes for unsafe ->d_parent accesses in CephFS (Jeff
     Layton)

   - buffered reads are now processed in rsize windows instead of rasize
     windows (Andreas Gerstmayr). The new default for rsize mount option
     is 64M.

   - ack vs commit distinction is gone, greatly simplifying ->fsync()
     and MOSDOpReply handling code (myself)

  ... also a few filesystem bug fixes from Zheng, a CRUSH sync up (CRUSH
  computations are still serialized though) and several minor fixes and
  cleanups all over"

* tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client: (52 commits)
  libceph, rbd, ceph: WRITE | ONDISK -> WRITE
  libceph: get rid of ack vs commit
  ceph: remove special ack vs commit behavior
  ceph: tidy some white space in get_nonsnap_parent()
  crush: fix dprintk compilation
  crush: do is_out test only if we do not collide
  ceph: remove req from unsafe list when unregistering it
  rbd: constify device_type structure
  rbd: kill obj_request->object_name and rbd_segment_name_cache
  rbd: store and use obj_request->object_no
  rbd: RBD_V{1,2}_DATA_FORMAT macros
  rbd: factor out __rbd_osd_req_create()
  rbd: set offset and length outside of rbd_obj_request_create()
  rbd: support for data-pool feature
  rbd: introduce rbd_init_layout()
  rbd: use rbd_obj_bytes() more
  rbd: remove now unused rbd_obj_request_wait() and helpers
  rbd: switch rbd_obj_method_sync() to ceph_osdc_call()
  libceph: pass reply buffer length through ceph_osdc_call()
  rbd: do away with obj_request in rbd_obj_read_sync()
  ...
...@@ -98,11 +98,10 @@ Mount Options ...@@ -98,11 +98,10 @@ Mount Options
size. size.
rsize=X rsize=X
Specify the maximum read size in bytes. By default there is no Specify the maximum read size in bytes. Default: 64 MB.
maximum.
rasize=X rasize=X
Specify the maximum readahead. Specify the maximum readahead. Default: 8 MB.
mount_timeout=X mount_timeout=X
Specify the timeout value for mount (in seconds), in the case Specify the timeout value for mount (in seconds), in the case
......
此差异已折叠。
...@@ -25,8 +25,8 @@ ...@@ -25,8 +25,8 @@
*/ */
#define RBD_HEADER_PREFIX "rbd_header." #define RBD_HEADER_PREFIX "rbd_header."
#define RBD_DATA_PREFIX "rbd_data."
#define RBD_ID_PREFIX "rbd_id." #define RBD_ID_PREFIX "rbd_id."
#define RBD_V2_DATA_FORMAT "%s.%016llx"
#define RBD_LOCK_NAME "rbd_lock" #define RBD_LOCK_NAME "rbd_lock"
#define RBD_LOCK_TAG "internal" #define RBD_LOCK_TAG "internal"
...@@ -42,13 +42,14 @@ enum rbd_notify_op { ...@@ -42,13 +42,14 @@ enum rbd_notify_op {
/* /*
* For format version 1, rbd image 'foo' consists of objects * For format version 1, rbd image 'foo' consists of objects
* foo.rbd - image metadata * foo.rbd - image metadata
* rb.<idhi>.<idlo>.00000000 * rb.<idhi>.<idlo>.<extra>.000000000000
* rb.<idhi>.<idlo>.00000001 * rb.<idhi>.<idlo>.<extra>.000000000001
* ... - data * ... - data
* There is no notion of a persistent image id in rbd format 1. * There is no notion of a persistent image id in rbd format 1.
*/ */
#define RBD_SUFFIX ".rbd" #define RBD_SUFFIX ".rbd"
#define RBD_V1_DATA_FORMAT "%s.%012llx"
#define RBD_DIRECTORY "rbd_directory" #define RBD_DIRECTORY "rbd_directory"
#define RBD_INFO "rbd_info" #define RBD_INFO "rbd_info"
...@@ -57,9 +58,6 @@ enum rbd_notify_op { ...@@ -57,9 +58,6 @@ enum rbd_notify_op {
#define RBD_MIN_OBJ_ORDER 16 #define RBD_MIN_OBJ_ORDER 16
#define RBD_MAX_OBJ_ORDER 30 #define RBD_MAX_OBJ_ORDER 30
#define RBD_COMP_NONE 0
#define RBD_CRYPT_NONE 0
#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" #define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
#define RBD_HEADER_SIGNATURE "RBD" #define RBD_HEADER_SIGNATURE "RBD"
#define RBD_HEADER_VERSION "001.005" #define RBD_HEADER_VERSION "001.005"
......
...@@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) ...@@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
nr_pages = i; nr_pages = i;
if (nr_pages > 0) { if (nr_pages > 0) {
len = nr_pages << PAGE_SHIFT; len = nr_pages << PAGE_SHIFT;
osd_req_op_extent_update(req, 0, len);
break; break;
} }
goto out_pages; goto out_pages;
...@@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" : wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
if (ci->i_wrbuffer_ref > 0) { if (ci->i_wrbuffer_ref > 0) {
pr_warn_ratelimited( pr_warn_ratelimited(
"writepage_start %p %lld forced umount\n", "writepage_start %p %lld forced umount\n",
...@@ -1017,8 +1018,7 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -1017,8 +1018,7 @@ static int ceph_writepages_start(struct address_space *mapping,
&ci->i_layout, vino, &ci->i_layout, vino,
offset, &len, 0, num_ops, offset, &len, 0, num_ops,
CEPH_OSD_OP_WRITE, CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_WRITE,
CEPH_OSD_FLAG_ONDISK,
snapc, truncate_seq, snapc, truncate_seq,
truncate_size, false); truncate_size, false);
if (IS_ERR(req)) { if (IS_ERR(req)) {
...@@ -1028,8 +1028,7 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -1028,8 +1028,7 @@ static int ceph_writepages_start(struct address_space *mapping,
min(num_ops, min(num_ops,
CEPH_OSD_SLAB_OPS), CEPH_OSD_SLAB_OPS),
CEPH_OSD_OP_WRITE, CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_WRITE,
CEPH_OSD_FLAG_ONDISK,
snapc, truncate_seq, snapc, truncate_seq,
truncate_size, true); truncate_size, true);
BUG_ON(IS_ERR(req)); BUG_ON(IS_ERR(req));
...@@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file, ...@@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file,
int r; int r;
struct ceph_snap_context *snapc, *oldest; struct ceph_snap_context *snapc, *oldest;
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout(" page %p forced umount\n", page); dout(" page %p forced umount\n", page);
unlock_page(page); unlock_page(page);
return -EIO; return -EIO;
...@@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ...@@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1, ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
NULL, 0, 0, false); NULL, 0, 0, false);
if (IS_ERR(req)) { if (IS_ERR(req)) {
err = PTR_ERR(req); err = PTR_ERR(req);
...@@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ...@@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3, ceph_vino(inode), 0, &len, 1, 3,
CEPH_OSD_OP_WRITE, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
NULL, ci->i_truncate_seq, NULL, ci->i_truncate_seq,
ci->i_truncate_size, false); ci->i_truncate_size, false);
if (IS_ERR(req)) { if (IS_ERR(req)) {
...@@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, ...@@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
goto out_unlock; goto out_unlock;
} }
wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
......
...@@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) ...@@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
inode); inode);
if (fscache_cookie_enabled(ci->fscache)) { if (fscache_cookie_enabled(ci->fscache)) {
dout("fscache_file_set_cookie %p %p enabing cache\n", dout("fscache_file_set_cookie %p %p enabling cache\n",
inode, filp); inode, filp);
} }
} }
......
...@@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) ...@@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
/* /*
* Return caps we have registered with the MDS(s) as 'wanted'. * Return caps we have registered with the MDS(s) as 'wanted'.
*/ */
int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
{ {
struct ceph_cap *cap; struct ceph_cap *cap;
struct rb_node *p; struct rb_node *p;
...@@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) ...@@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node); cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap)) if (check && !__cap_is_valid(cap))
continue; continue;
if (cap == ci->i_auth_cap) if (cap == ci->i_auth_cap)
mds_wanted |= cap->mds_wanted; mds_wanted |= cap->mds_wanted;
...@@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ...@@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
delayed = 1; delayed = 1;
} }
ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
if (want & ~cap->mds_wanted) {
/* user space may open/close single file frequently.
* This avoids droping mds_wanted immediately after
* requesting new mds_wanted.
*/
__cap_set_timeouts(mdsc, ci);
}
cap->issued &= retain; /* drop bits we don't want */ cap->issued &= retain; /* drop bits we don't want */
if (cap->implemented & ~cap->issued) { if (cap->implemented & ~cap->issued) {
...@@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ...@@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
ceph_sync_write_wait(inode);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end); ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0) if (ret < 0)
goto out; goto out;
...@@ -2477,23 +2482,22 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ...@@ -2477,23 +2482,22 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
int mds_wanted; int mds_wanted;
if (ACCESS_ONCE(mdsc->fsc->mount_state) == if (READ_ONCE(mdsc->fsc->mount_state) ==
CEPH_MOUNT_SHUTDOWN) { CEPH_MOUNT_SHUTDOWN) {
dout("get_cap_refs %p forced umount\n", inode); dout("get_cap_refs %p forced umount\n", inode);
*err = -EIO; *err = -EIO;
ret = 1; ret = 1;
goto out_unlock; goto out_unlock;
} }
mds_wanted = __ceph_caps_mds_wanted(ci); mds_wanted = __ceph_caps_mds_wanted(ci, false);
if ((mds_wanted & need) != need) { if (need & ~(mds_wanted & need)) {
dout("get_cap_refs %p caps were dropped" dout("get_cap_refs %p caps were dropped"
" (session killed?)\n", inode); " (session killed?)\n", inode);
*err = -ESTALE; *err = -ESTALE;
ret = 1; ret = 1;
goto out_unlock; goto out_unlock;
} }
if ((mds_wanted & file_wanted) == if (!(file_wanted & ~mds_wanted))
(file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
} }
...@@ -3404,6 +3408,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, ...@@ -3404,6 +3408,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
tcap->implemented |= issued; tcap->implemented |= issued;
if (cap == ci->i_auth_cap) if (cap == ci->i_auth_cap)
ci->i_auth_cap = tcap; ci->i_auth_cap = tcap;
if (!list_empty(&ci->i_cap_flush_list) && if (!list_empty(&ci->i_cap_flush_list) &&
ci->i_auth_cap == tcap) { ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock); spin_lock(&mdsc->cap_dirty_lock);
...@@ -3417,9 +3422,18 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, ...@@ -3417,9 +3422,18 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
} else if (tsession) { } else if (tsession) {
/* add placeholder for the export tagert */ /* add placeholder for the export tagert */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
if (!list_empty(&ci->i_cap_flush_list) &&
ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&tcap->session->s_cap_flushing);
spin_unlock(&mdsc->cap_dirty_lock);
}
__ceph_remove_cap(cap, false); __ceph_remove_cap(cap, false);
goto out_unlock; goto out_unlock;
} }
...@@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode, ...@@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
} }
int ceph_encode_dentry_release(void **p, struct dentry *dentry, int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless) int mds, int drop, int unless)
{ {
struct inode *dir = d_inode(dentry->d_parent); struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p; struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_dentry_info *di = ceph_dentry(dentry);
int force = 0; int force = 0;
...@@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, ...@@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_lock(&dentry->d_lock); spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds) if (di->lease_session && di->lease_session->s_mds == mds)
force = 1; force = 1;
if (!dir) {
parent = dget(dentry->d_parent);
dir = d_inode(parent);
}
spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
dput(parent);
spin_lock(&dentry->d_lock); spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) { if (ret && di->lease_session && di->lease_session->s_mds == mds) {
......
...@@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p) ...@@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
if (req->r_got_unsafe) if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
seq_puts(s, "\t(unsafe)"); seq_puts(s, "\t(unsafe)");
else else
seq_puts(s, "\t"); seq_puts(s, "\t");
......
...@@ -371,7 +371,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -371,7 +371,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* hints to request -> mds selection code */ /* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS; req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag); req->r_direct_hash = ceph_frag_value(frag);
req->r_direct_is_hash = true; __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
if (fi->last_name) { if (fi->last_name) {
req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
if (!req->r_path2) { if (!req->r_path2) {
...@@ -417,7 +417,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -417,7 +417,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
fi->frag = frag; fi->frag = frag;
fi->last_readdir = req; fi->last_readdir = req;
if (req->r_did_prepopulate) { if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
fi->readdir_cache_idx = req->r_readdir_cache_idx; fi->readdir_cache_idx = req->r_readdir_cache_idx;
if (fi->readdir_cache_idx < 0) { if (fi->readdir_cache_idx < 0) {
/* preclude from marking dir ordered */ /* preclude from marking dir ordered */
...@@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, ...@@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
mask |= CEPH_CAP_XATTR_SHARED; mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = cpu_to_le32(mask); req->r_args.getattr.mask = cpu_to_le32(mask);
req->r_locked_dir = dir; req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
err = ceph_handle_snapdir(req, dentry, err); err = ceph_handle_snapdir(req, dentry, err);
dentry = ceph_finish_lookup(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err);
...@@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, ...@@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
} }
req->r_dentry = dget(dentry); req->r_dentry = dget(dentry);
req->r_num_caps = 2; req->r_num_caps = 2;
req->r_locked_dir = dir; req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev); req->r_args.mknod.rdev = cpu_to_le32(rdev);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
...@@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, ...@@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
goto out; goto out;
} }
req->r_locked_dir = dir; req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry = dget(dentry); req->r_dentry = dget(dentry);
req->r_num_caps = 2; req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
...@@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ...@@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
req->r_dentry = dget(dentry); req->r_dentry = dget(dentry);
req->r_num_caps = 2; req->r_num_caps = 2;
req->r_locked_dir = dir; req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
...@@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, ...@@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
req->r_dentry = dget(dentry); req->r_dentry = dget(dentry);
req->r_num_caps = 2; req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry); req->r_old_dentry = dget(old_dentry);
req->r_locked_dir = dir; req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
/* release LINK_SHARED on source inode (mds will lock it) */ /* release LINK_SHARED on source inode (mds will lock it) */
...@@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) ...@@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
} }
req->r_dentry = dget(dentry); req->r_dentry = dget(dentry);
req->r_num_caps = 2; req->r_num_caps = 2;
req->r_locked_dir = dir; req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = drop_caps_for_unlink(inode); req->r_inode_drop = drop_caps_for_unlink(inode);
...@@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req->r_num_caps = 2; req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry); req->r_old_dentry = dget(old_dentry);
req->r_old_dentry_dir = old_dir; req->r_old_dentry_dir = old_dir;
req->r_locked_dir = new_dir; req->r_parent = new_dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
...@@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ...@@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct inode *dir; struct inode *dir;
if (flags & LOOKUP_RCU) { if (flags & LOOKUP_RCU) {
parent = ACCESS_ONCE(dentry->d_parent); parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent); dir = d_inode_rcu(parent);
if (!dir) if (!dir)
return -ECHILD; return -ECHILD;
...@@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ...@@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD; return -ECHILD;
op = ceph_snap(dir) == CEPH_SNAPDIR ? op = ceph_snap(dir) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR; CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
if (!IS_ERR(req)) { if (!IS_ERR(req)) {
req->r_dentry = dget(dentry); req->r_dentry = dget(dentry);
req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2; req->r_num_caps = 2;
req->r_parent = dir;
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir)) if (ceph_security_xattr_wanted(dir))
......
...@@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name, ...@@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name,
req->r_inode = d_inode(child); req->r_inode = d_inode(child);
ihold(d_inode(child)); ihold(d_inode(child));
req->r_ino2 = ceph_vino(d_inode(parent)); req->r_ino2 = ceph_vino(d_inode(parent));
req->r_locked_dir = d_inode(parent); req->r_parent = d_inode(parent);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_num_caps = 2; req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
......
...@@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file) ...@@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file)
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (__ceph_is_any_real_caps(ci) && if (__ceph_is_any_real_caps(ci) &&
(((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
int mds_wanted = __ceph_caps_mds_wanted(ci); int mds_wanted = __ceph_caps_mds_wanted(ci, true);
int issued = __ceph_caps_issued(ci, NULL); int issued = __ceph_caps_issued(ci, NULL);
dout("open %p fmode %d want %s issued %s using existing\n", dout("open %p fmode %d want %s issued %s using existing\n",
...@@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, ...@@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= CEPH_CAP_XATTR_SHARED; mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask); req->r_args.open.mask = cpu_to_le32(mask);
req->r_locked_dir = dir; /* caller holds dir->i_mutex */ req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req); req);
...@@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
goto out; goto out;
} }
req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
...@@ -794,89 +793,6 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -794,89 +793,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
kfree(aio_work); kfree(aio_work);
} }
/*
* Write commit request unsafe callback, called to tell us when a
* request is unsafe (that is, in flight--has been handed to the
* messenger to send to its target osd). It is called again when
* we've received a response message indicating the request is
* "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
* is completed early (and unsuccessfully) due to a timeout or
* interrupt.
*
* This is used if we requested both an ACK and ONDISK commit reply
* from the OSD.
*/
static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
{
struct ceph_inode_info *ci = ceph_inode(req->r_inode);
dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
unsafe ? "un" : "");
if (unsafe) {
ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_item,
&ci->i_unsafe_writes);
spin_unlock(&ci->i_unsafe_lock);
complete_all(&req->r_completion);
} else {
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_item);
spin_unlock(&ci->i_unsafe_lock);
ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
}
}
/*
* Wait on any unsafe replies for the given inode. First wait on the
* newest request, and make that the upper bound. Then, if there are
* more requests, keep waiting on the oldest as long as it is still older
* than the original request.
*/
void ceph_sync_write_wait(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct list_head *head = &ci->i_unsafe_writes;
struct ceph_osd_request *req;
u64 last_tid;
if (!S_ISREG(inode->i_mode))
return;
spin_lock(&ci->i_unsafe_lock);
if (list_empty(head))
goto out;
/* set upper bound as _last_ entry in chain */
req = list_last_entry(head, struct ceph_osd_request,
r_unsafe_item);
last_tid = req->r_tid;
do {
ceph_osdc_get_request(req);
spin_unlock(&ci->i_unsafe_lock);
dout("sync_write_wait on tid %llu (until %llu)\n",
req->r_tid, last_tid);
wait_for_completion(&req->r_done_completion);
ceph_osdc_put_request(req);
spin_lock(&ci->i_unsafe_lock);
/*
* from here on look at first entry in chain, since we
* only want to wait for anything older than last_tid
*/
if (list_empty(head))
break;
req = list_first_entry(head, struct ceph_osd_request,
r_unsafe_item);
} while (req->r_tid < last_tid);
out:
spin_unlock(&ci->i_unsafe_lock);
}
static ssize_t static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_snap_context *snapc, struct ceph_snap_context *snapc,
...@@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (ret2 < 0) if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret2); dout("invalidate_inode_pages2_range returned %d\n", ret2);
flags = CEPH_OSD_FLAG_ORDERSNAP | flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
} else { } else {
flags = CEPH_OSD_FLAG_READ; flags = CEPH_OSD_FLAG_READ;
} }
...@@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ...@@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ret < 0) if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret); dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP | flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE |
CEPH_OSD_FLAG_ACK;
while ((len = iov_iter_count(from)) > 0) { while ((len = iov_iter_count(from)) > 0) {
size_t left; size_t left;
...@@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ...@@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
goto out; goto out;
} }
/* get a second commit callback */
req->r_unsafe_callback = ceph_sync_write_unsafe;
req->r_inode = inode; req->r_inode = inode;
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
...@@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode, ...@@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode,
ceph_vino(inode), ceph_vino(inode),
offset, length, offset, length,
0, 1, op, 0, 1, op,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_WRITE,
CEPH_OSD_FLAG_ONDISK,
NULL, 0, 0, false); NULL, 0, 0, false);
if (IS_ERR(req)) { if (IS_ERR(req)) {
ret = PTR_ERR(req); ret = PTR_ERR(req);
......
...@@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_gen = 0; ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0; ci->i_rdcache_revoking = 0;
INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops); INIT_LIST_HEAD(&ci->i_unsafe_dirops);
INIT_LIST_HEAD(&ci->i_unsafe_iops); INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock); spin_lock_init(&ci->i_unsafe_lock);
...@@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode) ...@@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode)
return 1; return 1;
} }
void ceph_evict_inode(struct inode *inode)
{
/* wait unsafe sync writes */
ceph_sync_write_wait(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
}
static inline blkcnt_t calc_inode_blocks(u64 size) static inline blkcnt_t calc_inode_blocks(u64 size)
{ {
return (size + (1<<9) - 1) >> 9; return (size + (1<<9) - 1) >> 9;
...@@ -1016,7 +1007,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -1016,7 +1007,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
static void update_dentry_lease(struct dentry *dentry, static void update_dentry_lease(struct dentry *dentry,
struct ceph_mds_reply_lease *lease, struct ceph_mds_reply_lease *lease,
struct ceph_mds_session *session, struct ceph_mds_session *session,
unsigned long from_time) unsigned long from_time,
struct ceph_vino *tgt_vino,
struct ceph_vino *dir_vino)
{ {
struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_dentry_info *di = ceph_dentry(dentry);
long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned duration = le32_to_cpu(lease->duration_ms);
...@@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry, ...@@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry,
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
struct inode *dir; struct inode *dir;
/*
* Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
* we expect a negative dentry.
*/
if (!tgt_vino && d_really_is_positive(dentry))
return;
if (tgt_vino && (d_really_is_negative(dentry) ||
!ceph_ino_compare(d_inode(dentry), tgt_vino)))
return;
spin_lock(&dentry->d_lock); spin_lock(&dentry->d_lock);
dout("update_dentry_lease %p duration %lu ms ttl %lu\n", dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
dentry, duration, ttl); dentry, duration, ttl);
/* make lease_rdcache_gen match directory */
dir = d_inode(dentry->d_parent); dir = d_inode(dentry->d_parent);
/* make sure parent matches dir_vino */
if (!ceph_ino_compare(dir, dir_vino))
goto out_unlock;
/* only track leases on regular dentries */ /* only track leases on regular dentries */
if (ceph_snap(dir) != CEPH_NOSNAP) if (ceph_snap(dir) != CEPH_NOSNAP)
goto out_unlock; goto out_unlock;
...@@ -1108,61 +1115,27 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) ...@@ -1108,61 +1115,27 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
* *
* Called with snap_rwsem (read). * Called with snap_rwsem (read).
*/ */
int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_mds_session *session)
{ {
struct ceph_mds_session *session = req->r_session;
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL; struct inode *in = NULL;
struct ceph_vino vino; struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_client(sb); struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
int err = 0; int err = 0;
dout("fill_trace %p is_dentry %d is_target %d\n", req, dout("fill_trace %p is_dentry %d is_target %d\n", req,
rinfo->head->is_dentry, rinfo->head->is_target); rinfo->head->is_dentry, rinfo->head->is_target);
#if 0
/*
* Debugging hook:
*
* If we resend completed ops to a recovering mds, we get no
* trace. Since that is very rare, pretend this is the case
* to ensure the 'no trace' handlers in the callers behave.
*
* Fill in inodes unconditionally to avoid breaking cap
* invariants.
*/
if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
pr_info("fill_trace faking empty trace on %lld %s\n",
req->r_tid, ceph_mds_op_name(rinfo->head->op));
if (rinfo->head->is_dentry) {
rinfo->head->is_dentry = 0;
err = fill_inode(req->r_locked_dir,
&rinfo->diri, rinfo->dirfrag,
session, req->r_request_started, -1);
}
if (rinfo->head->is_target) {
rinfo->head->is_target = 0;
ininfo = rinfo->targeti.in;
vino.ino = le64_to_cpu(ininfo->ino);
vino.snap = le64_to_cpu(ininfo->snapid);
in = ceph_get_inode(sb, vino);
err = fill_inode(in, &rinfo->targeti, NULL,
session, req->r_request_started,
req->r_fmode);
iput(in);
}
}
#endif
if (!rinfo->head->is_target && !rinfo->head->is_dentry) { if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
dout("fill_trace reply is empty!\n"); dout("fill_trace reply is empty!\n");
if (rinfo->head->result == 0 && req->r_locked_dir) if (rinfo->head->result == 0 && req->r_parent)
ceph_invalidate_dir_request(req); ceph_invalidate_dir_request(req);
return 0; return 0;
} }
if (rinfo->head->is_dentry) { if (rinfo->head->is_dentry) {
struct inode *dir = req->r_locked_dir; struct inode *dir = req->r_parent;
if (dir) { if (dir) {
err = fill_inode(dir, NULL, err = fill_inode(dir, NULL,
...@@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dname.name = rinfo->dname; dname.name = rinfo->dname;
dname.len = rinfo->dname_len; dname.len = rinfo->dname_len;
dname.hash = full_name_hash(parent, dname.name, dname.len); dname.hash = full_name_hash(parent, dname.name, dname.len);
vino.ino = le64_to_cpu(rinfo->targeti.in->ino); tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
retry_lookup: retry_lookup:
dn = d_lookup(parent, &dname); dn = d_lookup(parent, &dname);
dout("d_lookup on parent=%p name=%.*s got %p\n", dout("d_lookup on parent=%p name=%.*s got %p\n",
...@@ -1206,8 +1179,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1206,8 +1179,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
} }
err = 0; err = 0;
} else if (d_really_is_positive(dn) && } else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != vino.ino || (ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != vino.snap)) { ceph_snap(d_inode(dn)) != tvino.snap)) {
dout(" dn %p points to wrong inode %p\n", dout(" dn %p points to wrong inode %p\n",
dn, d_inode(dn)); dn, d_inode(dn));
d_delete(dn); d_delete(dn);
...@@ -1221,10 +1194,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1221,10 +1194,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
} }
if (rinfo->head->is_target) { if (rinfo->head->is_target) {
vino.ino = le64_to_cpu(rinfo->targeti.in->ino); tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
in = ceph_get_inode(sb, vino); in = ceph_get_inode(sb, tvino);
if (IS_ERR(in)) { if (IS_ERR(in)) {
err = PTR_ERR(in); err = PTR_ERR(in);
goto done; goto done;
...@@ -1233,8 +1206,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1233,8 +1206,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
session, req->r_request_started, session, req->r_request_started,
(!req->r_aborted && rinfo->head->result == 0) ? (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
req->r_fmode : -1, rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation); &req->r_caps_reservation);
if (err < 0) { if (err < 0) {
pr_err("fill_inode badness %p %llx.%llx\n", pr_err("fill_inode badness %p %llx.%llx\n",
...@@ -1247,8 +1220,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1247,8 +1220,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
* ignore null lease/binding on snapdir ENOENT, or else we * ignore null lease/binding on snapdir ENOENT, or else we
* will have trouble splicing in the virtual snapdir later * will have trouble splicing in the virtual snapdir later
*/ */
if (rinfo->head->is_dentry && !req->r_aborted && if (rinfo->head->is_dentry &&
req->r_locked_dir && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
fsc->mount_options->snapdir_name, fsc->mount_options->snapdir_name,
req->r_dentry->d_name.len))) { req->r_dentry->d_name.len))) {
...@@ -1257,17 +1231,19 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1257,17 +1231,19 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
* mknod symlink mkdir : null -> new inode * mknod symlink mkdir : null -> new inode
* unlink : linked -> null * unlink : linked -> null
*/ */
struct inode *dir = req->r_locked_dir; struct inode *dir = req->r_parent;
struct dentry *dn = req->r_dentry; struct dentry *dn = req->r_dentry;
bool have_dir_cap, have_lease; bool have_dir_cap, have_lease;
BUG_ON(!dn); BUG_ON(!dn);
BUG_ON(!dir); BUG_ON(!dir);
BUG_ON(d_inode(dn->d_parent) != dir); BUG_ON(d_inode(dn->d_parent) != dir);
BUG_ON(ceph_ino(dir) !=
le64_to_cpu(rinfo->diri.in->ino)); dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
BUG_ON(ceph_snap(dir) != dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
le64_to_cpu(rinfo->diri.in->snapid));
BUG_ON(ceph_ino(dir) != dvino.ino);
BUG_ON(ceph_snap(dir) != dvino.snap);
/* do we have a lease on the whole dir? */ /* do we have a lease on the whole dir? */
have_dir_cap = have_dir_cap =
...@@ -1319,12 +1295,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1319,12 +1295,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
ceph_dir_clear_ordered(dir); ceph_dir_clear_ordered(dir);
dout("d_delete %p\n", dn); dout("d_delete %p\n", dn);
d_delete(dn); d_delete(dn);
} else { } else if (have_lease) {
if (have_lease && d_unhashed(dn)) if (d_unhashed(dn))
d_add(dn, NULL); d_add(dn, NULL);
update_dentry_lease(dn, rinfo->dlease, update_dentry_lease(dn, rinfo->dlease,
session, session,
req->r_request_started); req->r_request_started,
NULL, &dvino);
} }
goto done; goto done;
} }
...@@ -1347,15 +1324,19 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1347,15 +1324,19 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
have_lease = false; have_lease = false;
} }
if (have_lease) if (have_lease) {
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
update_dentry_lease(dn, rinfo->dlease, session, update_dentry_lease(dn, rinfo->dlease, session,
req->r_request_started); req->r_request_started,
&tvino, &dvino);
}
dout(" final dn %p\n", dn); dout(" final dn %p\n", dn);
} else if (!req->r_aborted && } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
(req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP) &&
req->r_op == CEPH_MDS_OP_MKSNAP)) { !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct dentry *dn = req->r_dentry; struct dentry *dn = req->r_dentry;
struct inode *dir = req->r_locked_dir; struct inode *dir = req->r_parent;
/* fill out a snapdir LOOKUPSNAP dentry */ /* fill out a snapdir LOOKUPSNAP dentry */
BUG_ON(!dn); BUG_ON(!dn);
...@@ -1370,6 +1351,26 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1370,6 +1351,26 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
goto done; goto done;
} }
req->r_dentry = dn; /* may have spliced */ req->r_dentry = dn; /* may have spliced */
} else if (rinfo->head->is_dentry) {
struct ceph_vino *ptvino = NULL;
if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
le32_to_cpu(rinfo->dlease->duration_ms)) {
dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
if (rinfo->head->is_target) {
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
ptvino = &tvino;
}
update_dentry_lease(req->r_dentry, rinfo->dlease,
session, req->r_request_started, ptvino,
&dvino);
} else {
dout("%s: no dentry lease or dir cap\n", __func__);
}
} }
done: done:
dout("fill_trace done err=%d\n", err); dout("fill_trace done err=%d\n", err);
...@@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
u32 fpos_offset; u32 fpos_offset;
struct ceph_readdir_cache_control cache_ctl = {}; struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted) if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
return readdir_prepopulate_inodes_only(req, session); return readdir_prepopulate_inodes_only(req, session);
if (rinfo->hash_order && req->r_path2) { if (rinfo->hash_order && req->r_path2) {
...@@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
/* FIXME: release caps/leases if error occurs */ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) { for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino; struct ceph_vino tvino, dvino;
dname.name = rde->name; dname.name = rde->name;
dname.len = rde->name_len; dname.len = rde->name_len;
dname.hash = full_name_hash(parent, dname.name, dname.len); dname.hash = full_name_hash(parent, dname.name, dname.len);
vino.ino = le64_to_cpu(rde->inode.in->ino); tvino.ino = le64_to_cpu(rde->inode.in->ino);
vino.snap = le64_to_cpu(rde->inode.in->snapid); tvino.snap = le64_to_cpu(rde->inode.in->snapid);
if (rinfo->hash_order) { if (rinfo->hash_order) {
u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
...@@ -1559,8 +1560,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1559,8 +1560,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
goto out; goto out;
} }
} else if (d_really_is_positive(dn) && } else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != vino.ino || (ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != vino.snap)) { ceph_snap(d_inode(dn)) != tvino.snap)) {
dout(" dn %p points to wrong inode %p\n", dout(" dn %p points to wrong inode %p\n",
dn, d_inode(dn)); dn, d_inode(dn));
d_delete(dn); d_delete(dn);
...@@ -1572,7 +1573,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1572,7 +1573,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (d_really_is_positive(dn)) { if (d_really_is_positive(dn)) {
in = d_inode(dn); in = d_inode(dn);
} else { } else {
in = ceph_get_inode(parent->d_sb, vino); in = ceph_get_inode(parent->d_sb, tvino);
if (IS_ERR(in)) { if (IS_ERR(in)) {
dout("new_inode badness\n"); dout("new_inode badness\n");
d_drop(dn); d_drop(dn);
...@@ -1617,8 +1618,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1617,8 +1618,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_dentry(dn)->offset = rde->offset; ceph_dentry(dn)->offset = rde->offset;
dvino = ceph_vino(d_inode(parent));
update_dentry_lease(dn, rde->lease, req->r_session, update_dentry_lease(dn, rde->lease, req->r_session,
req->r_request_started); req->r_request_started, &tvino, &dvino);
if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn, ret = fill_readdir_cache(d_inode(parent), dn,
...@@ -1632,7 +1634,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1632,7 +1634,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
out: out:
if (err == 0 && skipped == 0) { if (err == 0 && skipped == 0) {
req->r_did_prepopulate = true; set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
req->r_readdir_cache_idx = cache_ctl.index; req->r_readdir_cache_idx = cache_ctl.index;
} }
ceph_readdir_cache_release(&cache_ctl); ceph_readdir_cache_release(&cache_ctl);
...@@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work) ...@@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work)
mutex_lock(&ci->i_truncate_mutex); mutex_lock(&ci->i_truncate_mutex);
if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
inode, ceph_ino(inode)); inode, ceph_ino(inode));
mapping_set_error(inode->i_mapping, -EIO); mapping_set_error(inode->i_mapping, -EIO);
......
...@@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) ...@@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
l.stripe_count = ci->i_layout.stripe_count; l.stripe_count = ci->i_layout.stripe_count;
l.object_size = ci->i_layout.object_size; l.object_size = ci->i_layout.object_size;
l.data_pool = ci->i_layout.pool_id; l.data_pool = ci->i_layout.pool_id;
l.preferred_osd = (s32)-1; l.preferred_osd = -1;
if (copy_to_user(arg, &l, sizeof(l))) if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT; return -EFAULT;
} }
...@@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) ...@@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
nl.data_pool = ci->i_layout.pool_id; nl.data_pool = ci->i_layout.pool_id;
/* this is obsolete, and always -1 */ /* this is obsolete, and always -1 */
nl.preferred_osd = le64_to_cpu(-1); nl.preferred_osd = -1;
err = __validate_layout(mdsc, &nl); err = __validate_layout(mdsc, &nl);
if (err) if (err)
......
...@@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref) ...@@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref)
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
iput(req->r_inode); iput(req->r_inode);
} }
if (req->r_locked_dir) if (req->r_parent)
ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
iput(req->r_target_inode); iput(req->r_target_inode);
if (req->r_dentry) if (req->r_dentry)
dput(req->r_dentry); dput(req->r_dentry);
...@@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, ...@@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
{ {
dout("__unregister_request %p tid %lld\n", req, req->r_tid); dout("__unregister_request %p tid %lld\n", req, req->r_tid);
/* Never leave an unregistered request on an unsafe list! */
list_del_init(&req->r_unsafe_item);
if (req->r_tid == mdsc->oldest_tid) { if (req->r_tid == mdsc->oldest_tid) {
struct rb_node *p = rb_next(&req->r_node); struct rb_node *p = rb_next(&req->r_node);
mdsc->oldest_tid = 0; mdsc->oldest_tid = 0;
...@@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc, ...@@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
erase_request(&mdsc->request_tree, req); erase_request(&mdsc->request_tree, req);
if (req->r_unsafe_dir && req->r_got_unsafe) { if (req->r_unsafe_dir &&
test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock); spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item); list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock); spin_unlock(&ci->i_unsafe_lock);
} }
if (req->r_target_inode && req->r_got_unsafe) { if (req->r_target_inode &&
test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
spin_lock(&ci->i_unsafe_lock); spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_target_item); list_del_init(&req->r_unsafe_target_item);
...@@ -667,6 +672,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc, ...@@ -667,6 +672,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
} }
/*
* Walk back up the dentry tree until we hit a dentry representing a
* non-snapshot inode. We do this using the rcu_read_lock (which must be held
* when calling this) to ensure that the objects won't disappear while we're
* working with them. Once we hit a candidate dentry, we attempt to take a
* reference to it, and return that as the result.
*/
static struct inode *get_nonsnap_parent(struct dentry *dentry)
{
struct inode *inode = NULL;
while (dentry && !IS_ROOT(dentry)) {
inode = d_inode_rcu(dentry);
if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
break;
dentry = dentry->d_parent;
}
if (inode)
inode = igrab(inode);
return inode;
}
/* /*
* Choose mds to send request to next. If there is a hint set in the * Choose mds to send request to next. If there is a hint set in the
* request (e.g., due to a prior forward hint from the mds), use that. * request (e.g., due to a prior forward hint from the mds), use that.
...@@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc, ...@@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
* *
* Called under mdsc->mutex. * Called under mdsc->mutex.
*/ */
static struct dentry *get_nonsnap_parent(struct dentry *dentry)
{
/*
* we don't need to worry about protecting the d_parent access
* here because we never renaming inside the snapped namespace
* except to resplice to another snapdir, and either the old or new
* result is a valid result.
*/
while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
dentry = dentry->d_parent;
return dentry;
}
static int __choose_mds(struct ceph_mds_client *mdsc, static int __choose_mds(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req) struct ceph_mds_request *req)
{ {
...@@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, ...@@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
int mode = req->r_direct_mode; int mode = req->r_direct_mode;
int mds = -1; int mds = -1;
u32 hash = req->r_direct_hash; u32 hash = req->r_direct_hash;
bool is_hash = req->r_direct_is_hash; bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
/* /*
* is there a specific mds we should try? ignore hint if we have * is there a specific mds we should try? ignore hint if we have
...@@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc, ...@@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
inode = NULL; inode = NULL;
if (req->r_inode) { if (req->r_inode) {
inode = req->r_inode; inode = req->r_inode;
ihold(inode);
} else if (req->r_dentry) { } else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */ /* ignore race with rename; old or new d_parent is okay */
struct dentry *parent = req->r_dentry->d_parent; struct dentry *parent;
struct inode *dir = d_inode(parent); struct inode *dir;
rcu_read_lock();
parent = req->r_dentry->d_parent;
dir = req->r_parent ? : d_inode_rcu(parent);
if (dir->i_sb != mdsc->fsc->sb) { if (!dir || dir->i_sb != mdsc->fsc->sb) {
/* not this fs! */ /* not this fs or parent went negative */
inode = d_inode(req->r_dentry); inode = d_inode(req->r_dentry);
if (inode)
ihold(inode);
} else if (ceph_snap(dir) != CEPH_NOSNAP) { } else if (ceph_snap(dir) != CEPH_NOSNAP) {
/* direct snapped/virtual snapdir requests /* direct snapped/virtual snapdir requests
* based on parent dir inode */ * based on parent dir inode */
struct dentry *dn = get_nonsnap_parent(parent); inode = get_nonsnap_parent(parent);
inode = d_inode(dn);
dout("__choose_mds using nonsnap parent %p\n", inode); dout("__choose_mds using nonsnap parent %p\n", inode);
} else { } else {
/* dentry target */ /* dentry target */
inode = d_inode(req->r_dentry); inode = d_inode(req->r_dentry);
if (!inode || mode == USE_AUTH_MDS) { if (!inode || mode == USE_AUTH_MDS) {
/* dir + name */ /* dir + name */
inode = dir; inode = igrab(dir);
hash = ceph_dentry_hash(dir, req->r_dentry); hash = ceph_dentry_hash(dir, req->r_dentry);
is_hash = true; is_hash = true;
} else {
ihold(inode);
} }
} }
rcu_read_unlock();
} }
dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
...@@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, ...@@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
(int)r, frag.ndist); (int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) CEPH_MDS_STATE_ACTIVE)
return mds; goto out;
} }
/* since this file/dir wasn't known to be /* since this file/dir wasn't known to be
...@@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, ...@@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
inode, ceph_vinop(inode), frag.frag, mds); inode, ceph_vinop(inode), frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) CEPH_MDS_STATE_ACTIVE)
return mds; goto out;
} }
} }
} }
...@@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, ...@@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
if (!cap) { if (!cap) {
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
iput(inode);
goto random; goto random;
} }
mds = cap->session->s_mds; mds = cap->session->s_mds;
...@@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, ...@@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
inode, ceph_vinop(inode), mds, inode, ceph_vinop(inode), mds,
cap == ci->i_auth_cap ? "auth " : "", cap); cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
out:
iput(inode);
return mds; return mds;
random: random:
...@@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, ...@@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
while (!list_empty(&session->s_unsafe)) { while (!list_empty(&session->s_unsafe)) {
req = list_first_entry(&session->s_unsafe, req = list_first_entry(&session->s_unsafe,
struct ceph_mds_request, r_unsafe_item); struct ceph_mds_request, r_unsafe_item);
list_del_init(&req->r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n", pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid); req->r_tid);
__unregister_request(mdsc, req); __unregister_request(mdsc, req);
...@@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
if (ci->i_wrbuffer_ref > 0 && if (ci->i_wrbuffer_ref > 0 &&
ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true; invalidate = true;
while (!list_empty(&ci->i_cap_flush_list)) { while (!list_empty(&ci->i_cap_flush_list)) {
...@@ -1775,18 +1800,23 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, ...@@ -1775,18 +1800,23 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
return path; return path;
} }
static int build_dentry_path(struct dentry *dentry, static int build_dentry_path(struct dentry *dentry, struct inode *dir,
const char **ppath, int *ppathlen, u64 *pino, const char **ppath, int *ppathlen, u64 *pino,
int *pfreepath) int *pfreepath)
{ {
char *path; char *path;
if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { rcu_read_lock();
*pino = ceph_ino(d_inode(dentry->d_parent)); if (!dir)
dir = d_inode_rcu(dentry->d_parent);
if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
*pino = ceph_ino(dir);
rcu_read_unlock();
*ppath = dentry->d_name.name; *ppath = dentry->d_name.name;
*ppathlen = dentry->d_name.len; *ppathlen = dentry->d_name.len;
return 0; return 0;
} }
rcu_read_unlock();
path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
if (IS_ERR(path)) if (IS_ERR(path))
return PTR_ERR(path); return PTR_ERR(path);
...@@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode, ...@@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode,
* an explicit ino+path. * an explicit ino+path.
*/ */
static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
const char *rpath, u64 rino, struct inode *rdiri, const char *rpath,
const char **ppath, int *pathlen, u64 rino, const char **ppath, int *pathlen,
u64 *ino, int *freepath) u64 *ino, int *freepath)
{ {
int r = 0; int r = 0;
...@@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, ...@@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode)); ceph_snap(rinode));
} else if (rdentry) { } else if (rdentry) {
r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
freepath);
dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
*ppath); *ppath);
} else if (rpath || rino) { } else if (rpath || rino) {
...@@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ...@@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
int ret; int ret;
ret = set_request_path_attr(req->r_inode, req->r_dentry, ret = set_request_path_attr(req->r_inode, req->r_dentry,
req->r_path1, req->r_ino1.ino, req->r_parent, req->r_path1, req->r_ino1.ino,
&path1, &pathlen1, &ino1, &freepath1); &path1, &pathlen1, &ino1, &freepath1);
if (ret < 0) { if (ret < 0) {
msg = ERR_PTR(ret); msg = ERR_PTR(ret);
...@@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ...@@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
} }
ret = set_request_path_attr(NULL, req->r_old_dentry, ret = set_request_path_attr(NULL, req->r_old_dentry,
req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino, req->r_path2, req->r_ino2.ino,
&path2, &pathlen2, &ino2, &freepath2); &path2, &pathlen2, &ino2, &freepath2);
if (ret < 0) { if (ret < 0) {
...@@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ...@@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
mds, req->r_inode_drop, req->r_inode_unless, 0); mds, req->r_inode_drop, req->r_inode_unless, 0);
if (req->r_dentry_drop) if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry, releases += ceph_encode_dentry_release(&p, req->r_dentry,
mds, req->r_dentry_drop, req->r_dentry_unless); req->r_parent, mds, req->r_dentry_drop,
req->r_dentry_unless);
if (req->r_old_dentry_drop) if (req->r_old_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_old_dentry, releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
mds, req->r_old_dentry_drop, req->r_old_dentry_unless); req->r_old_dentry_dir, mds,
req->r_old_dentry_drop,
req->r_old_dentry_unless);
if (req->r_old_inode_drop) if (req->r_old_inode_drop)
releases += ceph_encode_inode_release(&p, releases += ceph_encode_inode_release(&p,
d_inode(req->r_old_dentry), d_inode(req->r_old_dentry),
...@@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ...@@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
if (req->r_got_unsafe) { if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
void *p; void *p;
/* /*
* Replay. Do not regenerate message (and rebuild * Replay. Do not regenerate message (and rebuild
...@@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ...@@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
rhead = msg->front.iov_base; rhead = msg->front.iov_base;
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (req->r_got_unsafe) if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY; flags |= CEPH_MDS_FLAG_REPLAY;
if (req->r_locked_dir) if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY; flags |= CEPH_MDS_FLAG_WANT_DENTRY;
rhead->flags = cpu_to_le32(flags); rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd; rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1; rhead->num_retry = req->r_attempts - 1;
rhead->ino = 0; rhead->ino = 0;
dout(" r_locked_dir = %p\n", req->r_locked_dir); dout(" r_parent = %p\n", req->r_parent);
return 0; return 0;
} }
...@@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc, ...@@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
int mds = -1; int mds = -1;
int err = 0; int err = 0;
if (req->r_err || req->r_got_result) { if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
if (req->r_aborted) if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
__unregister_request(mdsc, req); __unregister_request(mdsc, req);
goto out; goto out;
} }
...@@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc, ...@@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc,
err = -EIO; err = -EIO;
goto finish; goto finish;
} }
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout("do_request forced umount\n"); dout("do_request forced umount\n");
err = -EIO; err = -EIO;
goto finish; goto finish;
} }
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
if (mdsc->mdsmap_err) { if (mdsc->mdsmap_err) {
err = mdsc->mdsmap_err; err = mdsc->mdsmap_err;
dout("do_request mdsmap err %d\n", err); dout("do_request mdsmap err %d\n", err);
...@@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) ...@@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
while (p) { while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node); req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p); p = rb_next(p);
if (req->r_got_unsafe) if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
continue; continue;
if (req->r_attempts > 0) if (req->r_attempts > 0)
continue; /* only new requests */ continue; /* only new requests */
...@@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ...@@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
dout("do_request on %p\n", req); dout("do_request on %p\n", req);
/* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode) if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_locked_dir) if (req->r_parent)
ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
if (req->r_old_dentry_dir) if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN); CEPH_CAP_PIN);
...@@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ...@@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
/* only abort if we didn't race with a real reply */ /* only abort if we didn't race with a real reply */
if (req->r_got_result) { if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
err = le32_to_cpu(req->r_reply_info.head->result); err = le32_to_cpu(req->r_reply_info.head->result);
} else if (err < 0) { } else if (err < 0) {
dout("aborted request %lld with %d\n", req->r_tid, err); dout("aborted request %lld with %d\n", req->r_tid, err);
...@@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ...@@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
*/ */
mutex_lock(&req->r_fill_mutex); mutex_lock(&req->r_fill_mutex);
req->r_err = err; req->r_err = err;
req->r_aborted = true; set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex); mutex_unlock(&req->r_fill_mutex);
if (req->r_locked_dir && if (req->r_parent &&
(req->r_op & CEPH_MDS_OP_WRITE)) (req->r_op & CEPH_MDS_OP_WRITE))
ceph_invalidate_dir_request(req); ceph_invalidate_dir_request(req);
} else { } else {
...@@ -2323,7 +2358,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ...@@ -2323,7 +2358,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
*/ */
void ceph_invalidate_dir_request(struct ceph_mds_request *req) void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{ {
struct inode *inode = req->r_locked_dir; struct inode *inode = req->r_parent;
dout("invalidate_dir_request %p (complete, lease(s))\n", inode); dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
...@@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} }
/* dup? */ /* dup? */
if ((req->r_got_unsafe && !head->safe) || if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
(req->r_got_safe && head->safe)) { (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
pr_warn("got a dup %s reply on %llu from mds%d\n", pr_warn("got a dup %s reply on %llu from mds%d\n",
head->safe ? "safe" : "unsafe", tid, mds); head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
goto out; goto out;
} }
if (req->r_got_safe) { if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
pr_warn("got unsafe after safe on %llu from mds%d\n", pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds); tid, mds);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
...@@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (head->safe) { if (head->safe) {
req->r_got_safe = true; set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req); __unregister_request(mdsc, req);
if (req->r_got_unsafe) { if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
/* /*
* We already handled the unsafe response, now do the * We already handled the unsafe response, now do the
* cleanup. No need to examine the response; the MDS * cleanup. No need to examine the response; the MDS
...@@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
* useful we could do with a revised return value. * useful we could do with a revised return value.
*/ */
dout("got safe reply %llu, mds%d\n", tid, mds); dout("got safe reply %llu, mds%d\n", tid, mds);
list_del_init(&req->r_unsafe_item);
/* last unsafe request during umount? */ /* last unsafe request during umount? */
if (mdsc->stopping && !__get_oldest_req(mdsc)) if (mdsc->stopping && !__get_oldest_req(mdsc))
...@@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
goto out; goto out;
} }
} else { } else {
req->r_got_unsafe = true; set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
if (req->r_unsafe_dir) { if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = struct ceph_inode_info *ci =
...@@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* insert trace into our cache */ /* insert trace into our cache */
mutex_lock(&req->r_fill_mutex); mutex_lock(&req->r_fill_mutex);
current->journal_info = req; current->journal_info = req;
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); err = ceph_fill_trace(mdsc->fsc->sb, req);
if (err == 0) { if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP)) req->r_op == CEPH_MDS_OP_LSSNAP))
...@@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (realm) if (realm)
ceph_put_snap_realm(mdsc, realm); ceph_put_snap_realm(mdsc, realm);
if (err == 0 && req->r_got_unsafe && req->r_target_inode) { if (err == 0 && req->r_target_inode &&
test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
spin_lock(&ci->i_unsafe_lock); spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
...@@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} }
out_err: out_err:
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
if (!req->r_aborted) { if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
if (err) { if (err) {
req->r_err = err; req->r_err = err;
} else { } else {
req->r_reply = ceph_msg_get(msg); req->r_reply = ceph_msg_get(msg);
req->r_got_result = true; set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
} }
} else { } else {
dout("reply arrived after request %lld was aborted\n", tid); dout("reply arrived after request %lld was aborted\n", tid);
...@@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, ...@@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
goto out; /* dup reply? */ goto out; /* dup reply? */
} }
if (req->r_aborted) { if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid); dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req); __unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) { } else if (fwd_seq <= req->r_num_fwd) {
...@@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, ...@@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
/* resend. forward race not possible; mds would drop */ /* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err); BUG_ON(req->r_err);
BUG_ON(req->r_got_result); BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
req->r_attempts = 0; req->r_attempts = 0;
req->r_num_fwd = fwd_seq; req->r_num_fwd = fwd_seq;
req->r_resend_mds = next_mds; req->r_resend_mds = next_mds;
...@@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, ...@@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
while (p) { while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node); req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p); p = rb_next(p);
if (req->r_got_unsafe) if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
continue; continue;
if (req->r_attempts == 0) if (req->r_attempts == 0)
continue; /* only old requests */ continue; /* only old requests */
...@@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ...@@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{ {
u64 want_tid, want_flush; u64 want_tid, want_flush;
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return; return;
dout("sync\n"); dout("sync\n");
...@@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ...@@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
*/ */
static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
{ {
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return true; return true;
return atomic_read(&mdsc->num_sessions) <= skipped; return atomic_read(&mdsc->num_sessions) <= skipped;
} }
......
...@@ -202,9 +202,18 @@ struct ceph_mds_request { ...@@ -202,9 +202,18 @@ struct ceph_mds_request {
char *r_path1, *r_path2; char *r_path1, *r_path2;
struct ceph_vino r_ino1, r_ino2; struct ceph_vino r_ino1, r_ino2;
struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ struct inode *r_parent; /* parent dir inode */
struct inode *r_target_inode; /* resulting inode */ struct inode *r_target_inode; /* resulting inode */
#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */
#define CEPH_MDS_R_ABORTED (2) /* call was aborted */
#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */
#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */
#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
unsigned long r_req_flags;
struct mutex r_fill_mutex; struct mutex r_fill_mutex;
union ceph_mds_request_args r_args; union ceph_mds_request_args r_args;
...@@ -216,7 +225,6 @@ struct ceph_mds_request { ...@@ -216,7 +225,6 @@ struct ceph_mds_request {
/* for choosing which mds to send this request to */ /* for choosing which mds to send this request to */
int r_direct_mode; int r_direct_mode;
u32 r_direct_hash; /* choose dir frag based on this dentry hash */ u32 r_direct_hash; /* choose dir frag based on this dentry hash */
bool r_direct_is_hash; /* true if r_direct_hash is valid */
/* data payload is used for xattr ops */ /* data payload is used for xattr ops */
struct ceph_pagelist *r_pagelist; struct ceph_pagelist *r_pagelist;
...@@ -234,7 +242,6 @@ struct ceph_mds_request { ...@@ -234,7 +242,6 @@ struct ceph_mds_request {
struct ceph_mds_reply_info_parsed r_reply_info; struct ceph_mds_reply_info_parsed r_reply_info;
struct page *r_locked_page; struct page *r_locked_page;
int r_err; int r_err;
bool r_aborted;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */ unsigned long r_started; /* start time to measure timeout against */
...@@ -262,9 +269,7 @@ struct ceph_mds_request { ...@@ -262,9 +269,7 @@ struct ceph_mds_request {
ceph_mds_request_callback_t r_callback; ceph_mds_request_callback_t r_callback;
ceph_mds_request_wait_callback_t r_wait_for_completion; ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */ struct list_head r_unsafe_item; /* per-session unsafe list item */
bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate;
long long r_dir_release_cnt; long long r_dir_release_cnt;
long long r_dir_ordered_cnt; long long r_dir_ordered_cnt;
int r_readdir_cache_idx; int r_readdir_cache_idx;
......
...@@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = { ...@@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = {
.destroy_inode = ceph_destroy_inode, .destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode, .write_inode = ceph_write_inode,
.drop_inode = ceph_drop_inode, .drop_inode = ceph_drop_inode,
.evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs, .sync_fs = ceph_sync_fs,
.put_super = ceph_put_super, .put_super = ceph_put_super,
.show_options = ceph_show_options, .show_options = ceph_show_options,
...@@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb, ...@@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb,
fsc->backing_dev_info.ra_pages = fsc->backing_dev_info.ra_pages =
VM_MAX_READAHEAD * 1024 / PAGE_SIZE; VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
fsc->mount_options->rsize >= PAGE_SIZE)
fsc->backing_dev_info.io_pages =
(fsc->mount_options->rsize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else if (fsc->mount_options->rsize == 0)
fsc->backing_dev_info.io_pages = ULONG_MAX;
err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq)); atomic_long_inc_return(&bdi_seq));
if (!err) if (!err)
......
...@@ -45,8 +45,8 @@ ...@@ -45,8 +45,8 @@
#define ceph_test_mount_opt(fsc, opt) \ #define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
#define CEPH_RSIZE_DEFAULT 0 /* max read size */ #define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */
#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap" #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
...@@ -343,7 +343,6 @@ struct ceph_inode_info { ...@@ -343,7 +343,6 @@ struct ceph_inode_info {
u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock; spinlock_t i_unsafe_lock;
...@@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) ...@@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
} }
/* what the mds thinks we want */ /* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
...@@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops; ...@@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb); extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode); extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode); extern int ceph_drop_inode(struct inode *inode);
extern void ceph_evict_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb, extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino); struct ceph_vino vino);
...@@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, ...@@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec *ctime, u64 time_warp_seq, struct timespec *ctime,
struct timespec *mtime, struct timespec *atime); struct timespec *mtime, struct timespec *atime);
extern int ceph_fill_trace(struct super_block *sb, extern int ceph_fill_trace(struct super_block *sb,
struct ceph_mds_request *req, struct ceph_mds_request *req);
struct ceph_mds_session *session);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session); struct ceph_mds_session *session);
...@@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); ...@@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_encode_inode_release(void **p, struct inode *inode, extern int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force); int mds, int drop, int unless, int force);
extern int ceph_encode_dentry_release(void **p, struct dentry *dn, extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
struct inode *dir,
int mds, int drop, int unless); int mds, int drop, int unless);
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
...@@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, ...@@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
extern int ceph_release(struct inode *inode, struct file *filp); extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len); char *data, size_t len);
extern void ceph_sync_write_wait(struct inode *inode);
/* dir.c */ /* dir.c */
extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops; extern const struct file_operations ceph_snapdir_fops;
......
...@@ -22,7 +22,6 @@ struct ceph_osd_client; ...@@ -22,7 +22,6 @@ struct ceph_osd_client;
* completion callback for async writepages * completion callback for async writepages
*/ */
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
#define CEPH_HOMELESS_OSD -1 #define CEPH_HOMELESS_OSD -1
...@@ -170,15 +169,12 @@ struct ceph_osd_request { ...@@ -170,15 +169,12 @@ struct ceph_osd_request {
unsigned int r_num_ops; unsigned int r_num_ops;
int r_result; int r_result;
bool r_got_reply;
struct ceph_osd_client *r_osdc; struct ceph_osd_client *r_osdc;
struct kref r_kref; struct kref r_kref;
bool r_mempool; bool r_mempool;
struct completion r_completion; struct completion r_completion; /* private to osd_client.c */
struct completion r_done_completion; /* fsync waiter */
ceph_osdc_callback_t r_callback; ceph_osdc_callback_t r_callback;
ceph_osdc_unsafe_callback_t r_unsafe_callback;
struct list_head r_unsafe_item; struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */ struct inode *r_inode; /* for use by callbacks */
......
...@@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) ...@@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
case CEPH_POOL_TYPE_EC: case CEPH_POOL_TYPE_EC:
return false; return false;
default: default:
BUG_ON(1); BUG();
} }
} }
...@@ -81,13 +81,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest, ...@@ -81,13 +81,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest,
const struct ceph_object_locator *src); const struct ceph_object_locator *src);
void ceph_oloc_destroy(struct ceph_object_locator *oloc); void ceph_oloc_destroy(struct ceph_object_locator *oloc);
/*
* Maximum supported by kernel client object name length
*
* (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
*/
#define CEPH_MAX_OID_NAME_LEN 100
/* /*
* 51-char inline_name is long enough for all cephfs and all but one * 51-char inline_name is long enough for all cephfs and all but one
* rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
...@@ -173,8 +166,8 @@ struct ceph_osdmap { ...@@ -173,8 +166,8 @@ struct ceph_osdmap {
* the list of osds that store+replicate them. */ * the list of osds that store+replicate them. */
struct crush_map *crush; struct crush_map *crush;
struct mutex crush_scratch_mutex; struct mutex crush_workspace_mutex;
int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; void *crush_workspace;
}; };
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
......
...@@ -50,7 +50,7 @@ struct ceph_timespec { ...@@ -50,7 +50,7 @@ struct ceph_timespec {
#define CEPH_PG_LAYOUT_LINEAR 2 #define CEPH_PG_LAYOUT_LINEAR 2
#define CEPH_PG_LAYOUT_HYBRID 3 #define CEPH_PG_LAYOUT_HYBRID 3
#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ #define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */
/* /*
* placement group. * placement group.
......
...@@ -135,13 +135,6 @@ struct crush_bucket { ...@@ -135,13 +135,6 @@ struct crush_bucket {
__u32 size; /* num items */ __u32 size; /* num items */
__s32 *items; __s32 *items;
/*
* cached random permutation: used for uniform bucket and for
* the linear search fallback for the other bucket types.
*/
__u32 perm_x; /* @x for which *perm is defined */
__u32 perm_n; /* num elements of *perm that are permuted/defined */
__u32 *perm;
}; };
struct crush_bucket_uniform { struct crush_bucket_uniform {
...@@ -211,6 +204,21 @@ struct crush_map { ...@@ -211,6 +204,21 @@ struct crush_map {
* device fails. */ * device fails. */
__u8 chooseleaf_stable; __u8 chooseleaf_stable;
/*
* This value is calculated after decode or construction by
* the builder. It is exposed here (rather than having a
* 'build CRUSH working space' function) so that callers can
* reserve a static buffer, allocate space on the stack, or
* otherwise avoid calling into the heap allocator if they
* want to. The size of the working space depends on the map,
* while the size of the scratch vector passed to the mapper
* depends on the size of the desired result set.
*
* Nothing stops the caller from allocating both in one swell
* foop and passing in two points, though.
*/
size_t working_size;
#ifndef __KERNEL__ #ifndef __KERNEL__
/* /*
* version 0 (original) of straw_calc has various flaws. version 1 * version 0 (original) of straw_calc has various flaws. version 1
...@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i) ...@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
return ((i+1) << 1)-1; return ((i+1) << 1)-1;
} }
/*
* These data structures are private to the CRUSH implementation. They
* are exposed in this header file because builder needs their
* definitions to calculate the total working size.
*
* Moving this out of the crush map allow us to treat the CRUSH map as
* immutable within the mapper and removes the requirement for a CRUSH
* map lock.
*/
struct crush_work_bucket {
__u32 perm_x; /* @x for which *perm is defined */
__u32 perm_n; /* num elements of *perm that are permuted/defined */
__u32 *perm; /* Permutation of the bucket's items */
};
struct crush_work {
struct crush_work_bucket **work; /* Per-bucket working store */
};
#endif #endif
...@@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map, ...@@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map,
int ruleno, int ruleno,
int x, int *result, int result_max, int x, int *result, int result_max,
const __u32 *weights, int weight_max, const __u32 *weights, int weight_max,
int *scratch); void *cwin);
/*
* Returns the exact amount of workspace that will need to be used
* for a given combination of crush_map and result_max. The caller can
* then allocate this much on its own, either on the stack, in a
* per-thread long-lived buffer, or however it likes.
*/
static inline size_t crush_work_size(const struct crush_map *map,
int result_max)
{
return map->working_size + result_max * 3 * sizeof(__u32);
}
void crush_init_workspace(const struct crush_map *map, void *v);
#endif #endif
...@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc, ...@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
__func__, lock_name, type, cookie, tag, desc, flags); __func__, lock_name, type, cookie, tag, desc, flags);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE, lock_op_page,
lock_op_page, lock_op_buf_size, NULL, NULL); lock_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret); dout("%s: status %d\n", __func__, ret);
__free_page(lock_op_page); __free_page(lock_op_page);
...@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc, ...@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE, unlock_op_page,
unlock_op_page, unlock_op_buf_size, NULL, NULL); unlock_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret); dout("%s: status %d\n", __func__, ret);
__free_page(unlock_op_page); __free_page(unlock_op_page);
...@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, ...@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
cookie, ENTITY_NAME(*locker)); cookie, ENTITY_NAME(*locker));
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE, break_op_page,
break_op_page, break_op_buf_size, NULL, NULL); break_op_buf_size, NULL, NULL);
dout("%s: status %d\n", __func__, ret); dout("%s: status %d\n", __func__, ret);
__free_page(break_op_page); __free_page(break_op_page);
...@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc, ...@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
int get_info_op_buf_size; int get_info_op_buf_size;
int name_len = strlen(lock_name); int name_len = strlen(lock_name);
struct page *get_info_op_page, *reply_page; struct page *get_info_op_page, *reply_page;
size_t reply_len; size_t reply_len = PAGE_SIZE;
void *p, *end; void *p, *end;
int ret; int ret;
......
...@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) ...@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{ {
kfree(b->h.perm);
kfree(b->h.items); kfree(b->h.items);
kfree(b); kfree(b);
} }
...@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) ...@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
{ {
kfree(b->item_weights); kfree(b->item_weights);
kfree(b->sum_weights); kfree(b->sum_weights);
kfree(b->h.perm);
kfree(b->h.items); kfree(b->h.items);
kfree(b); kfree(b);
} }
void crush_destroy_bucket_tree(struct crush_bucket_tree *b) void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{ {
kfree(b->h.perm);
kfree(b->h.items); kfree(b->h.items);
kfree(b->node_weights); kfree(b->node_weights);
kfree(b); kfree(b);
...@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) ...@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
{ {
kfree(b->straws); kfree(b->straws);
kfree(b->item_weights); kfree(b->item_weights);
kfree(b->h.perm);
kfree(b->h.items); kfree(b->h.items);
kfree(b); kfree(b);
} }
...@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) ...@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
{ {
kfree(b->item_weights); kfree(b->item_weights);
kfree(b->h.perm);
kfree(b->h.items); kfree(b->h.items);
kfree(b); kfree(b);
} }
......
...@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size ...@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
return -1; return -1;
} }
/* /*
* bucket choose methods * bucket choose methods
* *
...@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size ...@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
* Since this is expensive, we optimize for the r=0 case, which * Since this is expensive, we optimize for the r=0 case, which
* captures the vast majority of calls. * captures the vast majority of calls.
*/ */
static int bucket_perm_choose(struct crush_bucket *bucket, static int bucket_perm_choose(const struct crush_bucket *bucket,
struct crush_work_bucket *work,
int x, int r) int x, int r)
{ {
unsigned int pr = r % bucket->size; unsigned int pr = r % bucket->size;
unsigned int i, s; unsigned int i, s;
/* start a new permutation if @x has changed */ /* start a new permutation if @x has changed */
if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { if (work->perm_x != (__u32)x || work->perm_n == 0) {
dprintk("bucket %d new x=%d\n", bucket->id, x); dprintk("bucket %d new x=%d\n", bucket->id, x);
bucket->perm_x = x; work->perm_x = x;
/* optimize common r=0 case */ /* optimize common r=0 case */
if (pr == 0) { if (pr == 0) {
s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
bucket->size; bucket->size;
bucket->perm[0] = s; work->perm[0] = s;
bucket->perm_n = 0xffff; /* magic value, see below */ work->perm_n = 0xffff; /* magic value, see below */
goto out; goto out;
} }
for (i = 0; i < bucket->size; i++) for (i = 0; i < bucket->size; i++)
bucket->perm[i] = i; work->perm[i] = i;
bucket->perm_n = 0; work->perm_n = 0;
} else if (bucket->perm_n == 0xffff) { } else if (work->perm_n == 0xffff) {
/* clean up after the r=0 case above */ /* clean up after the r=0 case above */
for (i = 1; i < bucket->size; i++) for (i = 1; i < bucket->size; i++)
bucket->perm[i] = i; work->perm[i] = i;
bucket->perm[bucket->perm[0]] = 0; work->perm[work->perm[0]] = 0;
bucket->perm_n = 1; work->perm_n = 1;
} }
/* calculate permutation up to pr */ /* calculate permutation up to pr */
for (i = 0; i < bucket->perm_n; i++) for (i = 0; i < work->perm_n; i++)
dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
while (bucket->perm_n <= pr) { while (work->perm_n <= pr) {
unsigned int p = bucket->perm_n; unsigned int p = work->perm_n;
/* no point in swapping the final entry */ /* no point in swapping the final entry */
if (p < bucket->size - 1) { if (p < bucket->size - 1) {
i = crush_hash32_3(bucket->hash, x, bucket->id, p) % i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
(bucket->size - p); (bucket->size - p);
if (i) { if (i) {
unsigned int t = bucket->perm[p + i]; unsigned int t = work->perm[p + i];
bucket->perm[p + i] = bucket->perm[p]; work->perm[p + i] = work->perm[p];
bucket->perm[p] = t; work->perm[p] = t;
} }
dprintk(" perm_choose swap %d with %d\n", p, p+i); dprintk(" perm_choose swap %d with %d\n", p, p+i);
} }
bucket->perm_n++; work->perm_n++;
} }
for (i = 0; i < bucket->size; i++) for (i = 0; i < bucket->size; i++)
dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); dprintk(" perm_choose %d: %d\n", i, work->perm[i]);
s = bucket->perm[pr]; s = work->perm[pr];
out: out:
dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
bucket->size, x, r, pr, s); bucket->size, x, r, pr, s);
...@@ -132,14 +132,14 @@ static int bucket_perm_choose(struct crush_bucket *bucket, ...@@ -132,14 +132,14 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
} }
/* uniform */ /* uniform */
static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
int x, int r) struct crush_work_bucket *work, int x, int r)
{ {
return bucket_perm_choose(&bucket->h, x, r); return bucket_perm_choose(&bucket->h, work, x, r);
} }
/* list */ /* list */
static int bucket_list_choose(struct crush_bucket_list *bucket, static int bucket_list_choose(const struct crush_bucket_list *bucket,
int x, int r) int x, int r)
{ {
int i; int i;
...@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, ...@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
w *= bucket->sum_weights[i]; w *= bucket->sum_weights[i];
w = w >> 16; w = w >> 16;
/*dprintk(" scaled %llx\n", w);*/ /*dprintk(" scaled %llx\n", w);*/
if (w < bucket->item_weights[i]) if (w < bucket->item_weights[i]) {
return bucket->h.items[i]; return bucket->h.items[i];
}
} }
dprintk("bad list sums for bucket %d\n", bucket->h.id); dprintk("bad list sums for bucket %d\n", bucket->h.id);
...@@ -192,7 +193,7 @@ static int terminal(int x) ...@@ -192,7 +193,7 @@ static int terminal(int x)
return x & 1; return x & 1;
} }
static int bucket_tree_choose(struct crush_bucket_tree *bucket, static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
int x, int r) int x, int r)
{ {
int n; int n;
...@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, ...@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
/* straw */ /* straw */
static int bucket_straw_choose(struct crush_bucket_straw *bucket, static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
int x, int r) int x, int r)
{ {
__u32 i; __u32 i;
...@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin) ...@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
* *
*/ */
static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
int x, int r) int x, int r)
{ {
unsigned int i, high = 0; unsigned int i, high = 0;
...@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, ...@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
high_draw = draw; high_draw = draw;
} }
} }
return bucket->h.items[high]; return bucket->h.items[high];
} }
static int crush_bucket_choose(struct crush_bucket *in, int x, int r) static int crush_bucket_choose(const struct crush_bucket *in,
struct crush_work_bucket *work,
int x, int r)
{ {
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
BUG_ON(in->size == 0); BUG_ON(in->size == 0);
switch (in->alg) { switch (in->alg) {
case CRUSH_BUCKET_UNIFORM: case CRUSH_BUCKET_UNIFORM:
return bucket_uniform_choose((struct crush_bucket_uniform *)in, return bucket_uniform_choose(
x, r); (const struct crush_bucket_uniform *)in,
work, x, r);
case CRUSH_BUCKET_LIST: case CRUSH_BUCKET_LIST:
return bucket_list_choose((struct crush_bucket_list *)in, return bucket_list_choose((const struct crush_bucket_list *)in,
x, r); x, r);
case CRUSH_BUCKET_TREE: case CRUSH_BUCKET_TREE:
return bucket_tree_choose((struct crush_bucket_tree *)in, return bucket_tree_choose((const struct crush_bucket_tree *)in,
x, r); x, r);
case CRUSH_BUCKET_STRAW: case CRUSH_BUCKET_STRAW:
return bucket_straw_choose((struct crush_bucket_straw *)in, return bucket_straw_choose(
x, r); (const struct crush_bucket_straw *)in,
x, r);
case CRUSH_BUCKET_STRAW2: case CRUSH_BUCKET_STRAW2:
return bucket_straw2_choose((struct crush_bucket_straw2 *)in, return bucket_straw2_choose(
x, r); (const struct crush_bucket_straw2 *)in,
x, r);
default: default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg); dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0]; return in->items[0];
} }
} }
/* /*
* true if device is marked "out" (failed, fully offloaded) * true if device is marked "out" (failed, fully offloaded)
* of the cluster * of the cluster
...@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map, ...@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
* @parent_r: r value passed from the parent * @parent_r: r value passed from the parent
*/ */
static int crush_choose_firstn(const struct crush_map *map, static int crush_choose_firstn(const struct crush_map *map,
struct crush_bucket *bucket, struct crush_work *work,
const struct crush_bucket *bucket,
const __u32 *weight, int weight_max, const __u32 *weight, int weight_max,
int x, int numrep, int type, int x, int numrep, int type,
int *out, int outpos, int *out, int outpos,
...@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
int rep; int rep;
unsigned int ftotal, flocal; unsigned int ftotal, flocal;
int retry_descent, retry_bucket, skip_rep; int retry_descent, retry_bucket, skip_rep;
struct crush_bucket *in = bucket; const struct crush_bucket *in = bucket;
int r; int r;
int i; int i;
int item = 0; int item = 0;
...@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
if (local_fallback_retries > 0 && if (local_fallback_retries > 0 &&
flocal >= (in->size>>1) && flocal >= (in->size>>1) &&
flocal > local_fallback_retries) flocal > local_fallback_retries)
item = bucket_perm_choose(in, x, r); item = bucket_perm_choose(
in, work->work[-1-in->id],
x, r);
else else
item = crush_bucket_choose(in, x, r); item = crush_bucket_choose(
in, work->work[-1-in->id],
x, r);
if (item >= map->max_devices) { if (item >= map->max_devices) {
dprintk(" bad item %d\n", item); dprintk(" bad item %d\n", item);
skip_rep = 1; skip_rep = 1;
...@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
sub_r = r >> (vary_r-1); sub_r = r >> (vary_r-1);
else else
sub_r = 0; sub_r = 0;
if (crush_choose_firstn(map, if (crush_choose_firstn(
map->buckets[-1-item], map,
weight, weight_max, work,
x, stable ? 1 : outpos+1, 0, map->buckets[-1-item],
out2, outpos, count, weight, weight_max,
recurse_tries, 0, x, stable ? 1 : outpos+1, 0,
local_retries, out2, outpos, count,
local_fallback_retries, recurse_tries, 0,
0, local_retries,
vary_r, local_fallback_retries,
stable, 0,
NULL, vary_r,
sub_r) <= outpos) stable,
NULL,
sub_r) <= outpos)
/* didn't get leaf */ /* didn't get leaf */
reject = 1; reject = 1;
} else { } else {
...@@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
} }
} }
if (!reject) { if (!reject && !collide) {
/* out? */ /* out? */
if (itemtype == 0) if (itemtype == 0)
reject = is_out(map, weight, reject = is_out(map, weight,
weight_max, weight_max,
item, x); item, x);
else
reject = 0;
} }
reject: reject:
...@@ -600,7 +611,8 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -600,7 +611,8 @@ static int crush_choose_firstn(const struct crush_map *map,
* *
*/ */
static void crush_choose_indep(const struct crush_map *map, static void crush_choose_indep(const struct crush_map *map,
struct crush_bucket *bucket, struct crush_work *work,
const struct crush_bucket *bucket,
const __u32 *weight, int weight_max, const __u32 *weight, int weight_max,
int x, int left, int numrep, int type, int x, int left, int numrep, int type,
int *out, int outpos, int *out, int outpos,
...@@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
int *out2, int *out2,
int parent_r) int parent_r)
{ {
struct crush_bucket *in = bucket; const struct crush_bucket *in = bucket;
int endpos = outpos + left; int endpos = outpos + left;
int rep; int rep;
unsigned int ftotal; unsigned int ftotal;
...@@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
break; break;
} }
item = crush_bucket_choose(in, x, r); item = crush_bucket_choose(
in, work->work[-1-in->id],
x, r);
if (item >= map->max_devices) { if (item >= map->max_devices) {
dprintk(" bad item %d\n", item); dprintk(" bad item %d\n", item);
out[rep] = CRUSH_ITEM_NONE; out[rep] = CRUSH_ITEM_NONE;
...@@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
if (recurse_to_leaf) { if (recurse_to_leaf) {
if (item < 0) { if (item < 0) {
crush_choose_indep(map, crush_choose_indep(
map->buckets[-1-item], map,
weight, weight_max, work,
x, 1, numrep, 0, map->buckets[-1-item],
out2, rep, weight, weight_max,
recurse_tries, 0, x, 1, numrep, 0,
0, NULL, r); out2, rep,
recurse_tries, 0,
0, NULL, r);
if (out2[rep] == CRUSH_ITEM_NONE) { if (out2[rep] == CRUSH_ITEM_NONE) {
/* placed nothing; no leaf */ /* placed nothing; no leaf */
break; break;
...@@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
#endif #endif
} }
/*
* This takes a chunk of memory and sets it up to be a shiny new
* working area for a CRUSH placement computation. It must be called
* on any newly allocated memory before passing it in to
* crush_do_rule. It may be used repeatedly after that, so long as the
* map has not changed. If the map /has/ changed, you must make sure
* the working size is no smaller than what was allocated and re-run
* crush_init_workspace.
*
* If you do retain the working space between calls to crush, make it
* thread-local.
*/
void crush_init_workspace(const struct crush_map *map, void *v)
{
struct crush_work *w = v;
__s32 b;
/*
* We work by moving through the available space and setting
* values and pointers as we go.
*
* It's a bit like Forth's use of the 'allot' word since we
* set the pointer first and then reserve the space for it to
* point to by incrementing the point.
*/
v += sizeof(struct crush_work *);
w->work = v;
v += map->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < map->max_buckets; ++b) {
if (!map->buckets[b])
continue;
w->work[b] = v;
switch (map->buckets[b]->alg) {
default:
v += sizeof(struct crush_work_bucket);
break;
}
w->work[b]->perm_x = 0;
w->work[b]->perm_n = 0;
w->work[b]->perm = v;
v += map->buckets[b]->size * sizeof(__u32);
}
BUG_ON(v - (void *)w != map->working_size);
}
/** /**
* crush_do_rule - calculate a mapping with the given input and rule * crush_do_rule - calculate a mapping with the given input and rule
* @map: the crush_map * @map: the crush_map
...@@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
* @result_max: maximum result size * @result_max: maximum result size
* @weight: weight vector (for map leaves) * @weight: weight vector (for map leaves)
* @weight_max: size of weight vector * @weight_max: size of weight vector
* @scratch: scratch vector for private use; must be >= 3 * result_max * @cwin: pointer to at least crush_work_size() bytes of memory
*/ */
int crush_do_rule(const struct crush_map *map, int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max, int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max, const __u32 *weight, int weight_max,
int *scratch) void *cwin)
{ {
int result_len; int result_len;
int *a = scratch; struct crush_work *cw = cwin;
int *b = scratch + result_max; int *a = cwin + map->working_size;
int *c = scratch + result_max*2; int *b = a + result_max;
int *c = b + result_max;
int *w = a;
int *o = b;
int recurse_to_leaf; int recurse_to_leaf;
int *w;
int wsize = 0; int wsize = 0;
int *o;
int osize; int osize;
int *tmp; int *tmp;
struct crush_rule *rule; const struct crush_rule *rule;
__u32 step; __u32 step;
int i, j; int i, j;
int numrep; int numrep;
...@@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map, ...@@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
rule = map->rules[ruleno]; rule = map->rules[ruleno];
result_len = 0; result_len = 0;
w = a;
o = b;
for (step = 0; step < rule->len; step++) { for (step = 0; step < rule->len; step++) {
int firstn = 0; int firstn = 0;
struct crush_rule_step *curstep = &rule->steps[step]; const struct crush_rule_step *curstep = &rule->steps[step];
switch (curstep->op) { switch (curstep->op) {
case CRUSH_RULE_TAKE: case CRUSH_RULE_TAKE:
...@@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
recurse_tries = choose_tries; recurse_tries = choose_tries;
osize += crush_choose_firstn( osize += crush_choose_firstn(
map, map,
cw,
map->buckets[bno], map->buckets[bno],
weight, weight_max, weight, weight_max,
x, numrep, x, numrep,
...@@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
numrep : (result_max-osize)); numrep : (result_max-osize));
crush_choose_indep( crush_choose_indep(
map, map,
cw,
map->buckets[bno], map->buckets[bno],
weight, weight_max, weight, weight_max,
x, out_size, numrep, x, out_size, numrep,
...@@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map, ...@@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
break; break;
} }
} }
return result_len; return result_len;
} }
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <linux/err.h> #include <linux/err.h>
#include <linux/scatterlist.h> #include <linux/scatterlist.h>
#include <linux/sched.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <crypto/aes.h> #include <crypto/aes.h>
#include <crypto/skcipher.h> #include <crypto/skcipher.h>
......
...@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req) ...@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
kref_init(&req->r_kref); kref_init(&req->r_kref);
init_completion(&req->r_completion); init_completion(&req->r_completion);
init_completion(&req->r_done_completion);
RB_CLEAR_NODE(&req->r_node); RB_CLEAR_NODE(&req->r_node);
RB_CLEAR_NODE(&req->r_mc_node); RB_CLEAR_NODE(&req->r_mc_node);
INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_unsafe_item);
...@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, ...@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
BUG_ON(length > previous); BUG_ON(length > previous);
op->extent.length = length; op->extent.length = length;
op->indata_len -= previous - length; if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
op->indata_len -= previous - length;
} }
EXPORT_SYMBOL(osd_req_op_extent_update); EXPORT_SYMBOL(osd_req_op_extent_update);
...@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) ...@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
bool need_send = false; bool need_send = false;
bool promoted = false; bool promoted = false;
WARN_ON(req->r_tid || req->r_got_reply); WARN_ON(req->r_tid);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again: again:
...@@ -1704,17 +1704,10 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) ...@@ -1704,17 +1704,10 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
static void account_request(struct ceph_osd_request *req) static void account_request(struct ceph_osd_request *req)
{ {
unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
if (req->r_flags & CEPH_OSD_FLAG_READ) { req->r_flags |= CEPH_OSD_FLAG_ONDISK;
WARN_ON(req->r_flags & mask);
req->r_flags |= CEPH_OSD_FLAG_ACK;
} else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
WARN_ON(!(req->r_flags & mask));
else
WARN_ON(1);
WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
atomic_inc(&req->r_osdc->num_requests); atomic_inc(&req->r_osdc->num_requests);
} }
...@@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req) ...@@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req)
static void __complete_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req)
{ {
if (req->r_callback) if (req->r_callback) {
dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
req->r_tid, req->r_callback, req->r_result);
req->r_callback(req); req->r_callback(req);
else }
complete_all(&req->r_completion);
} }
/* /*
* Note that this is open-coded in handle_reply(), which has to deal * This is open-coded in handle_reply().
* with ack vs commit, dup acks, etc.
*/ */
static void complete_request(struct ceph_osd_request *req, int err) static void complete_request(struct ceph_osd_request *req, int err)
{ {
...@@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err) ...@@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err)
req->r_result = err; req->r_result = err;
finish_request(req); finish_request(req);
__complete_request(req); __complete_request(req);
complete_all(&req->r_done_completion); complete_all(&req->r_completion);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
} }
...@@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req) ...@@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req)
cancel_map_check(req); cancel_map_check(req);
finish_request(req); finish_request(req);
complete_all(&req->r_done_completion); complete_all(&req->r_completion);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
} }
...@@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req) ...@@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
mutex_lock(&lreq->lock); mutex_lock(&lreq->lock);
dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
lreq->linger_id, req->r_result); lreq->linger_id, req->r_result);
WARN_ON(!__linger_registered(lreq));
linger_reg_commit_complete(lreq, req->r_result); linger_reg_commit_complete(lreq, req->r_result);
lreq->committed = true; lreq->committed = true;
...@@ -2785,31 +2777,8 @@ static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m) ...@@ -2785,31 +2777,8 @@ static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
} }
/* /*
* We are done with @req if * Handle MOSDOpReply. Set ->r_result and call the callback if it is
* - @m is a safe reply, or * specified.
* - @m is an unsafe reply and we didn't want a safe one
*/
static bool done_request(const struct ceph_osd_request *req,
const struct MOSDOpReply *m)
{
return (m->result < 0 ||
(m->flags & CEPH_OSD_FLAG_ONDISK) ||
!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
}
/*
* handle osd op reply. either call the callback if it is specified,
* or do the completion to wake up the waiting thread.
*
* ->r_unsafe_callback is set? yes no
*
* first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
* any or needed/got safe) r_done_completion r_done_completion
*
* first reply is unsafe r_unsafe_cb(true) (nothing)
*
* when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
* r_done_completion r_done_completion
*/ */
static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
{ {
...@@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) ...@@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
struct MOSDOpReply m; struct MOSDOpReply m;
u64 tid = le64_to_cpu(msg->hdr.tid); u64 tid = le64_to_cpu(msg->hdr.tid);
u32 data_len = 0; u32 data_len = 0;
bool already_acked;
int ret; int ret;
int i; int i;
...@@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) ...@@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
le32_to_cpu(msg->hdr.data_len), req->r_tid); le32_to_cpu(msg->hdr.data_len), req->r_tid);
goto fail_request; goto fail_request;
} }
dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, dout("%s req %p tid %llu result %d data_len %u\n", __func__,
req, req->r_tid, req->r_got_reply, m.result, data_len); req, req->r_tid, m.result, data_len);
already_acked = req->r_got_reply;
if (!already_acked) {
req->r_result = m.result ?: data_len;
req->r_replay_version = m.replay_version; /* struct */
req->r_got_reply = true;
} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
dout("req %p tid %llu dup ack\n", req, req->r_tid);
goto out_unlock_session;
}
if (done_request(req, &m)) {
finish_request(req);
if (req->r_linger) {
WARN_ON(req->r_unsafe_callback);
dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
__complete_request(req);
}
}
/*
* Since we only ever request ONDISK, we should only ever get
* one (type of) reply back.
*/
WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
req->r_result = m.result ?: data_len;
finish_request(req);
mutex_unlock(&osd->lock); mutex_unlock(&osd->lock);
up_read(&osdc->lock); up_read(&osdc->lock);
if (done_request(req, &m)) { __complete_request(req);
if (already_acked && req->r_unsafe_callback) { complete_all(&req->r_completion);
dout("req %p tid %llu safe-cb\n", req, req->r_tid); ceph_osdc_put_request(req);
req->r_unsafe_callback(req, false);
} else if (!req->r_linger) {
dout("req %p tid %llu cb\n", req, req->r_tid);
__complete_request(req);
}
complete_all(&req->r_done_completion);
ceph_osdc_put_request(req);
} else {
if (req->r_unsafe_callback) {
dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
req->r_unsafe_callback(req, true);
} else {
WARN_ON(1);
}
}
return; return;
fail_request: fail_request:
...@@ -3540,7 +3480,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc) ...@@ -3540,7 +3480,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
up_read(&osdc->lock); up_read(&osdc->lock);
dout("%s waiting on req %p tid %llu last_tid %llu\n", dout("%s waiting on req %p tid %llu last_tid %llu\n",
__func__, req, req->r_tid, last_tid); __func__, req, req->r_tid, last_tid);
wait_for_completion(&req->r_done_completion); wait_for_completion(&req->r_completion);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
goto again; goto again;
} }
...@@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ...@@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oid_copy(&lreq->t.base_oid, oid);
ceph_oloc_copy(&lreq->t.base_oloc, oloc); ceph_oloc_copy(&lreq->t.base_oloc, oloc);
lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; lreq->t.flags = CEPH_OSD_FLAG_WRITE;
lreq->mtime = CURRENT_TIME; lreq->mtime = CURRENT_TIME;
lreq->reg_req = alloc_linger_request(lreq); lreq->reg_req = alloc_linger_request(lreq);
...@@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ...@@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; req->r_flags = CEPH_OSD_FLAG_WRITE;
req->r_mtime = CURRENT_TIME; req->r_mtime = CURRENT_TIME;
osd_req_op_watch_init(req, 0, lreq->linger_id, osd_req_op_watch_init(req, 0, lreq->linger_id,
CEPH_OSD_WATCH_OP_UNWATCH); CEPH_OSD_WATCH_OP_UNWATCH);
...@@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map); ...@@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
* Execute an OSD class method on an object. * Execute an OSD class method on an object.
* *
* @flags: CEPH_OSD_FLAG_* * @flags: CEPH_OSD_FLAG_*
* @resp_len: out param for reply length * @resp_len: in/out param for reply length
*/ */
int ceph_osdc_call(struct ceph_osd_client *osdc, int ceph_osdc_call(struct ceph_osd_client *osdc,
struct ceph_object_id *oid, struct ceph_object_id *oid,
...@@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, ...@@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
struct ceph_osd_request *req; struct ceph_osd_request *req;
int ret; int ret;
if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
return -E2BIG;
req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
if (!req) if (!req)
return -ENOMEM; return -ENOMEM;
...@@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, ...@@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
0, false, false); 0, false, false);
if (resp_page) if (resp_page)
osd_req_op_cls_response_data_pages(req, 0, &resp_page, osd_req_op_cls_response_data_pages(req, 0, &resp_page,
PAGE_SIZE, 0, false, false); *resp_len, 0, false, false);
ceph_osdc_start_request(osdc, req, false); ceph_osdc_start_request(osdc, req, false);
ret = ceph_osdc_wait_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req);
...@@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, ...@@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
int page_align = off & ~PAGE_MASK; int page_align = off & ~PAGE_MASK;
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq, truncate_size, snapc, truncate_seq, truncate_size,
true); true);
if (IS_ERR(req)) if (IS_ERR(req))
......
...@@ -153,6 +153,32 @@ static int skip_name_map(void **p, void *end) ...@@ -153,6 +153,32 @@ static int skip_name_map(void **p, void *end)
return -EINVAL; return -EINVAL;
} }
static void crush_finalize(struct crush_map *c)
{
__s32 b;
/* Space for the array of pointers to per-bucket workspace */
c->working_size = sizeof(struct crush_work) +
c->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < c->max_buckets; b++) {
if (!c->buckets[b])
continue;
switch (c->buckets[b]->alg) {
default:
/*
* The base case, permutation variables and
* the pointer to the permutation array.
*/
c->working_size += sizeof(struct crush_work_bucket);
break;
}
/* Every bucket has a permutation array. */
c->working_size += c->buckets[b]->size * sizeof(__u32);
}
}
static struct crush_map *crush_decode(void *pbyval, void *end) static struct crush_map *crush_decode(void *pbyval, void *end)
{ {
struct crush_map *c; struct crush_map *c;
...@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
if (b->items == NULL) if (b->items == NULL)
goto badmem; goto badmem;
b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
if (b->perm == NULL)
goto badmem;
b->perm_n = 0;
ceph_decode_need(p, end, b->size*sizeof(u32), bad); ceph_decode_need(p, end, b->size*sizeof(u32), bad);
for (j = 0; j < b->size; j++) for (j = 0; j < b->size; j++)
...@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
dout("crush decode tunable chooseleaf_stable = %d\n", dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable); c->chooseleaf_stable);
crush_finalize(c);
done: done:
dout("crush_decode success\n"); dout("crush_decode success\n");
return c; return c;
...@@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) ...@@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
map->pool_max = -1; map->pool_max = -1;
map->pg_temp = RB_ROOT; map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT; map->primary_temp = RB_ROOT;
mutex_init(&map->crush_scratch_mutex); mutex_init(&map->crush_workspace_mutex);
return map; return map;
} }
...@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) ...@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
kfree(map->osd_weight); kfree(map->osd_weight);
kfree(map->osd_addr); kfree(map->osd_addr);
kfree(map->osd_primary_affinity); kfree(map->osd_primary_affinity);
kfree(map->crush_workspace);
kfree(map); kfree(map);
} }
...@@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) ...@@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
return 0; return 0;
} }
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
{
void *workspace;
size_t work_size;
if (IS_ERR(crush))
return PTR_ERR(crush);
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
dout("%s work_size %zu bytes\n", __func__, work_size);
workspace = kmalloc(work_size, GFP_NOIO);
if (!workspace) {
crush_destroy(crush);
return -ENOMEM;
}
crush_init_workspace(crush, workspace);
if (map->crush)
crush_destroy(map->crush);
kfree(map->crush_workspace);
map->crush = crush;
map->crush_workspace = workspace;
return 0;
}
#define OSDMAP_WRAPPER_COMPAT_VER 7 #define OSDMAP_WRAPPER_COMPAT_VER 7
#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1
...@@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ...@@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
/* crush */ /* crush */
ceph_decode_32_safe(p, end, len, e_inval); ceph_decode_32_safe(p, end, len, e_inval);
map->crush = crush_decode(*p, min(*p + len, end)); err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
if (IS_ERR(map->crush)) { if (err)
err = PTR_ERR(map->crush);
map->crush = NULL;
goto bad; goto bad;
}
*p += len;
/* ignore the rest */ /* ignore the rest */
*p = end; *p = end;
...@@ -1375,7 +1421,6 @@ static int decode_new_up_state_weight(void **p, void *end, ...@@ -1375,7 +1421,6 @@ static int decode_new_up_state_weight(void **p, void *end,
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map) struct ceph_osdmap *map)
{ {
struct crush_map *newcrush = NULL;
struct ceph_fsid fsid; struct ceph_fsid fsid;
u32 epoch = 0; u32 epoch = 0;
struct ceph_timespec modified; struct ceph_timespec modified;
...@@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
/* new crush? */ /* new crush? */
ceph_decode_32_safe(p, end, len, e_inval); ceph_decode_32_safe(p, end, len, e_inval);
if (len > 0) { if (len > 0) {
newcrush = crush_decode(*p, min(*p+len, end)); err = osdmap_set_crush(map,
if (IS_ERR(newcrush)) { crush_decode(*p, min(*p + len, end)));
err = PTR_ERR(newcrush); if (err)
newcrush = NULL;
goto bad; goto bad;
}
*p += len; *p += len;
} }
...@@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
map->epoch++; map->epoch++;
map->modified = modified; map->modified = modified;
if (newcrush) {
if (map->crush)
crush_destroy(map->crush);
map->crush = newcrush;
newcrush = NULL;
}
/* new_pools */ /* new_pools */
err = decode_new_pools(p, end, map); err = decode_new_pools(p, end, map);
...@@ -1505,8 +1542,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -1505,8 +1542,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
print_hex_dump(KERN_DEBUG, "osdmap: ", print_hex_dump(KERN_DEBUG, "osdmap: ",
DUMP_PREFIX_OFFSET, 16, 1, DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true); start, end - start, true);
if (newcrush)
crush_destroy(newcrush);
return ERR_PTR(err); return ERR_PTR(err);
} }
...@@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, ...@@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
BUG_ON(result_max > CEPH_PG_MAX_SIZE); BUG_ON(result_max > CEPH_PG_MAX_SIZE);
mutex_lock(&map->crush_scratch_mutex); mutex_lock(&map->crush_workspace_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max, r = crush_do_rule(map->crush, ruleno, x, result, result_max,
weight, weight_max, map->crush_scratch_ary); weight, weight_max, map->crush_workspace);
mutex_unlock(&map->crush_scratch_mutex); mutex_unlock(&map->crush_workspace_mutex);
return r; return r;
} }
...@@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, ...@@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
return; return;
} }
len = do_crush(osdmap, ruleno, pps, raw->osds, if (pi->size > ARRAY_SIZE(raw->osds)) {
min_t(int, pi->size, ARRAY_SIZE(raw->osds)), pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
pi->id, pi->crush_ruleset, pi->type, pi->size,
ARRAY_SIZE(raw->osds));
return;
}
len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
osdmap->osd_weight, osdmap->max_osd); osdmap->osd_weight, osdmap->max_osd);
if (len < 0) { if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
......
...@@ -18,8 +18,6 @@ ...@@ -18,8 +18,6 @@
* 02110-1301, USA. * 02110-1301, USA.
*/ */
#include <stddef.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/ceph/libceph.h> #include <linux/ceph/libceph.h>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册