提交 6e188240 编写于 作者: L Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (59 commits)
  ceph: reuse mon subscribe message instead of allocated anew
  ceph: avoid resending queued message to monitor
  ceph: Storage class should be before const qualifier
  ceph: all allocation functions should get gfp_mask
  ceph: specify max_bytes on readdir replies
  ceph: cleanup pool op strings
  ceph: Use kzalloc
  ceph: use common helper for aborted dir request invalidation
  ceph: cope with out of order (unsafe after safe) mds reply
  ceph: save peer feature bits in connection structure
  ceph: resync headers with userland
  ceph: use ceph. prefix for virtual xattrs
  ceph: throw out dirty caps metadata, data on session teardown
  ceph: attempt mds reconnect if mds closes our session
  ceph: clean up send_mds_reconnect interface
  ceph: wait for mds OPEN reply to indicate reconnect success
  ceph: only send cap releases when mds is OPEN|HUNG
  ceph: dicard cap releases on mds restart
  ceph: make mon client statfs handling more generic
  ceph: drop src address(es) from message header [new protocol feature]
  ...
...@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, ...@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
int rc = 0; int rc = 0;
struct page **pages; struct page **pages;
struct pagevec pvec;
loff_t offset; loff_t offset;
u64 len; u64 len;
...@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, ...@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
if (rc < 0) if (rc < 0)
goto out; goto out;
/* set uptodate and add to lru in pagevec-sized chunks */
pagevec_init(&pvec, 0);
for (; !list_empty(page_list) && len > 0; for (; !list_empty(page_list) && len > 0;
rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
struct page *page = struct page *page =
...@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, ...@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
zero_user_segment(page, s, PAGE_CACHE_SIZE); zero_user_segment(page, s, PAGE_CACHE_SIZE);
} }
if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
page_cache_release(page); page_cache_release(page);
dout("readpages %p add_to_page_cache failed %p\n", dout("readpages %p add_to_page_cache failed %p\n",
inode, page); inode, page);
...@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, ...@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
flush_dcache_page(page); flush_dcache_page(page);
SetPageUptodate(page); SetPageUptodate(page);
unlock_page(page); unlock_page(page);
if (pagevec_add(&pvec, page) == 0) page_cache_release(page);
pagevec_lru_add_file(&pvec); /* add to lru */
} }
pagevec_lru_add_file(&pvec);
rc = 0; rc = 0;
out: out:
...@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req, ...@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
ceph_release_pages(req->r_pages, req->r_num_pages); ceph_release_pages(req->r_pages, req->r_num_pages);
if (req->r_pages_from_pool) if (req->r_pages_from_pool)
mempool_free(req->r_pages, mempool_free(req->r_pages,
ceph_client(inode->i_sb)->wb_pagevec_pool); ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
else else
kfree(req->r_pages); kfree(req->r_pages);
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
......
...@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac, ...@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
ret = ac->ops->build_request(ac, p + sizeof(u32), end); ret = ac->ops->build_request(ac, p + sizeof(u32), end);
if (ret < 0) { if (ret < 0) {
pr_err("error %d building request\n", ret); pr_err("error %d building auth method %s request\n", ret,
ac->ops->name);
return ret; return ret;
} }
dout(" built request %d bytes\n", ret); dout(" built request %d bytes\n", ret);
...@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ...@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
if (ac->protocol != protocol) { if (ac->protocol != protocol) {
ret = ceph_auth_init_protocol(ac, protocol); ret = ceph_auth_init_protocol(ac, protocol);
if (ret) { if (ret) {
pr_err("error %d on auth protocol %d init\n", pr_err("error %d on auth method %s init\n",
ret, protocol); ret, ac->ops->name);
goto out; goto out;
} }
} }
...@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ...@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
if (ret == -EAGAIN) { if (ret == -EAGAIN) {
return ceph_build_auth_request(ac, reply_buf, reply_len); return ceph_build_auth_request(ac, reply_buf, reply_len);
} else if (ret) { } else if (ret) {
pr_err("authentication error %d\n", ret); pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
return ret; return ret;
} }
return 0; return 0;
......
...@@ -15,6 +15,8 @@ struct ceph_auth_client; ...@@ -15,6 +15,8 @@ struct ceph_auth_client;
struct ceph_authorizer; struct ceph_authorizer;
struct ceph_auth_client_ops { struct ceph_auth_client_ops {
const char *name;
/* /*
* true if we are authenticated and can connect to * true if we are authenticated and can connect to
* services. * services.
......
...@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, ...@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
} }
static const struct ceph_auth_client_ops ceph_auth_none_ops = { static const struct ceph_auth_client_ops ceph_auth_none_ops = {
.name = "none",
.reset = reset, .reset = reset,
.destroy = destroy, .destroy = destroy,
.is_authenticated = is_authenticated, .is_authenticated = is_authenticated,
......
...@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ...@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
int ret; int ret;
char *dbuf; char *dbuf;
char *ticket_buf; char *ticket_buf;
u8 struct_v; u8 reply_struct_v;
dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
if (!dbuf) if (!dbuf)
...@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ...@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
goto out_dbuf; goto out_dbuf;
ceph_decode_need(&p, end, 1 + sizeof(u32), bad); ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
struct_v = ceph_decode_8(&p); reply_struct_v = ceph_decode_8(&p);
if (struct_v != 1) if (reply_struct_v != 1)
goto bad; goto bad;
num = ceph_decode_32(&p); num = ceph_decode_32(&p);
dout("%d tickets\n", num); dout("%d tickets\n", num);
while (num--) { while (num--) {
int type; int type;
u8 struct_v; u8 tkt_struct_v, blob_struct_v;
struct ceph_x_ticket_handler *th; struct ceph_x_ticket_handler *th;
void *dp, *dend; void *dp, *dend;
int dlen; int dlen;
...@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ...@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
type = ceph_decode_32(&p); type = ceph_decode_32(&p);
dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
struct_v = ceph_decode_8(&p); tkt_struct_v = ceph_decode_8(&p);
if (struct_v != 1) if (tkt_struct_v != 1)
goto bad; goto bad;
th = get_ticket_handler(ac, type); th = get_ticket_handler(ac, type);
...@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ...@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
dend = dbuf + dlen; dend = dbuf + dlen;
dp = dbuf; dp = dbuf;
struct_v = ceph_decode_8(&dp); tkt_struct_v = ceph_decode_8(&dp);
if (struct_v != 1) if (tkt_struct_v != 1)
goto bad; goto bad;
memcpy(&old_key, &th->session_key, sizeof(old_key)); memcpy(&old_key, &th->session_key, sizeof(old_key));
...@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ...@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
tpend = tp + dlen; tpend = tp + dlen;
dout(" ticket blob is %d bytes\n", dlen); dout(" ticket blob is %d bytes\n", dlen);
ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
struct_v = ceph_decode_8(&tp); blob_struct_v = ceph_decode_8(&tp);
new_secret_id = ceph_decode_64(&tp); new_secret_id = ceph_decode_64(&tp);
ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
if (ret) if (ret)
...@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, ...@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
static const struct ceph_auth_client_ops ceph_x_ops = { static const struct ceph_auth_client_ops ceph_x_ops = {
.name = "x",
.is_authenticated = ceph_x_is_authenticated, .is_authenticated = ceph_x_is_authenticated,
.build_request = ceph_x_build_request, .build_request = ceph_x_build_request,
.handle_reply = ceph_x_handle_reply, .handle_reply = ceph_x_handle_reply,
......
...@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap) ...@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
{ {
struct ceph_mds_session *session = cap->session; struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci; struct ceph_inode_info *ci = cap->ci;
struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_mds_client *mdsc =
&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
int removed = 0; int removed = 0;
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
...@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session, ...@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
seq, issue_seq, mseq, follows, size, max_size, seq, issue_seq, mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
if (IS_ERR(msg)) if (!msg)
return PTR_ERR(msg); return -ENOMEM;
msg->hdr.tid = cpu_to_le64(flush_tid); msg->hdr.tid = cpu_to_le64(flush_tid);
...@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) ...@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
*/ */
void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{ {
struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_mds_client *mdsc =
&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
int was = ci->i_dirty_caps; int was = ci->i_dirty_caps;
int dirty = 0; int dirty = 0;
...@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ...@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
static int __mark_caps_flushing(struct inode *inode, static int __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session) struct ceph_mds_session *session)
{ {
struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int flushing; int flushing;
...@@ -1663,7 +1665,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ...@@ -1663,7 +1665,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
unsigned *flush_tid) unsigned *flush_tid)
{ {
struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int unlock_session = session ? 0 : 1; int unlock_session = session ? 0 : 1;
int flushing = 0; int flushing = 0;
...@@ -1716,10 +1718,9 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, ...@@ -1716,10 +1718,9 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
static int caps_are_flushed(struct inode *inode, unsigned tid) static int caps_are_flushed(struct inode *inode, unsigned tid)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int dirty, i, ret = 1; int i, ret = 1;
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
dirty = __ceph_caps_dirty(ci);
for (i = 0; i < CEPH_CAP_BITS; i++) for (i = 0; i < CEPH_CAP_BITS; i++)
if ((ci->i_flushing_caps & (1 << i)) && if ((ci->i_flushing_caps & (1 << i)) &&
ci->i_cap_flush_tid[i] <= tid) { ci->i_cap_flush_tid[i] <= tid) {
...@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) ...@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
err = wait_event_interruptible(ci->i_cap_wq, err = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid)); caps_are_flushed(inode, flush_tid));
} else { } else {
struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc =
&ceph_sb_to_client(inode->i_sb)->mdsc;
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
if (__ceph_caps_dirty(ci)) if (__ceph_caps_dirty(ci))
...@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ...@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(inode->i_lock) __releases(inode->i_lock)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
unsigned seq = le32_to_cpu(m->seq); unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty); int dirty = le32_to_cpu(m->dirty);
int cleaned = 0; int cleaned = 0;
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* Ceph release version * Ceph release version
*/ */
#define CEPH_VERSION_MAJOR 0 #define CEPH_VERSION_MAJOR 0
#define CEPH_VERSION_MINOR 19 #define CEPH_VERSION_MINOR 20
#define CEPH_VERSION_PATCH 0 #define CEPH_VERSION_PATCH 0
#define _CEPH_STRINGIFY(x) #x #define _CEPH_STRINGIFY(x) #x
...@@ -36,7 +36,7 @@ ...@@ -36,7 +36,7 @@
* client-facing protocol. * client-facing protocol.
*/ */
#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ #define CEPH_OSD_PROTOCOL 8 /* cluster internal */
#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ #define CEPH_MDS_PROTOCOL 12 /* cluster internal */
#define CEPH_MON_PROTOCOL 5 /* cluster internal */ #define CEPH_MON_PROTOCOL 5 /* cluster internal */
#define CEPH_OSDC_PROTOCOL 24 /* server/client */ #define CEPH_OSDC_PROTOCOL 24 /* server/client */
#define CEPH_MDSC_PROTOCOL 32 /* server/client */ #define CEPH_MDSC_PROTOCOL 32 /* server/client */
...@@ -53,8 +53,18 @@ ...@@ -53,8 +53,18 @@
/* /*
* feature bits * feature bits
*/ */
#define CEPH_FEATURE_SUPPORTED 0 #define CEPH_FEATURE_UID 1
#define CEPH_FEATURE_REQUIRED 0 #define CEPH_FEATURE_NOSRCADDR 2
#define CEPH_FEATURE_FLOCK 4
#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
/* /*
...@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); ...@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
#define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_NONE 0x1
#define CEPH_AUTH_CEPHX 0x2 #define CEPH_AUTH_CEPHX 0x2
#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
/********************************************* /*********************************************
* message layer * message layer
...@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); ...@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
#define CEPH_MSG_CLIENT_SNAP 0x312 #define CEPH_MSG_CLIENT_SNAP 0x312
#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313
/* pool ops */
#define CEPH_MSG_POOLOP_REPLY 48
#define CEPH_MSG_POOLOP 49
/* osd */ /* osd */
#define CEPH_MSG_OSD_MAP 41 #define CEPH_MSG_OSD_MAP 41
#define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43 #define CEPH_MSG_OSD_OPREPLY 43
/* pool operations */
enum {
POOL_OP_CREATE = 0x01,
POOL_OP_DELETE = 0x02,
POOL_OP_AUID_CHANGE = 0x03,
POOL_OP_CREATE_SNAP = 0x11,
POOL_OP_DELETE_SNAP = 0x12,
POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
};
struct ceph_mon_request_header { struct ceph_mon_request_header {
__le64 have_version; __le64 have_version;
__le16 session_mon; __le16 session_mon;
...@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply { ...@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
struct ceph_statfs st; struct ceph_statfs st;
} __attribute__ ((packed)); } __attribute__ ((packed));
const char *ceph_pool_op_name(int op);
struct ceph_mon_poolop {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
__le32 pool;
__le32 op;
__le64 auid;
__le64 snapid;
__le32 name_len;
} __attribute__ ((packed));
struct ceph_mon_poolop_reply {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
__le32 reply_code;
__le32 epoch;
char has_data;
char data[0];
} __attribute__ ((packed));
struct ceph_mon_unmanaged_snap {
__le64 snapid;
} __attribute__ ((packed));
struct ceph_osd_getmap { struct ceph_osd_getmap {
struct ceph_mon_request_header monhdr; struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid; struct ceph_fsid fsid;
...@@ -308,6 +361,7 @@ union ceph_mds_request_args { ...@@ -308,6 +361,7 @@ union ceph_mds_request_args {
struct { struct {
__le32 frag; /* which dir fragment */ __le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */ __le32 max_entries; /* how many dentries to grab */
__le32 max_bytes;
} __attribute__ ((packed)) readdir; } __attribute__ ((packed)) readdir;
struct { struct {
__le32 mode; __le32 mode;
......
...@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type) ...@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
case CEPH_ENTITY_TYPE_OSD: return "osd"; case CEPH_ENTITY_TYPE_OSD: return "osd";
case CEPH_ENTITY_TYPE_MON: return "mon"; case CEPH_ENTITY_TYPE_MON: return "mon";
case CEPH_ENTITY_TYPE_CLIENT: return "client"; case CEPH_ENTITY_TYPE_CLIENT: return "client";
case CEPH_ENTITY_TYPE_ADMIN: return "admin";
case CEPH_ENTITY_TYPE_AUTH: return "auth"; case CEPH_ENTITY_TYPE_AUTH: return "auth";
default: return "unknown"; default: return "unknown";
} }
...@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op) ...@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
case CEPH_OSD_OP_RMXATTR: return "rmxattr"; case CEPH_OSD_OP_RMXATTR: return "rmxattr";
case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
case CEPH_OSD_OP_PULL: return "pull"; case CEPH_OSD_OP_PULL: return "pull";
case CEPH_OSD_OP_PUSH: return "push"; case CEPH_OSD_OP_PUSH: return "push";
...@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o) ...@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
} }
return "???"; return "???";
} }
const char *ceph_pool_op_name(int op)
{
switch (op) {
case POOL_OP_CREATE: return "create";
case POOL_OP_DELETE: return "delete";
case POOL_OP_AUID_CHANGE: return "auid change";
case POOL_OP_CREATE_SNAP: return "create snap";
case POOL_OP_DELETE_SNAP: return "delete snap";
case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
}
return "???";
}
...@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p) ...@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
static int monc_show(struct seq_file *s, void *p) static int monc_show(struct seq_file *s, void *p)
{ {
struct ceph_client *client = s->private; struct ceph_client *client = s->private;
struct ceph_mon_statfs_request *req; struct ceph_mon_generic_request *req;
struct ceph_mon_client *monc = &client->monc; struct ceph_mon_client *monc = &client->monc;
struct rb_node *rp; struct rb_node *rp;
...@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p) ...@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
if (monc->want_next_osdmap) if (monc->want_next_osdmap)
seq_printf(s, "want next osdmap\n"); seq_printf(s, "want next osdmap\n");
for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
req = rb_entry(rp, struct ceph_mon_statfs_request, node); __u16 op;
req = rb_entry(rp, struct ceph_mon_generic_request, node);
op = le16_to_cpu(req->request->hdr.type);
if (op == CEPH_MSG_STATFS)
seq_printf(s, "%lld statfs\n", req->tid); seq_printf(s, "%lld statfs\n", req->tid);
else
seq_printf(s, "%lld unknown\n", req->tid);
} }
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
......
...@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry) ...@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
return -ENOMEM; /* oh well */ return -ENOMEM; /* oh well */
spin_lock(&dentry->d_lock); spin_lock(&dentry->d_lock);
if (dentry->d_fsdata) /* lost a race */ if (dentry->d_fsdata) {
/* lost a race */
kmem_cache_free(ceph_dentry_cachep, di);
goto out_unlock; goto out_unlock;
}
di->dentry = dentry; di->dentry = dentry;
di->lease_session = NULL; di->lease_session = NULL;
dentry->d_fsdata = di; dentry->d_fsdata = di;
...@@ -125,7 +128,8 @@ static int __dcache_readdir(struct file *filp, ...@@ -125,7 +128,8 @@ static int __dcache_readdir(struct file *filp,
dentry = list_entry(p, struct dentry, d_u.d_child); dentry = list_entry(p, struct dentry, d_u.d_child);
di = ceph_dentry(dentry); di = ceph_dentry(dentry);
while (1) { while (1) {
dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
d_unhashed(dentry) ? "!hashed" : "hashed",
parent->d_subdirs.prev, parent->d_subdirs.next); parent->d_subdirs.prev, parent->d_subdirs.next);
if (p == &parent->d_subdirs) { if (p == &parent->d_subdirs) {
fi->at_end = 1; fi->at_end = 1;
...@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) ...@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
u32 ftype; u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo; struct ceph_mds_reply_info_parsed *rinfo;
const int max_entries = client->mount_args->max_readdir; const int max_entries = client->mount_args->max_readdir;
const int max_bytes = client->mount_args->max_readdir_bytes;
dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
if (fi->at_end) if (fi->at_end)
...@@ -312,6 +317,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) ...@@ -312,6 +317,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
req->r_readdir_offset = fi->next_offset; req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.frag = cpu_to_le32(frag);
req->r_args.readdir.max_entries = cpu_to_le32(max_entries); req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
req->r_num_caps = max_entries + 1; req->r_num_caps = max_entries + 1;
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) { if (err < 0) {
...@@ -335,7 +341,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) ...@@ -335,7 +341,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (req->r_reply_info.dir_end) { if (req->r_reply_info.dir_end) {
kfree(fi->last_name); kfree(fi->last_name);
fi->last_name = NULL; fi->last_name = NULL;
fi->next_offset = 0; fi->next_offset = 2;
} else { } else {
rinfo = &req->r_reply_info; rinfo = &req->r_reply_info;
err = note_last_dentry(fi, err = note_last_dentry(fi,
...@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) ...@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err) struct dentry *dentry, int err)
{ {
struct ceph_client *client = ceph_client(dentry->d_sb); struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
struct inode *parent = dentry->d_parent->d_inode; struct inode *parent = dentry->d_parent->d_inode;
/* .snap dir? */ /* .snap dir? */
...@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, ...@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
!is_root_ceph_dentry(dir, dentry) && !is_root_ceph_dentry(dir, dentry) &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) && (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
di->offset = ci->i_max_offset++;
spin_unlock(&dir->i_lock); spin_unlock(&dir->i_lock);
dout(" dir %p complete, -ENOENT\n", dir); dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL); d_add(dentry, NULL);
...@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
/* ensure target dentry is invalidated, despite /* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */ rehashing bug in vfs_rename_dir */
new_dentry->d_time = jiffies; ceph_invalidate_dentry_lease(new_dentry);
ceph_dentry(new_dentry)->lease_shared_gen = 0;
} }
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
return err; return err;
} }
/*
* Ensure a dentry lease will no longer revalidate.
*/
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
dentry->d_time = jiffies;
ceph_dentry(dentry)->lease_shared_gen = 0;
spin_unlock(&dentry->d_lock);
}
/* /*
* Check if dentry lease is valid. If not, delete the lease. Try to * Check if dentry lease is valid. If not, delete the lease. Try to
...@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) ...@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{ {
struct inode *dir = dentry->d_parent->d_inode; struct inode *dir = dentry->d_parent->d_inode;
dout("d_revalidate %p '%.*s' inode %p\n", dentry, dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
dentry->d_name.len, dentry->d_name.name, dentry->d_inode); dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
ceph_dentry(dentry)->offset);
/* always trust cached snapped dentries, snapdir dentry */ /* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) { if (ceph_snap(dir) != CEPH_NOSNAP) {
...@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, ...@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int left; int left;
if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR; return -EISDIR;
if (!cf->dir_info) { if (!cf->dir_info) {
...@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn) ...@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name); dn->d_name.len, dn->d_name.name);
if (di) { if (di) {
mdsc = &ceph_client(dn->d_sb)->mdsc; mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock); spin_lock(&mdsc->dentry_lru_lock);
list_add_tail(&di->lru, &mdsc->dentry_lru); list_add_tail(&di->lru, &mdsc->dentry_lru);
mdsc->num_dentry++; mdsc->num_dentry++;
...@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) ...@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_dentry_info *di = ceph_dentry(dn);
struct ceph_mds_client *mdsc; struct ceph_mds_client *mdsc;
dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
dn->d_name.len, dn->d_name.name); dn->d_name.len, dn->d_name.name, di->offset);
if (di) { if (di) {
mdsc = &ceph_client(dn->d_sb)->mdsc; mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock); spin_lock(&mdsc->dentry_lru_lock);
list_move_tail(&di->lru, &mdsc->dentry_lru); list_move_tail(&di->lru, &mdsc->dentry_lru);
spin_unlock(&mdsc->dentry_lru_lock); spin_unlock(&mdsc->dentry_lru_lock);
...@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn) ...@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name); dn->d_name.len, dn->d_name.name);
if (di) { if (di) {
mdsc = &ceph_client(dn->d_sb)->mdsc; mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock); spin_lock(&mdsc->dentry_lru_lock);
list_del_init(&di->lru); list_del_init(&di->lru);
mdsc->num_dentry--; mdsc->num_dentry--;
......
...@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, ...@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
return ERR_PTR(-ESTALE); return ERR_PTR(-ESTALE);
dentry = d_obtain_alias(inode); dentry = d_obtain_alias(inode);
if (!dentry) { if (IS_ERR(dentry)) {
pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
fh->ino, inode); fh->ino, inode);
iput(inode); iput(inode);
return ERR_PTR(-ENOMEM); return dentry;
} }
err = ceph_init_dentry(dentry); err = ceph_init_dentry(dentry);
...@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, ...@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
static struct dentry *__cfh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb,
struct ceph_nfs_confh *cfh) struct ceph_nfs_confh *cfh)
{ {
struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
struct inode *inode; struct inode *inode;
struct dentry *dentry; struct dentry *dentry;
struct ceph_vino vino; struct ceph_vino vino;
...@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, ...@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
} }
dentry = d_obtain_alias(inode); dentry = d_obtain_alias(inode);
if (!dentry) { if (IS_ERR(dentry)) {
pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
cfh->ino, inode); cfh->ino, inode);
iput(inode); iput(inode);
return ERR_PTR(-ENOMEM); return dentry;
} }
err = ceph_init_dentry(dentry); err = ceph_init_dentry(dentry);
if (err < 0) { if (err < 0) {
...@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, ...@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
return ERR_PTR(-ESTALE); return ERR_PTR(-ESTALE);
dentry = d_obtain_alias(inode); dentry = d_obtain_alias(inode);
if (!dentry) { if (IS_ERR(dentry)) {
pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
cfh->ino, inode); cfh->ino, inode);
iput(inode); iput(inode);
return ERR_PTR(-ENOMEM); return dentry;
} }
err = ceph_init_dentry(dentry); err = ceph_init_dentry(dentry);
if (err < 0) { if (err < 0) {
......
...@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages) ...@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
/* /*
* allocate a vector new pages * allocate a vector new pages
*/ */
static struct page **alloc_page_vector(int num_pages) struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
{ {
struct page **pages; struct page **pages;
int i; int i;
pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); pages = kmalloc(sizeof(*pages) * num_pages, flags);
if (!pages) if (!pages)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
for (i = 0; i < num_pages; i++) { for (i = 0; i < num_pages; i++) {
pages[i] = alloc_page(GFP_NOFS); pages[i] = __page_cache_alloc(flags);
if (pages[i] == NULL) { if (pages[i] == NULL) {
ceph_release_page_vector(pages, i); ceph_release_page_vector(pages, i);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
...@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, ...@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
* in sequence. * in sequence.
*/ */
} else { } else {
pages = alloc_page_vector(num_pages); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
} }
if (IS_ERR(pages)) if (IS_ERR(pages))
return PTR_ERR(pages); return PTR_ERR(pages);
...@@ -649,8 +649,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, ...@@ -649,8 +649,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
do_sync, do_sync,
ci->i_truncate_seq, ci->i_truncate_size, ci->i_truncate_seq, ci->i_truncate_size,
&mtime, false, 2); &mtime, false, 2);
if (IS_ERR(req)) if (!req)
return PTR_ERR(req); return -ENOMEM;
num_pages = calc_pages_for(pos, len); num_pages = calc_pages_for(pos, len);
...@@ -668,7 +668,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, ...@@ -668,7 +668,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
truncate_inode_pages_range(inode->i_mapping, pos, truncate_inode_pages_range(inode->i_mapping, pos,
(pos+len) | (PAGE_CACHE_SIZE-1)); (pos+len) | (PAGE_CACHE_SIZE-1));
} else { } else {
pages = alloc_page_vector(num_pages); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
if (IS_ERR(pages)) { if (IS_ERR(pages)) {
ret = PTR_ERR(pages); ret = PTR_ERR(pages);
goto out; goto out;
...@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, ...@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file->f_dentry->d_inode; struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
loff_t endoff = pos + iov->iov_len; loff_t endoff = pos + iov->iov_len;
int got = 0; int got = 0;
int ret, err; int ret, err;
......
...@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode) ...@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
*/ */
if (ci->i_snap_realm) { if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc = struct ceph_mds_client *mdsc =
&ceph_client(ci->vfs_inode.i_sb)->mdsc; &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct ceph_snap_realm *realm = ci->i_snap_realm; struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n", realm); dout(" dropping residual ref to snap realm %p\n", realm);
...@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode, ...@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
memcpy(ci->i_xattrs.blob->vec.iov_base, memcpy(ci->i_xattrs.blob->vec.iov_base,
iinfo->xattr_data, iinfo->xattr_len); iinfo->xattr_data, iinfo->xattr_len);
ci->i_xattrs.version = le64_to_cpu(info->xattr_version); ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
xattr_blob = NULL;
} }
inode->i_mapping->a_ops = &ceph_aops; inode->i_mapping->a_ops = &ceph_aops;
inode->i_mapping->backing_dev_info = inode->i_mapping->backing_dev_info =
&ceph_client(inode->i_sb)->backing_dev_info; &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
switch (inode->i_mode & S_IFMT) { switch (inode->i_mode & S_IFMT) {
case S_IFIFO: case S_IFIFO:
...@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode, ...@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
/* set dir completion flag? */ /* set dir completion flag? */
if (ci->i_files == 0 && ci->i_subdirs == 0 && if (ci->i_files == 0 && ci->i_subdirs == 0 &&
ceph_snap(inode) == CEPH_NOSNAP && ceph_snap(inode) == CEPH_NOSNAP &&
(le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
dout(" marking %p complete (empty)\n", inode); dout(" marking %p complete (empty)\n", inode);
ci->i_ceph_flags |= CEPH_I_COMPLETE; ci->i_ceph_flags |= CEPH_I_COMPLETE;
ci->i_max_offset = 2; ci->i_max_offset = 2;
} }
/* it may be better to set st_size in getattr instead? */ /* it may be better to set st_size in getattr instead? */
if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
inode->i_size = ci->i_rbytes; inode->i_size = ci->i_rbytes;
break; break;
default: default:
...@@ -801,6 +803,37 @@ static void update_dentry_lease(struct dentry *dentry, ...@@ -801,6 +803,37 @@ static void update_dentry_lease(struct dentry *dentry,
return; return;
} }
/*
* Set dentry's directory position based on the current dir's max, and
* order it in d_subdirs, so that dcache_readdir behaves.
*/
static void ceph_set_dentry_offset(struct dentry *dn)
{
struct dentry *dir = dn->d_parent;
struct inode *inode = dn->d_parent->d_inode;
struct ceph_dentry_info *di;
BUG_ON(!inode);
di = ceph_dentry(dn);
spin_lock(&inode->i_lock);
if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
spin_unlock(&inode->i_lock);
return;
}
di->offset = ceph_inode(inode)->i_max_offset++;
spin_unlock(&inode->i_lock);
spin_lock(&dcache_lock);
spin_lock(&dn->d_lock);
list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
dn->d_u.d_child.prev, dn->d_u.d_child.next);
spin_unlock(&dn->d_lock);
spin_unlock(&dcache_lock);
}
/* /*
* splice a dentry to an inode. * splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe. * caller must hold directory i_mutex for this to be safe.
...@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, ...@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
{ {
struct dentry *realdn; struct dentry *realdn;
BUG_ON(dn->d_inode);
/* dn must be unhashed */ /* dn must be unhashed */
if (!d_unhashed(dn)) if (!d_unhashed(dn))
d_drop(dn); d_drop(dn);
...@@ -835,43 +870,16 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, ...@@ -835,43 +870,16 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
dn = realdn; dn = realdn;
} else { } else {
BUG_ON(!ceph_dentry(dn)); BUG_ON(!ceph_dentry(dn));
dout("dn %p attached to %p ino %llx.%llx\n", dout("dn %p attached to %p ino %llx.%llx\n",
dn, dn->d_inode, ceph_vinop(dn->d_inode)); dn, dn->d_inode, ceph_vinop(dn->d_inode));
} }
if ((!prehash || *prehash) && d_unhashed(dn)) if ((!prehash || *prehash) && d_unhashed(dn))
d_rehash(dn); d_rehash(dn);
ceph_set_dentry_offset(dn);
out: out:
return dn; return dn;
} }
/*
* Set dentry's directory position based on the current dir's max, and
* order it in d_subdirs, so that dcache_readdir behaves.
*/
static void ceph_set_dentry_offset(struct dentry *dn)
{
struct dentry *dir = dn->d_parent;
struct inode *inode = dn->d_parent->d_inode;
struct ceph_dentry_info *di;
BUG_ON(!inode);
di = ceph_dentry(dn);
spin_lock(&inode->i_lock);
di->offset = ceph_inode(inode)->i_max_offset++;
spin_unlock(&inode->i_lock);
spin_lock(&dcache_lock);
spin_lock(&dn->d_lock);
list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
dn->d_u.d_child.prev, dn->d_u.d_child.next);
spin_unlock(&dn->d_lock);
spin_unlock(&dcache_lock);
}
/* /*
* Incorporate results into the local cache. This is either just * Incorporate results into the local cache. This is either just
* one inode, or a directory, dentry, and possibly linked-to inode (e.g., * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
...@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
if (!rinfo->head->is_target && !rinfo->head->is_dentry) { if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
dout("fill_trace reply is empty!\n"); dout("fill_trace reply is empty!\n");
if (rinfo->head->result == 0 && req->r_locked_dir) { if (rinfo->head->result == 0 && req->r_locked_dir)
struct ceph_inode_info *ci = ceph_invalidate_dir_request(req);
ceph_inode(req->r_locked_dir);
dout(" clearing %p complete (empty trace)\n",
req->r_locked_dir);
ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
ci->i_release_count++;
}
return 0; return 0;
} }
...@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name, req->r_old_dentry->d_name.name,
dn, dn->d_name.len, dn->d_name.name); dn, dn->d_name.len, dn->d_name.name);
/* ensure target dentry is invalidated, despite /* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */ rehashing bug in vfs_rename_dir */
dn->d_time = jiffies; ceph_invalidate_dentry_lease(dn);
ceph_dentry(dn)->lease_shared_gen = 0;
/* take overwritten dentry's readdir offset */ /* take overwritten dentry's readdir offset */
dout("dn %p gets %p offset %lld (old offset %lld)\n",
req->r_old_dentry, dn, ceph_dentry(dn)->offset,
ceph_dentry(req->r_old_dentry)->offset);
ceph_dentry(req->r_old_dentry)->offset = ceph_dentry(req->r_old_dentry)->offset =
ceph_dentry(dn)->offset; ceph_dentry(dn)->offset;
dn = req->r_old_dentry; /* use old_dentry */ dn = req->r_old_dentry; /* use old_dentry */
in = dn->d_inode; in = dn->d_inode;
} }
...@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
goto done; goto done;
} }
req->r_dentry = dn; /* may have spliced */ req->r_dentry = dn; /* may have spliced */
ceph_set_dentry_offset(dn);
igrab(in); igrab(in);
} else if (ceph_ino(in) == vino.ino && } else if (ceph_ino(in) == vino.ino &&
ceph_snap(in) == vino.snap) { ceph_snap(in) == vino.snap) {
...@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
err = PTR_ERR(dn); err = PTR_ERR(dn);
goto done; goto done;
} }
ceph_set_dentry_offset(dn);
req->r_dentry = dn; /* may have spliced */ req->r_dentry = dn; /* may have spliced */
igrab(in); igrab(in);
rinfo->head->is_dentry = 1; /* fool notrace handlers */ rinfo->head->is_dentry = 1; /* fool notrace handlers */
...@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode) ...@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
if (queue_work(ceph_client(inode->i_sb)->trunc_wq, if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
&ci->i_vmtruncate_work)) { &ci->i_vmtruncate_work)) {
dout("ceph_queue_vmtruncate %p\n", inode); dout("ceph_queue_vmtruncate %p\n", inode);
igrab(inode); igrab(inode);
...@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *parent_inode = dentry->d_parent->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode;
const unsigned int ia_valid = attr->ia_valid; const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
int issued; int issued;
int release = 0, dirtied = 0; int release = 0, dirtied = 0;
int mask = 0; int mask = 0;
......
...@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ...@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_ioctl_dataloc dl; struct ceph_ioctl_dataloc dl;
struct inode *inode = file->f_dentry->d_inode; struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
u64 len = 1, olen; u64 len = 1, olen;
u64 tmp; u64 tmp;
struct ceph_object_layout ol; struct ceph_object_layout ol;
......
此差异已折叠。
...@@ -165,6 +165,8 @@ struct ceph_mds_request { ...@@ -165,6 +165,8 @@ struct ceph_mds_request {
struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
struct inode *r_target_inode; /* resulting inode */ struct inode *r_target_inode; /* resulting inode */
struct mutex r_fill_mutex;
union ceph_mds_request_args r_args; union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */ int r_fmode; /* file mode, if expecting cap */
...@@ -213,7 +215,7 @@ struct ceph_mds_request { ...@@ -213,7 +215,7 @@ struct ceph_mds_request {
struct completion r_safe_completion; struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback; ceph_mds_request_callback_t r_callback;
struct list_head r_unsafe_item; /* per-session unsafe list item */ struct list_head r_unsafe_item; /* per-session unsafe list item */
bool r_got_unsafe, r_got_safe; bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate; bool r_did_prepopulate;
u32 r_readdir_offset; u32 r_readdir_offset;
...@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, ...@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
struct inode *inode, struct inode *inode,
struct dentry *dn, int mask); struct dentry *dn, int mask);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern struct ceph_mds_request * extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
......
...@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con); ...@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
static void con_work(struct work_struct *); static void con_work(struct work_struct *);
static void ceph_fault(struct ceph_connection *con); static void ceph_fault(struct ceph_connection *con);
const char *ceph_name_type_str(int t)
{
switch (t) {
case CEPH_ENTITY_TYPE_MON: return "mon";
case CEPH_ENTITY_TYPE_MDS: return "mds";
case CEPH_ENTITY_TYPE_OSD: return "osd";
case CEPH_ENTITY_TYPE_CLIENT: return "client";
case CEPH_ENTITY_TYPE_ADMIN: return "admin";
default: return "???";
}
}
/* /*
* nicely render a sockaddr as a string. * nicely render a sockaddr as a string.
*/ */
...@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con) ...@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con)
ceph_msg_put(con->out_msg); ceph_msg_put(con->out_msg);
con->out_msg = NULL; con->out_msg = NULL;
} }
con->out_keepalive_pending = false;
con->in_seq = 0; con->in_seq = 0;
con->in_seq_acked = 0; con->in_seq_acked = 0;
} }
...@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con) ...@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con)
clear_bit(WRITE_PENDING, &con->state); clear_bit(WRITE_PENDING, &con->state);
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
reset_connection(con); reset_connection(con);
con->peer_global_seq = 0;
cancel_delayed_work(&con->work); cancel_delayed_work(&con->work);
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
queue_con(con); queue_con(con);
...@@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, ...@@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
con->connect_seq, global_seq, proto); con->connect_seq, global_seq, proto);
con->out_connect.features = CEPH_FEATURE_SUPPORTED; con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
con->out_connect.global_seq = cpu_to_le32(global_seq); con->out_connect.global_seq = cpu_to_le32(global_seq);
...@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con) ...@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con)
static int process_connect(struct ceph_connection *con) static int process_connect(struct ceph_connection *con)
{ {
u64 sup_feat = CEPH_FEATURE_SUPPORTED; u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
u64 req_feat = CEPH_FEATURE_REQUIRED; u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
u64 server_feat = le64_to_cpu(con->in_reply.features); u64 server_feat = le64_to_cpu(con->in_reply.features);
dout("process_connect on %p tag %d\n", con, (int)con->in_tag); dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
...@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con) ...@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
clear_bit(CONNECTING, &con->state); clear_bit(CONNECTING, &con->state);
con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
con->connect_seq++; con->connect_seq++;
con->peer_features = server_feat;
dout("process_connect got READY gseq %d cseq %d (%d)\n", dout("process_connect got READY gseq %d cseq %d (%d)\n",
con->peer_global_seq, con->peer_global_seq,
le32_to_cpu(con->in_reply.connect_seq), le32_to_cpu(con->in_reply.connect_seq),
...@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con)
con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
if (skip) { if (skip) {
/* skip this message */ /* skip this message */
dout("alloc_msg returned NULL, skipping message\n"); dout("alloc_msg said skip message\n");
con->in_base_pos = -front_len - middle_len - data_len - con->in_base_pos = -front_len - middle_len - data_len -
sizeof(m->footer); sizeof(m->footer);
con->in_tag = CEPH_MSGR_TAG_READY; con->in_tag = CEPH_MSGR_TAG_READY;
con->in_seq++; con->in_seq++;
return 0; return 0;
} }
if (IS_ERR(con->in_msg)) { if (!con->in_msg) {
ret = PTR_ERR(con->in_msg);
con->in_msg = NULL;
con->error_msg = con->error_msg =
"error allocating memory for incoming message"; "error allocating memory for incoming message";
return ret; return -ENOMEM;
} }
m = con->in_msg; m = con->in_msg;
m->front.iov_len = 0; /* haven't read it yet */ m->front.iov_len = 0; /* haven't read it yet */
...@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con) ...@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con)
/* if first message, set peer_name */ /* if first message, set peer_name */
if (con->peer_name.type == 0) if (con->peer_name.type == 0)
con->peer_name = msg->hdr.src.name; con->peer_name = msg->hdr.src;
con->in_seq++; con->in_seq++;
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
msg, le64_to_cpu(msg->hdr.seq), msg, le64_to_cpu(msg->hdr.seq),
ENTITY_NAME(msg->hdr.src.name), ENTITY_NAME(msg->hdr.src),
le16_to_cpu(msg->hdr.type), le16_to_cpu(msg->hdr.type),
ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.front_len),
...@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con) ...@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con)
dout("try_write start %p state %lu nref %d\n", con, con->state, dout("try_write start %p state %lu nref %d\n", con, con->state,
atomic_read(&con->nref)); atomic_read(&con->nref));
mutex_lock(&con->mutex);
more: more:
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
...@@ -1639,7 +1627,6 @@ static int try_write(struct ceph_connection *con) ...@@ -1639,7 +1627,6 @@ static int try_write(struct ceph_connection *con)
done: done:
ret = 0; ret = 0;
out: out:
mutex_unlock(&con->mutex);
dout("try_write done on %p\n", con); dout("try_write done on %p\n", con);
return ret; return ret;
} }
...@@ -1651,7 +1638,6 @@ static int try_write(struct ceph_connection *con) ...@@ -1651,7 +1638,6 @@ static int try_write(struct ceph_connection *con)
*/ */
static int try_read(struct ceph_connection *con) static int try_read(struct ceph_connection *con)
{ {
struct ceph_messenger *msgr;
int ret = -1; int ret = -1;
if (!con->sock) if (!con->sock)
...@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con) ...@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con)
return 0; return 0;
dout("try_read start on %p\n", con); dout("try_read start on %p\n", con);
msgr = con->msgr;
mutex_lock(&con->mutex);
more: more:
dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
...@@ -1758,7 +1741,6 @@ static int try_read(struct ceph_connection *con) ...@@ -1758,7 +1741,6 @@ static int try_read(struct ceph_connection *con)
done: done:
ret = 0; ret = 0;
out: out:
mutex_unlock(&con->mutex);
dout("try_read done on %p\n", con); dout("try_read done on %p\n", con);
return ret; return ret;
...@@ -1830,6 +1812,8 @@ static void con_work(struct work_struct *work) ...@@ -1830,6 +1812,8 @@ static void con_work(struct work_struct *work)
dout("con_work %p start, clearing QUEUED\n", con); dout("con_work %p start, clearing QUEUED\n", con);
clear_bit(QUEUED, &con->state); clear_bit(QUEUED, &con->state);
mutex_lock(&con->mutex);
if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
dout("con_work CLOSED\n"); dout("con_work CLOSED\n");
con_close_socket(con); con_close_socket(con);
...@@ -1844,11 +1828,16 @@ static void con_work(struct work_struct *work) ...@@ -1844,11 +1828,16 @@ static void con_work(struct work_struct *work)
if (test_and_clear_bit(SOCK_CLOSED, &con->state) || if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
try_read(con) < 0 || try_read(con) < 0 ||
try_write(con) < 0) { try_write(con) < 0) {
mutex_unlock(&con->mutex);
backoff = 1; backoff = 1;
ceph_fault(con); /* error/fault path */ ceph_fault(con); /* error/fault path */
goto done_unlocked;
} }
done: done:
mutex_unlock(&con->mutex);
done_unlocked:
clear_bit(BUSY, &con->state); clear_bit(BUSY, &con->state);
dout("con->state=%lu\n", con->state); dout("con->state=%lu\n", con->state);
if (test_bit(QUEUED, &con->state)) { if (test_bit(QUEUED, &con->state)) {
...@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) ...@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
/* the zero page is needed if a request is "canceled" while the message /* the zero page is needed if a request is "canceled" while the message
* is being written over the socket */ * is being written over the socket */
msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
if (!msgr->zero_page) { if (!msgr->zero_page) {
kfree(msgr); kfree(msgr);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
...@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
} }
/* set src+dst */ /* set src+dst */
msg->hdr.src.name = con->msgr->inst.name; msg->hdr.src = con->msgr->inst.name;
msg->hdr.src.addr = con->msgr->my_enc_addr;
msg->hdr.orig_src = msg->hdr.src;
BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
...@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con) ...@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
* construct a new message with given type, size * construct a new message with given type, size
* the new msg has a ref count of 1. * the new msg has a ref count of 1.
*/ */
struct ceph_msg *ceph_msg_new(int type, int front_len, struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
int page_len, int page_off, struct page **pages)
{ {
struct ceph_msg *m; struct ceph_msg *m;
m = kmalloc(sizeof(*m), GFP_NOFS); m = kmalloc(sizeof(*m), flags);
if (m == NULL) if (m == NULL)
goto out; goto out;
kref_init(&m->kref); kref_init(&m->kref);
...@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, ...@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
m->hdr.version = 0; m->hdr.version = 0;
m->hdr.front_len = cpu_to_le32(front_len); m->hdr.front_len = cpu_to_le32(front_len);
m->hdr.middle_len = 0; m->hdr.middle_len = 0;
m->hdr.data_len = cpu_to_le32(page_len); m->hdr.data_len = 0;
m->hdr.data_off = cpu_to_le16(page_off); m->hdr.data_off = 0;
m->hdr.reserved = 0; m->hdr.reserved = 0;
m->footer.front_crc = 0; m->footer.front_crc = 0;
m->footer.middle_crc = 0; m->footer.middle_crc = 0;
...@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, ...@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
/* front */ /* front */
if (front_len) { if (front_len) {
if (front_len > PAGE_CACHE_SIZE) { if (front_len > PAGE_CACHE_SIZE) {
m->front.iov_base = __vmalloc(front_len, GFP_NOFS, m->front.iov_base = __vmalloc(front_len, flags,
PAGE_KERNEL); PAGE_KERNEL);
m->front_is_vmalloc = true; m->front_is_vmalloc = true;
} else { } else {
m->front.iov_base = kmalloc(front_len, GFP_NOFS); m->front.iov_base = kmalloc(front_len, flags);
} }
if (m->front.iov_base == NULL) { if (m->front.iov_base == NULL) {
pr_err("msg_new can't allocate %d bytes\n", pr_err("msg_new can't allocate %d bytes\n",
...@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, ...@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
m->middle = NULL; m->middle = NULL;
/* data */ /* data */
m->nr_pages = calc_pages_for(page_off, page_len); m->nr_pages = 0;
m->pages = pages; m->pages = NULL;
m->pagelist = NULL; m->pagelist = NULL;
dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, dout("ceph_msg_new %p front %d\n", m, front_len);
m->nr_pages);
return m; return m;
out2: out2:
ceph_msg_put(m); ceph_msg_put(m);
out: out:
pr_err("msg_new can't create type %d len %d\n", type, front_len); pr_err("msg_new can't create type %d front %d\n", type, front_len);
return ERR_PTR(-ENOMEM); return NULL;
} }
/* /*
...@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, ...@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
msg = con->ops->alloc_msg(con, hdr, skip); msg = con->ops->alloc_msg(con, hdr, skip);
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (IS_ERR(msg)) if (!msg || *skip)
return msg;
if (*skip)
return NULL; return NULL;
} }
if (!msg) { if (!msg) {
*skip = 0; *skip = 0;
msg = ceph_msg_new(type, front_len, 0, 0, NULL); msg = ceph_msg_new(type, front_len, GFP_NOFS);
if (!msg) { if (!msg) {
pr_err("unable to allocate msg type %d len %d\n", pr_err("unable to allocate msg type %d len %d\n",
type, front_len); type, front_len);
return ERR_PTR(-ENOMEM); return NULL;
} }
} }
memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
if (middle_len) { if (middle_len && !msg->middle) {
ret = ceph_alloc_middle(con, msg); ret = ceph_alloc_middle(con, msg);
if (ret < 0) { if (ret < 0) {
ceph_msg_put(msg); ceph_msg_put(msg);
return msg; return NULL;
} }
} }
......
...@@ -49,10 +49,8 @@ struct ceph_connection_operations { ...@@ -49,10 +49,8 @@ struct ceph_connection_operations {
int *skip); int *skip);
}; };
extern const char *ceph_name_type_str(int t);
/* use format string %s%d */ /* use format string %s%d */
#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
struct ceph_messenger { struct ceph_messenger {
struct ceph_entity_inst inst; /* my name+address */ struct ceph_entity_inst inst; /* my name+address */
...@@ -144,6 +142,7 @@ struct ceph_connection { ...@@ -144,6 +142,7 @@ struct ceph_connection {
struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_addr peer_addr; /* peer address */
struct ceph_entity_name peer_name; /* peer name */ struct ceph_entity_name peer_name; /* peer name */
struct ceph_entity_addr peer_addr_for_me; struct ceph_entity_addr peer_addr_for_me;
unsigned peer_features;
u32 connect_seq; /* identify the most recent connection u32 connect_seq; /* identify the most recent connection
attempt for this connection, client */ attempt for this connection, client */
u32 peer_global_seq; /* peer's global seq for this connection */ u32 peer_global_seq; /* peer's global seq for this connection */
...@@ -158,7 +157,6 @@ struct ceph_connection { ...@@ -158,7 +157,6 @@ struct ceph_connection {
struct list_head out_queue; struct list_head out_queue;
struct list_head out_sent; /* sending or sent but unacked */ struct list_head out_sent; /* sending or sent but unacked */
u64 out_seq; /* last message queued for send */ u64 out_seq; /* last message queued for send */
u64 out_seq_sent; /* last message sent */
bool out_keepalive_pending; bool out_keepalive_pending;
u64 in_seq, in_seq_acked; /* last message received, acked */ u64 in_seq, in_seq_acked; /* last message received, acked */
...@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con); ...@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
extern void ceph_con_put(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con);
extern struct ceph_msg *ceph_msg_new(int type, int front_len, extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
int page_len, int page_off,
struct page **pages);
extern void ceph_msg_kfree(struct ceph_msg *m); extern void ceph_msg_kfree(struct ceph_msg *m);
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
* resend any outstanding requests. * resend any outstanding requests.
*/ */
const static struct ceph_connection_operations mon_con_ops; static const struct ceph_connection_operations mon_con_ops;
static int __validate_auth(struct ceph_mon_client *monc); static int __validate_auth(struct ceph_mon_client *monc);
...@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) ...@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
monc->pending_auth = 1; monc->pending_auth = 1;
monc->m_auth->front.iov_len = len; monc->m_auth->front.iov_len = len;
monc->m_auth->hdr.front_len = cpu_to_le32(len); monc->m_auth->hdr.front_len = cpu_to_le32(len);
ceph_con_revoke(monc->con, monc->m_auth);
ceph_msg_get(monc->m_auth); /* keep our ref */ ceph_msg_get(monc->m_auth); /* keep our ref */
ceph_con_send(monc->con, monc->m_auth); ceph_con_send(monc->con, monc->m_auth);
} }
...@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc) ...@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
monc->want_next_osdmap); monc->want_next_osdmap);
if ((__sub_expired(monc) && !monc->sub_sent) || if ((__sub_expired(monc) && !monc->sub_sent) ||
monc->want_next_osdmap == 1) { monc->want_next_osdmap == 1) {
struct ceph_msg *msg; struct ceph_msg *msg = monc->m_subscribe;
struct ceph_mon_subscribe_item *i; struct ceph_mon_subscribe_item *i;
void *p, *end; void *p, *end;
msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
if (!msg)
return;
p = msg->front.iov_base; p = msg->front.iov_base;
end = p + msg->front.iov_len; end = p + msg->front_max;
dout("__send_subscribe to 'mdsmap' %u+\n", dout("__send_subscribe to 'mdsmap' %u+\n",
(unsigned)monc->have_mdsmap); (unsigned)monc->have_mdsmap);
...@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) ...@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
msg->front.iov_len = p - msg->front.iov_base; msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_con_send(monc->con, msg); ceph_con_revoke(monc->con, msg);
ceph_con_send(monc->con, ceph_msg_get(msg));
monc->sub_sent = jiffies | 1; /* never 0 */ monc->sub_sent = jiffies | 1; /* never 0 */
} }
...@@ -353,14 +351,14 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, ...@@ -353,14 +351,14 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
/* /*
* statfs * statfs
*/ */
static struct ceph_mon_statfs_request *__lookup_statfs( static struct ceph_mon_generic_request *__lookup_generic_req(
struct ceph_mon_client *monc, u64 tid) struct ceph_mon_client *monc, u64 tid)
{ {
struct ceph_mon_statfs_request *req; struct ceph_mon_generic_request *req;
struct rb_node *n = monc->statfs_request_tree.rb_node; struct rb_node *n = monc->generic_request_tree.rb_node;
while (n) { while (n) {
req = rb_entry(n, struct ceph_mon_statfs_request, node); req = rb_entry(n, struct ceph_mon_generic_request, node);
if (tid < req->tid) if (tid < req->tid)
n = n->rb_left; n = n->rb_left;
else if (tid > req->tid) else if (tid > req->tid)
...@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs( ...@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
return NULL; return NULL;
} }
static void __insert_statfs(struct ceph_mon_client *monc, static void __insert_generic_request(struct ceph_mon_client *monc,
struct ceph_mon_statfs_request *new) struct ceph_mon_generic_request *new)
{ {
struct rb_node **p = &monc->statfs_request_tree.rb_node; struct rb_node **p = &monc->generic_request_tree.rb_node;
struct rb_node *parent = NULL; struct rb_node *parent = NULL;
struct ceph_mon_statfs_request *req = NULL; struct ceph_mon_generic_request *req = NULL;
while (*p) { while (*p) {
parent = *p; parent = *p;
req = rb_entry(parent, struct ceph_mon_statfs_request, node); req = rb_entry(parent, struct ceph_mon_generic_request, node);
if (new->tid < req->tid) if (new->tid < req->tid)
p = &(*p)->rb_left; p = &(*p)->rb_left;
else if (new->tid > req->tid) else if (new->tid > req->tid)
...@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc, ...@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
} }
rb_link_node(&new->node, parent, p); rb_link_node(&new->node, parent, p);
rb_insert_color(&new->node, &monc->statfs_request_tree); rb_insert_color(&new->node, &monc->generic_request_tree);
}
static void release_generic_request(struct kref *kref)
{
struct ceph_mon_generic_request *req =
container_of(kref, struct ceph_mon_generic_request, kref);
if (req->reply)
ceph_msg_put(req->reply);
if (req->request)
ceph_msg_put(req->request);
}
static void put_generic_request(struct ceph_mon_generic_request *req)
{
kref_put(&req->kref, release_generic_request);
}
static void get_generic_request(struct ceph_mon_generic_request *req)
{
kref_get(&req->kref);
}
static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr,
int *skip)
{
struct ceph_mon_client *monc = con->private;
struct ceph_mon_generic_request *req;
u64 tid = le64_to_cpu(hdr->tid);
struct ceph_msg *m;
mutex_lock(&monc->mutex);
req = __lookup_generic_req(monc, tid);
if (!req) {
dout("get_generic_reply %lld dne\n", tid);
*skip = 1;
m = NULL;
} else {
dout("get_generic_reply %lld got %p\n", tid, req->reply);
m = ceph_msg_get(req->reply);
/*
* we don't need to track the connection reading into
* this reply because we only have one open connection
* at a time, ever.
*/
}
mutex_unlock(&monc->mutex);
return m;
} }
static void handle_statfs_reply(struct ceph_mon_client *monc, static void handle_statfs_reply(struct ceph_mon_client *monc,
struct ceph_msg *msg) struct ceph_msg *msg)
{ {
struct ceph_mon_statfs_request *req; struct ceph_mon_generic_request *req;
struct ceph_mon_statfs_reply *reply = msg->front.iov_base; struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
u64 tid; u64 tid = le64_to_cpu(msg->hdr.tid);
if (msg->front.iov_len != sizeof(*reply)) if (msg->front.iov_len != sizeof(*reply))
goto bad; goto bad;
tid = le64_to_cpu(msg->hdr.tid);
dout("handle_statfs_reply %p tid %llu\n", msg, tid); dout("handle_statfs_reply %p tid %llu\n", msg, tid);
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
req = __lookup_statfs(monc, tid); req = __lookup_generic_req(monc, tid);
if (req) { if (req) {
*req->buf = reply->st; *(struct ceph_statfs *)req->buf = reply->st;
req->result = 0; req->result = 0;
get_generic_request(req);
} }
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
if (req) if (req) {
complete(&req->completion); complete(&req->completion);
put_generic_request(req);
}
return; return;
bad: bad:
pr_err("corrupt statfs reply, no tid\n"); pr_err("corrupt generic reply, no tid\n");
ceph_msg_dump(msg); ceph_msg_dump(msg);
} }
/* /*
* (re)send a statfs request * Do a synchronous statfs().
*/ */
static int send_statfs(struct ceph_mon_client *monc, int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
struct ceph_mon_statfs_request *req)
{ {
struct ceph_msg *msg; struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h; struct ceph_mon_statfs *h;
int err;
req = kzalloc(sizeof(*req), GFP_NOFS);
if (!req)
return -ENOMEM;
kref_init(&req->kref);
req->buf = buf;
init_completion(&req->completion);
dout("send_statfs tid %llu\n", req->tid); err = -ENOMEM;
msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
if (IS_ERR(msg)) if (!req->request)
return PTR_ERR(msg); goto out;
req->request = msg; req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
msg->hdr.tid = cpu_to_le64(req->tid); if (!req->reply)
h = msg->front.iov_base; goto out;
/* fill out request */
h = req->request->front.iov_base;
h->monhdr.have_version = 0; h->monhdr.have_version = 0;
h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0; h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid; h->fsid = monc->monmap->fsid;
ceph_con_send(monc->con, msg);
return 0;
}
/*
* Do a synchronous statfs().
*/
int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
struct ceph_mon_statfs_request req;
int err;
req.buf = buf;
init_completion(&req.completion);
/* allocate memory for reply */
err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
if (err)
return err;
/* register request */ /* register request */
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
req.tid = ++monc->last_tid; req->tid = ++monc->last_tid;
req.last_attempt = jiffies; req->request->hdr.tid = cpu_to_le64(req->tid);
req.delay = BASE_DELAY_INTERVAL; __insert_generic_request(monc, req);
__insert_statfs(monc, &req); monc->num_generic_requests++;
monc->num_statfs_requests++;
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
/* send request and wait */ /* send request and wait */
err = send_statfs(monc, &req); ceph_con_send(monc->con, ceph_msg_get(req->request));
if (!err) err = wait_for_completion_interruptible(&req->completion);
err = wait_for_completion_interruptible(&req.completion);
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
rb_erase(&req.node, &monc->statfs_request_tree); rb_erase(&req->node, &monc->generic_request_tree);
monc->num_statfs_requests--; monc->num_generic_requests--;
ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
if (!err) if (!err)
err = req.result; err = req->result;
out:
kref_put(&req->kref, release_generic_request);
return err; return err;
} }
/* /*
* Resend pending statfs requests. * Resend pending statfs requests.
*/ */
static void __resend_statfs(struct ceph_mon_client *monc) static void __resend_generic_request(struct ceph_mon_client *monc)
{ {
struct ceph_mon_statfs_request *req; struct ceph_mon_generic_request *req;
struct rb_node *p; struct rb_node *p;
for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
req = rb_entry(p, struct ceph_mon_statfs_request, node); req = rb_entry(p, struct ceph_mon_generic_request, node);
send_statfs(monc, req); ceph_con_revoke(monc->con, req->request);
ceph_con_send(monc->con, ceph_msg_get(req->request));
} }
} }
...@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
/* msg pools */ /* msgs */
err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, err = -ENOMEM;
sizeof(struct ceph_mon_subscribe_ack), 1, false); monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
if (err < 0) sizeof(struct ceph_mon_subscribe_ack),
GFP_NOFS);
if (!monc->m_subscribe_ack)
goto out_monmap; goto out_monmap;
err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
sizeof(struct ceph_mon_statfs_reply), 0, false); monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
if (err < 0) if (!monc->m_subscribe)
goto out_pool1; goto out_subscribe_ack;
err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
if (err < 0) monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
goto out_pool2; if (!monc->m_auth_reply)
goto out_subscribe;
monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
monc->pending_auth = 0; monc->pending_auth = 0;
if (IS_ERR(monc->m_auth)) { if (!monc->m_auth)
err = PTR_ERR(monc->m_auth); goto out_auth_reply;
monc->m_auth = NULL;
goto out_pool3;
}
monc->cur_mon = -1; monc->cur_mon = -1;
monc->hunting = true; monc->hunting = true;
...@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
monc->sub_sent = 0; monc->sub_sent = 0;
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->statfs_request_tree = RB_ROOT; monc->generic_request_tree = RB_ROOT;
monc->num_statfs_requests = 0; monc->num_generic_requests = 0;
monc->last_tid = 0; monc->last_tid = 0;
monc->have_mdsmap = 0; monc->have_mdsmap = 0;
...@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
monc->want_next_osdmap = 1; monc->want_next_osdmap = 1;
return 0; return 0;
out_pool3: out_auth_reply:
ceph_msgpool_destroy(&monc->msgpool_auth_reply); ceph_msg_put(monc->m_auth_reply);
out_pool2: out_subscribe:
ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); ceph_msg_put(monc->m_subscribe);
out_pool1: out_subscribe_ack:
ceph_msgpool_destroy(&monc->msgpool_statfs_reply); ceph_msg_put(monc->m_subscribe_ack);
out_monmap: out_monmap:
kfree(monc->monmap); kfree(monc->monmap);
out: out:
...@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ...@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
ceph_auth_destroy(monc->auth); ceph_auth_destroy(monc->auth);
ceph_msg_put(monc->m_auth); ceph_msg_put(monc->m_auth);
ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); ceph_msg_put(monc->m_auth_reply);
ceph_msgpool_destroy(&monc->msgpool_statfs_reply); ceph_msg_put(monc->m_subscribe);
ceph_msgpool_destroy(&monc->msgpool_auth_reply); ceph_msg_put(monc->m_subscribe_ack);
kfree(monc->monmap); kfree(monc->monmap);
} }
...@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ...@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
monc->client->msgr->inst.name.num = monc->auth->global_id; monc->client->msgr->inst.name.num = monc->auth->global_id;
__send_subscribe(monc); __send_subscribe(monc);
__resend_statfs(monc); __resend_generic_request(monc);
} }
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
} }
...@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, ...@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
switch (type) { switch (type) {
case CEPH_MSG_MON_SUBSCRIBE_ACK: case CEPH_MSG_MON_SUBSCRIBE_ACK:
m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); m = ceph_msg_get(monc->m_subscribe_ack);
break; break;
case CEPH_MSG_STATFS_REPLY: case CEPH_MSG_STATFS_REPLY:
m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); return get_generic_reply(con, hdr, skip);
break;
case CEPH_MSG_AUTH_REPLY: case CEPH_MSG_AUTH_REPLY:
m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); m = ceph_msg_get(monc->m_auth_reply);
break; break;
case CEPH_MSG_MON_MAP: case CEPH_MSG_MON_MAP:
case CEPH_MSG_MDS_MAP: case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP: case CEPH_MSG_OSD_MAP:
m = ceph_msg_new(type, front_len, 0, 0, NULL); m = ceph_msg_new(type, front_len, GFP_NOFS);
break; break;
} }
...@@ -826,7 +867,7 @@ static void mon_fault(struct ceph_connection *con) ...@@ -826,7 +867,7 @@ static void mon_fault(struct ceph_connection *con)
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
} }
const static struct ceph_connection_operations mon_con_ops = { static const struct ceph_connection_operations mon_con_ops = {
.get = ceph_con_get, .get = ceph_con_get,
.put = ceph_con_put, .put = ceph_con_put,
.dispatch = dispatch, .dispatch = dispatch,
......
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
#define _FS_CEPH_MON_CLIENT_H #define _FS_CEPH_MON_CLIENT_H
#include <linux/completion.h> #include <linux/completion.h>
#include <linux/kref.h>
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include "messenger.h" #include "messenger.h"
#include "msgpool.h"
struct ceph_client; struct ceph_client;
struct ceph_mount_args; struct ceph_mount_args;
...@@ -22,7 +22,7 @@ struct ceph_monmap { ...@@ -22,7 +22,7 @@ struct ceph_monmap {
}; };
struct ceph_mon_client; struct ceph_mon_client;
struct ceph_mon_statfs_request; struct ceph_mon_generic_request;
/* /*
...@@ -40,17 +40,19 @@ struct ceph_mon_request { ...@@ -40,17 +40,19 @@ struct ceph_mon_request {
}; };
/* /*
* statfs() is done a bit differently because we need to get data back * ceph_mon_generic_request is being used for the statfs and poolop requests
* which are bening done a bit differently because we need to get data back
* to the caller * to the caller
*/ */
struct ceph_mon_statfs_request { struct ceph_mon_generic_request {
struct kref kref;
u64 tid; u64 tid;
struct rb_node node; struct rb_node node;
int result; int result;
struct ceph_statfs *buf; void *buf;
struct completion completion; struct completion completion;
unsigned long last_attempt, delay; /* jiffies */
struct ceph_msg *request; /* original request */ struct ceph_msg *request; /* original request */
struct ceph_msg *reply; /* and reply */
}; };
struct ceph_mon_client { struct ceph_mon_client {
...@@ -61,7 +63,7 @@ struct ceph_mon_client { ...@@ -61,7 +63,7 @@ struct ceph_mon_client {
struct delayed_work delayed_work; struct delayed_work delayed_work;
struct ceph_auth_client *auth; struct ceph_auth_client *auth;
struct ceph_msg *m_auth; struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
int pending_auth; int pending_auth;
bool hunting; bool hunting;
...@@ -70,14 +72,9 @@ struct ceph_mon_client { ...@@ -70,14 +72,9 @@ struct ceph_mon_client {
struct ceph_connection *con; struct ceph_connection *con;
bool have_fsid; bool have_fsid;
/* msg pools */ /* pending generic requests */
struct ceph_msgpool msgpool_subscribe_ack; struct rb_root generic_request_tree;
struct ceph_msgpool msgpool_statfs_reply; int num_generic_requests;
struct ceph_msgpool msgpool_auth_reply;
/* pending statfs requests */
struct rb_root statfs_request_tree;
int num_statfs_requests;
u64 last_tid; u64 last_tid;
/* mds/osd map */ /* mds/osd map */
......
...@@ -7,180 +7,58 @@ ...@@ -7,180 +7,58 @@
#include "msgpool.h" #include "msgpool.h"
/* static void *alloc_fn(gfp_t gfp_mask, void *arg)
* We use msg pools to preallocate memory for messages we expect to {
* receive over the wire, to avoid getting ourselves into OOM struct ceph_msgpool *pool = arg;
* conditions at unexpected times. We take use a few different void *p;
* strategies:
*
* - for request/response type interactions, we preallocate the
* memory needed for the response when we generate the request.
*
* - for messages we can receive at any time from the MDS, we preallocate
* a pool of messages we can re-use.
*
* - for writeback, we preallocate some number of messages to use for
* requests and their replies, so that we always make forward
* progress.
*
* The msgpool behaves like a mempool_t, but keeps preallocated
* ceph_msgs strung together on a list_head instead of using a pointer
* vector. This avoids vector reallocation when we adjust the number
* of preallocated items (which happens frequently).
*/
p = ceph_msg_new(0, pool->front_len, gfp_mask);
if (!p)
pr_err("msgpool %s alloc failed\n", pool->name);
return p;
}
/* static void free_fn(void *element, void *arg)
* Allocate or release as necessary to meet our target pool size.
*/
static int __fill_msgpool(struct ceph_msgpool *pool)
{ {
struct ceph_msg *msg; ceph_msg_put(element);
while (pool->num < pool->min) {
dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
pool->min);
spin_unlock(&pool->lock);
msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
spin_lock(&pool->lock);
if (IS_ERR(msg))
return PTR_ERR(msg);
msg->pool = pool;
list_add(&msg->list_head, &pool->msgs);
pool->num++;
}
while (pool->num > pool->min) {
msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
pool->min, msg);
list_del_init(&msg->list_head);
pool->num--;
ceph_msg_kfree(msg);
}
return 0;
} }
int ceph_msgpool_init(struct ceph_msgpool *pool, int ceph_msgpool_init(struct ceph_msgpool *pool,
int front_len, int min, bool blocking) int front_len, int size, bool blocking, const char *name)
{ {
int ret;
dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
spin_lock_init(&pool->lock);
pool->front_len = front_len; pool->front_len = front_len;
INIT_LIST_HEAD(&pool->msgs); pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
pool->num = 0; if (!pool->pool)
pool->min = min; return -ENOMEM;
pool->blocking = blocking; pool->name = name;
init_waitqueue_head(&pool->wait); return 0;
spin_lock(&pool->lock);
ret = __fill_msgpool(pool);
spin_unlock(&pool->lock);
return ret;
} }
void ceph_msgpool_destroy(struct ceph_msgpool *pool) void ceph_msgpool_destroy(struct ceph_msgpool *pool)
{ {
dout("msgpool_destroy %p\n", pool); mempool_destroy(pool->pool);
spin_lock(&pool->lock);
pool->min = 0;
__fill_msgpool(pool);
spin_unlock(&pool->lock);
}
int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
{
int ret;
spin_lock(&pool->lock);
dout("msgpool_resv %p delta %d\n", pool, delta);
pool->min += delta;
ret = __fill_msgpool(pool);
spin_unlock(&pool->lock);
return ret;
} }
struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len) struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
int front_len)
{ {
wait_queue_t wait; if (front_len > pool->front_len) {
struct ceph_msg *msg; pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
pool->name, front_len, pool->front_len);
if (front_len && front_len > pool->front_len) {
pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
pool, front_len, pool->front_len);
WARN_ON(1); WARN_ON(1);
/* try to alloc a fresh message */ /* try to alloc a fresh message */
msg = ceph_msg_new(0, front_len, 0, 0, NULL); return ceph_msg_new(0, front_len, GFP_NOFS);
if (!IS_ERR(msg))
return msg;
}
if (!front_len)
front_len = pool->front_len;
if (pool->blocking) {
/* mempool_t behavior; first try to alloc */
msg = ceph_msg_new(0, front_len, 0, 0, NULL);
if (!IS_ERR(msg))
return msg;
} }
while (1) { return mempool_alloc(pool->pool, GFP_NOFS);
spin_lock(&pool->lock);
if (likely(pool->num)) {
msg = list_entry(pool->msgs.next, struct ceph_msg,
list_head);
list_del_init(&msg->list_head);
pool->num--;
dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
pool->num, pool->min);
spin_unlock(&pool->lock);
return msg;
}
pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
pool->min, pool->blocking ? "waiting" : "may fail");
spin_unlock(&pool->lock);
if (!pool->blocking) {
WARN_ON(1);
/* maybe we can allocate it now? */
msg = ceph_msg_new(0, front_len, 0, 0, NULL);
if (!IS_ERR(msg))
return msg;
pr_err("msgpool_get %p empty + alloc failed\n", pool);
return ERR_PTR(-ENOMEM);
}
init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
schedule();
finish_wait(&pool->wait, &wait);
}
} }
void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
{ {
spin_lock(&pool->lock);
if (pool->num < pool->min) {
/* reset msg front_len; user may have changed it */ /* reset msg front_len; user may have changed it */
msg->front.iov_len = pool->front_len; msg->front.iov_len = pool->front_len;
msg->hdr.front_len = cpu_to_le32(pool->front_len); msg->hdr.front_len = cpu_to_le32(pool->front_len);
kref_set(&msg->kref, 1); /* retake a single ref */ kref_init(&msg->kref); /* retake single ref */
list_add(&msg->list_head, &pool->msgs);
pool->num++;
dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
pool->num, pool->min);
spin_unlock(&pool->lock);
wake_up(&pool->wait);
} else {
dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
pool->num, pool->min);
spin_unlock(&pool->lock);
ceph_msg_kfree(msg);
}
} }
#ifndef _FS_CEPH_MSGPOOL #ifndef _FS_CEPH_MSGPOOL
#define _FS_CEPH_MSGPOOL #define _FS_CEPH_MSGPOOL
#include <linux/mempool.h>
#include "messenger.h" #include "messenger.h"
/* /*
...@@ -8,18 +9,15 @@ ...@@ -8,18 +9,15 @@
* avoid unexpected OOM conditions. * avoid unexpected OOM conditions.
*/ */
struct ceph_msgpool { struct ceph_msgpool {
spinlock_t lock; const char *name;
mempool_t *pool;
int front_len; /* preallocated payload size */ int front_len; /* preallocated payload size */
struct list_head msgs; /* msgs in the pool; each has 1 ref */
int num, min; /* cur, min # msgs in the pool */
bool blocking;
wait_queue_head_t wait;
}; };
extern int ceph_msgpool_init(struct ceph_msgpool *pool, extern int ceph_msgpool_init(struct ceph_msgpool *pool,
int front_len, int size, bool blocking); int front_len, int size, bool blocking,
const char *name);
extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
int front_len); int front_len);
extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
......
...@@ -50,7 +50,6 @@ struct ceph_entity_name { ...@@ -50,7 +50,6 @@ struct ceph_entity_name {
#define CEPH_ENTITY_TYPE_MDS 0x02 #define CEPH_ENTITY_TYPE_MDS 0x02
#define CEPH_ENTITY_TYPE_OSD 0x04 #define CEPH_ENTITY_TYPE_OSD 0x04
#define CEPH_ENTITY_TYPE_CLIENT 0x08 #define CEPH_ENTITY_TYPE_CLIENT 0x08
#define CEPH_ENTITY_TYPE_ADMIN 0x10
#define CEPH_ENTITY_TYPE_AUTH 0x20 #define CEPH_ENTITY_TYPE_AUTH 0x20
#define CEPH_ENTITY_TYPE_ANY 0xFF #define CEPH_ENTITY_TYPE_ANY 0xFF
...@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply { ...@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
/* /*
* message header * message header
*/ */
struct ceph_msg_header { struct ceph_msg_header_old {
__le64 seq; /* message seq# for this session */ __le64 seq; /* message seq# for this session */
__le64 tid; /* transaction id */ __le64 tid; /* transaction id */
__le16 type; /* message type */ __le16 type; /* message type */
...@@ -138,6 +137,24 @@ struct ceph_msg_header { ...@@ -138,6 +137,24 @@ struct ceph_msg_header {
__le32 crc; /* header crc32c */ __le32 crc; /* header crc32c */
} __attribute__ ((packed)); } __attribute__ ((packed));
struct ceph_msg_header {
__le64 seq; /* message seq# for this session */
__le64 tid; /* transaction id */
__le16 type; /* message type */
__le16 priority; /* priority. higher value == higher priority */
__le16 version; /* version of message encoding */
__le32 front_len; /* bytes in main payload */
__le32 middle_len;/* bytes in middle payload */
__le32 data_len; /* bytes of data payload */
__le16 data_off; /* sender: include full offset;
receiver: mask against ~PAGE_MASK */
struct ceph_entity_name src;
__le32 reserved;
__le32 crc; /* header crc32c */
} __attribute__ ((packed));
#define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_LOW 64
#define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_DEFAULT 127
#define CEPH_MSG_PRIO_HIGH 196 #define CEPH_MSG_PRIO_HIGH 196
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#define OSD_OP_FRONT_LEN 4096 #define OSD_OP_FRONT_LEN 4096
#define OSD_OPREPLY_FRONT_LEN 512 #define OSD_OPREPLY_FRONT_LEN 512
const static struct ceph_connection_operations osd_con_ops; static const struct ceph_connection_operations osd_con_ops;
static int __kick_requests(struct ceph_osd_client *osdc, static int __kick_requests(struct ceph_osd_client *osdc,
struct ceph_osd *kickosd); struct ceph_osd *kickosd);
...@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ...@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
req = kzalloc(sizeof(*req), GFP_NOFS); req = kzalloc(sizeof(*req), GFP_NOFS);
} }
if (req == NULL) if (req == NULL)
return ERR_PTR(-ENOMEM); return NULL;
req->r_osdc = osdc; req->r_osdc = osdc;
req->r_mempool = use_mempool; req->r_mempool = use_mempool;
...@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ...@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
else else
msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
if (IS_ERR(msg)) { if (!msg) {
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
return ERR_PTR(PTR_ERR(msg)); return NULL;
} }
req->r_reply = msg; req->r_reply = msg;
...@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ...@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (use_mempool) if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0); msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
else else
msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
if (IS_ERR(msg)) { if (!msg) {
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
return ERR_PTR(PTR_ERR(msg)); return NULL;
} }
msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
memset(msg->front.iov_base, 0, msg->front.iov_len); memset(msg->front.iov_base, 0, msg->front.iov_len);
...@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work) ...@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
* should mark the osd as failed and we should find out about * should mark the osd as failed and we should find out about
* it from an updated osd map. * it from an updated osd map.
*/ */
while (!list_empty(&osdc->req_lru)) { while (timeout && !list_empty(&osdc->req_lru)) {
req = list_entry(osdc->req_lru.next, struct ceph_osd_request, req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
r_req_lru_item); r_req_lru_item);
...@@ -1078,6 +1078,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1078,6 +1078,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
if (newmap) if (newmap)
kick_requests(osdc, NULL); kick_requests(osdc, NULL);
up_read(&osdc->map_sem); up_read(&osdc->map_sem);
wake_up(&osdc->client->auth_wq);
return; return;
bad: bad:
...@@ -1087,45 +1088,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1087,45 +1088,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
return; return;
} }
/*
* A read request prepares specific pages that data is to be read into.
* When a message is being read off the wire, we call prepare_pages to
* find those pages.
* 0 = success, -1 failure.
*/
static int __prepare_pages(struct ceph_connection *con,
struct ceph_msg_header *hdr,
struct ceph_osd_request *req,
u64 tid,
struct ceph_msg *m)
{
struct ceph_osd *osd = con->private;
struct ceph_osd_client *osdc;
int ret = -1;
int data_len = le32_to_cpu(hdr->data_len);
unsigned data_off = le16_to_cpu(hdr->data_off);
int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
if (!osd)
return -1;
osdc = osd->o_osdc;
dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
tid, req->r_num_pages, want);
if (unlikely(req->r_num_pages < want))
goto out;
m->pages = req->r_pages;
m->nr_pages = req->r_num_pages;
ret = 0; /* success */
out:
BUG_ON(ret < 0 || m->nr_pages < want);
return ret;
}
/* /*
* Register request, send initial attempt. * Register request, send initial attempt.
*/ */
...@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) ...@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
if (!osdc->req_mempool) if (!osdc->req_mempool)
goto out; goto out;
err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
"osd_op");
if (err < 0) if (err < 0)
goto out_mempool; goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply, err = ceph_msgpool_init(&osdc->msgpool_op_reply,
OSD_OPREPLY_FRONT_LEN, 10, true); OSD_OPREPLY_FRONT_LEN, 10, true,
"osd_op_reply");
if (err < 0) if (err < 0)
goto out_msgpool; goto out_msgpool;
return 0; return 0;
...@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, ...@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, 0, truncate_seq, truncate_size, NULL, NULL, 0, truncate_seq, truncate_size, NULL,
false, 1); false, 1);
if (IS_ERR(req)) if (!req)
return PTR_ERR(req); return -ENOMEM;
/* it may be a short read due to an object boundary */ /* it may be a short read due to an object boundary */
req->r_pages = pages; req->r_pages = pages;
...@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, ...@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
snapc, do_sync, snapc, do_sync,
truncate_seq, truncate_size, mtime, truncate_seq, truncate_size, mtime,
nofail, 1); nofail, 1);
if (IS_ERR(req)) if (!req)
return PTR_ERR(req); return -ENOMEM;
/* it may be a short write due to an object boundary */ /* it may be a short write due to an object boundary */
req->r_pages = pages; req->r_pages = pages;
...@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
} }
/* /*
* lookup and return message for incoming reply * lookup and return message for incoming reply. set up reply message
* pages.
*/ */
static struct ceph_msg *get_reply(struct ceph_connection *con, static struct ceph_msg *get_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr, struct ceph_msg_header *hdr,
...@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ...@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
int front = le32_to_cpu(hdr->front_len); int front = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len); int data_len = le32_to_cpu(hdr->data_len);
u64 tid; u64 tid;
int err;
tid = le64_to_cpu(hdr->tid); tid = le64_to_cpu(hdr->tid);
mutex_lock(&osdc->request_mutex); mutex_lock(&osdc->request_mutex);
...@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ...@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply, req->r_con_filling_msg); req->r_reply, req->r_con_filling_msg);
ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
ceph_con_put(req->r_con_filling_msg); ceph_con_put(req->r_con_filling_msg);
req->r_con_filling_msg = NULL;
} }
if (front > req->r_reply->front.iov_len) { if (front > req->r_reply->front.iov_len) {
pr_warning("get_reply front %d > preallocated %d\n", pr_warning("get_reply front %d > preallocated %d\n",
front, (int)req->r_reply->front.iov_len); front, (int)req->r_reply->front.iov_len);
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
if (IS_ERR(m)) if (!m)
goto out; goto out;
ceph_msg_put(req->r_reply); ceph_msg_put(req->r_reply);
req->r_reply = m; req->r_reply = m;
...@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ...@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
m = ceph_msg_get(req->r_reply); m = ceph_msg_get(req->r_reply);
if (data_len > 0) { if (data_len > 0) {
err = __prepare_pages(con, hdr, req, tid, m); unsigned data_off = le16_to_cpu(hdr->data_off);
if (err < 0) { int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
if (unlikely(req->r_num_pages < want)) {
pr_warning("tid %lld reply %d > expected %d pages\n",
tid, want, m->nr_pages);
*skip = 1; *skip = 1;
ceph_msg_put(m); ceph_msg_put(m);
m = ERR_PTR(err); m = NULL;
goto out;
} }
m->pages = req->r_pages;
m->nr_pages = req->r_num_pages;
} }
*skip = 0; *skip = 0;
req->r_con_filling_msg = ceph_con_get(con); req->r_con_filling_msg = ceph_con_get(con);
...@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, ...@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
switch (type) { switch (type) {
case CEPH_MSG_OSD_MAP: case CEPH_MSG_OSD_MAP:
return ceph_msg_new(type, front, 0, 0, NULL); return ceph_msg_new(type, front, GFP_NOFS);
case CEPH_MSG_OSD_OPREPLY: case CEPH_MSG_OSD_OPREPLY:
return get_reply(con, hdr, skip); return get_reply(con, hdr, skip);
default: default:
...@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con) ...@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc); return ceph_monc_validate_auth(&osdc->client->monc);
} }
const static struct ceph_connection_operations osd_con_ops = { static const struct ceph_connection_operations osd_con_ops = {
.get = get_osd_con, .get = get_osd_con,
.put = put_osd_con, .put = put_osd_con,
.dispatch = dispatch, .dispatch = dispatch,
......
...@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl) ...@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
static int ceph_pagelist_addpage(struct ceph_pagelist *pl) static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
{ {
struct page *page = alloc_page(GFP_NOFS); struct page *page = __page_cache_alloc(GFP_NOFS);
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;
pl->room += PAGE_SIZE; pl->room += PAGE_SIZE;
......
...@@ -101,8 +101,8 @@ struct ceph_pg_pool { ...@@ -101,8 +101,8 @@ struct ceph_pg_pool {
__le64 snap_seq; /* seq for per-pool snapshot */ __le64 snap_seq; /* seq for per-pool snapshot */
__le32 snap_epoch; /* epoch of last snap */ __le32 snap_epoch; /* epoch of last snap */
__le32 num_snaps; __le32 num_snaps;
__le32 num_removed_snap_intervals; __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
__le64 uid; __le64 auid; /* who owns the pg */
} __attribute__ ((packed)); } __attribute__ ((packed));
/* /*
...@@ -208,6 +208,7 @@ enum { ...@@ -208,6 +208,7 @@ enum {
/* read */ /* read */
CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
/* write */ /* write */
CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
...@@ -305,6 +306,22 @@ enum { ...@@ -305,6 +306,22 @@ enum {
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
#define EBLACKLISTED ESHUTDOWN /* blacklisted */ #define EBLACKLISTED ESHUTDOWN /* blacklisted */
/* xattr comparison */
enum {
CEPH_OSD_CMPXATTR_OP_NOP = 0,
CEPH_OSD_CMPXATTR_OP_EQ = 1,
CEPH_OSD_CMPXATTR_OP_NE = 2,
CEPH_OSD_CMPXATTR_OP_GT = 3,
CEPH_OSD_CMPXATTR_OP_GTE = 4,
CEPH_OSD_CMPXATTR_OP_LT = 5,
CEPH_OSD_CMPXATTR_OP_LTE = 6
};
enum {
CEPH_OSD_CMPXATTR_MODE_STRING = 1,
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
/* /*
* an individual object operation. each may be accompanied by some data * an individual object operation. each may be accompanied by some data
* payload * payload
...@@ -321,6 +338,8 @@ struct ceph_osd_op { ...@@ -321,6 +338,8 @@ struct ceph_osd_op {
struct { struct {
__le32 name_len; __le32 name_len;
__le32 value_len; __le32 value_len;
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
} __attribute__ ((packed)) xattr; } __attribute__ ((packed)) xattr;
struct { struct {
__u8 class_len; __u8 class_len;
......
...@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, ...@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap) struct ceph_cap_snap *capsnap)
{ {
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
BUG_ON(capsnap->writing); BUG_ON(capsnap->writing);
capsnap->size = inode->i_size; capsnap->size = inode->i_size;
......
...@@ -8,14 +8,11 @@ ...@@ -8,14 +8,11 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/mount.h> #include <linux/mount.h>
#include <linux/parser.h> #include <linux/parser.h>
#include <linux/rwsem.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/statfs.h> #include <linux/statfs.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include "decode.h" #include "decode.h"
#include "super.h" #include "super.h"
...@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) ...@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
static int ceph_syncfs(struct super_block *sb, int wait) static int ceph_syncfs(struct super_block *sb, int wait)
{ {
dout("sync_fs %d\n", wait); dout("sync_fs %d\n", wait);
ceph_osdc_sync(&ceph_client(sb)->osdc); ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
ceph_mdsc_sync(&ceph_client(sb)->mdsc); ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
dout("sync_fs %d done\n", wait); dout("sync_fs %d done\n", wait);
return 0; return 0;
} }
static int default_congestion_kb(void)
{
int congestion_kb;
/*
* Copied from NFS
*
* congestion size, scale with available memory.
*
* 64MB: 8192k
* 128MB: 11585k
* 256MB: 16384k
* 512MB: 23170k
* 1GB: 32768k
* 2GB: 46340k
* 4GB: 65536k
* 8GB: 92681k
* 16GB: 131072k
*
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
if (congestion_kb > 256*1024)
congestion_kb = 256*1024;
return congestion_kb;
}
/** /**
* ceph_show_options - Show mount options in /proc/mounts * ceph_show_options - Show mount options in /proc/mounts
...@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) ...@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",nocrc"); seq_puts(m, ",nocrc");
if (args->flags & CEPH_OPT_NOASYNCREADDIR) if (args->flags & CEPH_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir"); seq_puts(m, ",noasyncreaddir");
if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, ",osdkeepalivetimeout=%d",
args->osd_keepalive_timeout);
if (args->wsize)
seq_printf(m, ",wsize=%d", args->wsize);
if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
seq_printf(m, ",rsize=%d", args->rsize);
if (args->congestion_kb != default_congestion_kb())
seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
seq_printf(m, ",caps_wanted_delay_min=%d",
args->caps_wanted_delay_min);
if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
seq_printf(m, ",caps_wanted_delay_max=%d",
args->caps_wanted_delay_max);
if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
seq_printf(m, ",cap_release_safety=%d",
args->cap_release_safety);
if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
seq_printf(m, ",snapdirname=%s", args->snapdir_name); seq_printf(m, ",snapdirname=%s", args->snapdir_name);
if (args->name) if (args->name)
...@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo) ...@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
inode_init_once(&ci->vfs_inode); inode_init_once(&ci->vfs_inode);
} }
static int default_congestion_kb(void)
{
int congestion_kb;
/*
* Copied from NFS
*
* congestion size, scale with available memory.
*
* 64MB: 8192k
* 128MB: 11585k
* 256MB: 16384k
* 512MB: 23170k
* 1GB: 32768k
* 2GB: 46340k
* 4GB: 65536k
* 8GB: 92681k
* 16GB: 131072k
*
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
if (congestion_kb > 256*1024)
congestion_kb = 256*1024;
return congestion_kb;
}
static int __init init_caches(void) static int __init init_caches(void)
{ {
ceph_inode_cachep = kmem_cache_create("ceph_inode_info", ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
...@@ -308,7 +333,9 @@ enum { ...@@ -308,7 +333,9 @@ enum {
Opt_osd_idle_ttl, Opt_osd_idle_ttl,
Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max, Opt_caps_wanted_delay_max,
Opt_cap_release_safety,
Opt_readdir_max_entries, Opt_readdir_max_entries,
Opt_readdir_max_bytes,
Opt_congestion_kb, Opt_congestion_kb,
Opt_last_int, Opt_last_int,
/* int args above */ /* int args above */
...@@ -339,7 +366,9 @@ static match_table_t arg_tokens = { ...@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
{Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
{Opt_cap_release_safety, "cap_release_safety=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"},
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */ /* int args above */
{Opt_snapdirname, "snapdirname=%s"}, {Opt_snapdirname, "snapdirname=%s"},
...@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, ...@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
args->max_readdir = 1024; args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
args->congestion_kb = default_congestion_kb(); args->congestion_kb = default_congestion_kb();
/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
...@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, ...@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
case Opt_readdir_max_entries: case Opt_readdir_max_entries:
args->max_readdir = intval; args->max_readdir = intval;
break; break;
case Opt_readdir_max_bytes:
args->max_readdir_bytes = intval;
break;
case Opt_congestion_kb: case Opt_congestion_kb:
args->congestion_kb = intval; args->congestion_kb = intval;
break; break;
...@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) ...@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
/* /*
* true if we have the mon map (and have thus joined the cluster) * true if we have the mon map (and have thus joined the cluster)
*/ */
static int have_mon_map(struct ceph_client *client) static int have_mon_and_osd_map(struct ceph_client *client)
{ {
return client->monc.monmap && client->monc.monmap->epoch; return client->monc.monmap && client->monc.monmap->epoch &&
client->osdc.osdmap && client->osdc.osdmap->epoch;
} }
/* /*
...@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, ...@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
if (err < 0) if (err < 0)
goto out; goto out;
while (!have_mon_map(client)) { while (!have_mon_and_osd_map(client)) {
err = -EIO; err = -EIO;
if (timeout && time_after_eq(jiffies, started + timeout)) if (timeout && time_after_eq(jiffies, started + timeout))
goto out; goto out;
...@@ -770,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, ...@@ -770,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
/* wait */ /* wait */
dout("mount waiting for mon_map\n"); dout("mount waiting for mon_map\n");
err = wait_event_interruptible_timeout(client->auth_wq, err = wait_event_interruptible_timeout(client->auth_wq,
have_mon_map(client) || (client->auth_err < 0), have_mon_and_osd_map(client) || (client->auth_err < 0),
timeout); timeout);
if (err == -EINTR || err == -ERESTARTSYS) if (err == -EINTR || err == -ERESTARTSYS)
goto out; goto out;
...@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data) ...@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
/* /*
* construct our own bdi so we can control readahead, etc. * construct our own bdi so we can control readahead, etc.
*/ */
static atomic_long_t bdi_seq = ATOMIC_INIT(0);
static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
{ {
int err; int err;
...@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) ...@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
client->backing_dev_info.ra_pages = client->backing_dev_info.ra_pages =
(client->mount_args->rsize + PAGE_CACHE_SIZE - 1) (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT; >> PAGE_SHIFT;
err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
atomic_long_inc_return(&bdi_seq));
if (!err) if (!err)
sb->s_bdi = &client->backing_dev_info; sb->s_bdi = &client->backing_dev_info;
return err; return err;
...@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type, ...@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
goto out; goto out;
} }
if (ceph_client(sb) != client) { if (ceph_sb_to_client(sb) != client) {
ceph_destroy_client(client); ceph_destroy_client(client);
client = ceph_client(sb); client = ceph_sb_to_client(sb);
dout("get_sb got existing client %p\n", client); dout("get_sb got existing client %p\n", client);
} else { } else {
dout("get_sb using new client %p\n", client); dout("get_sb using new client %p\n", client);
......
...@@ -52,24 +52,25 @@ ...@@ -52,24 +52,25 @@
struct ceph_mount_args { struct ceph_mount_args {
int sb_flags; int sb_flags;
int flags;
struct ceph_fsid fsid;
struct ceph_entity_addr my_addr;
int num_mon; int num_mon;
struct ceph_entity_addr *mon_addr; struct ceph_entity_addr *mon_addr;
int flags;
int mount_timeout; int mount_timeout;
int osd_idle_ttl; int osd_idle_ttl;
int caps_wanted_delay_min, caps_wanted_delay_max;
struct ceph_fsid fsid;
struct ceph_entity_addr my_addr;
int wsize;
int rsize; /* max readahead */
int max_readdir; /* max readdir size */
int congestion_kb; /* max readdir size */
int osd_timeout; int osd_timeout;
int osd_keepalive_timeout; int osd_keepalive_timeout;
int wsize;
int rsize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
int cap_release_safety;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
char *snapdir_name; /* default ".snap" */ char *snapdir_name; /* default ".snap" */
char *name; char *name;
char *secret; char *secret;
int cap_release_safety;
}; };
/* /*
...@@ -80,13 +81,14 @@ struct ceph_mount_args { ...@@ -80,13 +81,14 @@ struct ceph_mount_args {
#define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_KEEPALIVE_DEFAULT 5
#define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_OSD_IDLE_TTL_DEFAULT 60
#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap" #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
#define CEPH_AUTH_NAME_DEFAULT "guest" #define CEPH_AUTH_NAME_DEFAULT "guest"
/* /*
* Delay telling the MDS we no longer want caps, in case we reopen * Delay telling the MDS we no longer want caps, in case we reopen
* the file. Delay a minimum amount of time, even if we send a cap * the file. Delay a minimum amount of time, even if we send a cap
...@@ -96,6 +98,7 @@ struct ceph_mount_args { ...@@ -96,6 +98,7 @@ struct ceph_mount_args {
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
/* mount state */ /* mount state */
enum { enum {
...@@ -160,12 +163,6 @@ struct ceph_client { ...@@ -160,12 +163,6 @@ struct ceph_client {
#endif #endif
}; };
static inline struct ceph_client *ceph_client(struct super_block *sb)
{
return sb->s_fs_info;
}
/* /*
* File i/o capability. This tracks shared state with the metadata * File i/o capability. This tracks shared state with the metadata
* server that allows us to cache or writeback attributes or to read * server that allows us to cache or writeback attributes or to read
...@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, ...@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_add(struct dentry *dn);
extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
/* /*
* our d_ops vary depending on whether the inode is live, * our d_ops vary depending on whether the inode is live,
......
...@@ -7,7 +7,8 @@ ...@@ -7,7 +7,8 @@
static bool ceph_is_valid_xattr(const char *name) static bool ceph_is_valid_xattr(const char *name)
{ {
return !strncmp(name, XATTR_SECURITY_PREFIX, return !strncmp(name, "ceph.", 5) ||
!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) || XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
...@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, ...@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
} }
static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
{ true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, { true, "ceph.dir.entries", ceph_vxattrcb_entries},
{ true, "user.ceph.dir.files", ceph_vxattrcb_files}, { true, "ceph.dir.files", ceph_vxattrcb_files},
{ true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
{ true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
{ true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
{ true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
{ true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
{ true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
{ true, NULL, NULL } { true, NULL, NULL }
}; };
...@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ...@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
} }
static struct ceph_vxattr_cb ceph_file_vxattrs[] = { static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
{ true, "user.ceph.layout", ceph_vxattrcb_layout}, { true, "ceph.layout", ceph_vxattrcb_layout},
{ NULL, NULL } { NULL, NULL }
}; };
...@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci, ...@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.names_size -= xattr->name_len;
ci->i_xattrs.vals_size -= xattr->val_len; ci->i_xattrs.vals_size -= xattr->val_len;
} }
if (!xattr) {
pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
&ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
xattr->val);
return -ENOMEM;
}
ci->i_xattrs.names_size += name_len; ci->i_xattrs.names_size += name_len;
ci->i_xattrs.vals_size += val_len; ci->i_xattrs.vals_size += val_len;
if (val) if (val)
...@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ...@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
ci->i_xattrs.version, ci->i_xattrs.index_version); ci->i_xattrs.version, ci->i_xattrs.index_version);
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version > ci->i_xattrs.version)) { (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto list_xattr; goto list_xattr;
} else { } else {
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
...@@ -622,7 +617,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ...@@ -622,7 +617,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
static int ceph_sync_setxattr(struct dentry *dentry, const char *name, static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
const char *value, size_t size, int flags) const char *value, size_t size, int flags)
{ {
struct ceph_client *client = ceph_client(dentry->d_sb); struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
struct inode *inode = dentry->d_inode; struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct inode *parent_inode = dentry->d_parent->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode;
...@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, ...@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
return -ENOMEM; return -ENOMEM;
err = -ENOMEM; err = -ENOMEM;
for (i = 0; i < nr_pages; i++) { for (i = 0; i < nr_pages; i++) {
pages[i] = alloc_page(GFP_NOFS); pages[i] = __page_cache_alloc(GFP_NOFS);
if (!pages[i]) { if (!pages[i]) {
nr_pages = i; nr_pages = i;
goto out; goto out;
...@@ -779,7 +774,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -779,7 +774,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
static int ceph_send_removexattr(struct dentry *dentry, const char *name) static int ceph_send_removexattr(struct dentry *dentry, const char *name)
{ {
struct ceph_client *client = ceph_client(dentry->d_sb); struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = &client->mdsc; struct ceph_mds_client *mdsc = &client->mdsc;
struct inode *inode = dentry->d_inode; struct inode *inode = dentry->d_inode;
struct inode *parent_inode = dentry->d_parent->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册