diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d74040fac343643e3895871dfaa238bd14da3b45..177927a8f007739aae66f0833806670acf587334 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
  * locking semantics of the file system using the protocol. It should
  * be somewhere else, I'm sure, but right now it isn't.
  *
+ * New in version 7:
+ * 	- DLM join domain includes the live nodemap
+ *
  * New in version 6:
  * 	- DLM lockres remote refcount fixes.
  *
@@ -54,7 +57,7 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 6ULL
+#define O2NET_PROTOCOL_VERSION 7ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 2f4f5d4edb07f398bb2200ef6a209bcaf51a48cb..e90b92f9ece1e01d45c43a684ea12facd83d705d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -625,12 +625,16 @@ struct dlm_begin_reco
 };
 
 
+#define BITS_PER_BYTE 8
+#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
+
 struct dlm_query_join_request
 {
 	u8 node_idx;
 	u8 pad1[2];
 	u8 name_len;
 	u8 domain[O2NM_MAX_NAME_LEN];
+	u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)];
 };
 
 struct dlm_assert_joined
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6590e1bca23c9ffc38a2858e7506c07f7ddf517e..19b57a6bcb1a72c257b81d6a6fc40e4f1401486f 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -48,6 +48,46 @@
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
 
+/*
+ * ocfs2 node maps are arrays of unsigned long, which cannot be sent across
+ * the wire as-is because of endianness issues. To work around this, the
+ * maps are converted to byte arrays. The following 3 routines are helper
+ * functions to set/test/copy bits within those arrays of bytes: bit nr is
+ * stored as bit (nr & 7) of byte (nr >> 3). Bit numbers are unsigned int
+ * so that an index wider than 8 bits is never silently truncated.
+ */
+
+/* Set bit nr in the byte array map; map must hold at least nr/8 + 1 bytes. */
+static inline void byte_set_bit(unsigned int nr, u8 map[])
+{
+	map[nr >> 3] |= (u8)(1U << (nr & 7));
+}
+
+/* Return 1 if bit nr is set in the byte array map, 0 otherwise. */
+static inline int byte_test_bit(unsigned int nr, const u8 map[])
+{
+	return (map[nr >> 3] & (1U << (nr & 7))) != 0;
+}
+
+/*
+ * Copy the first sz bits of the unsigned long bitmap smap into the byte
+ * array dmap. dmap must hold at least (sz + 7) / 8 bytes; all of them are
+ * rewritten. A zero sz leaves dmap untouched.
+ */
+static inline void byte_copymap(u8 dmap[], unsigned long smap[],
+			unsigned int sz)
+{
+	unsigned int nn;
+
+	if (!sz)
+		return;
+
+	memset(dmap, 0, ((sz + 7) >> 3));
+	for (nn = 0; nn < sz; nn++)
+		if (test_bit(nn, smap))
+			byte_set_bit(nn, dmap);
+}
+
 static void dlm_free_pagevec(void **vec, int pages)
 {
 	while (pages--)
@@ -641,6 +681,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 	struct dlm_query_join_request *query;
 	enum dlm_query_join_response response;
 	struct dlm_ctxt *dlm = NULL;
+	unsigned int nodenum;
 
 	query = (struct dlm_query_join_request *) msg->buf;
 
@@ -664,6 +705,23 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	spin_lock(&dlm_domain_lock);
 	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+	if (!dlm)
+		goto unlock_respond;
+
+	/*
+	 * There is a small window where the joining node may not see the
+	 * node(s) that just left but are still part of the cluster. DISALLOW
+	 * the join request unless every node in our domain map is also set
+	 * in the joining node's node map.
+	 */
+	for (nodenum = 0; nodenum < O2NM_MAX_NODES; nodenum++) {
+		if (test_bit(nodenum, dlm->domain_map) &&
+		    !byte_test_bit(nodenum, query->node_map)) {
+			response = JOIN_DISALLOW;
+			goto unlock_respond;
+		}
+	}
+
 	/* Once the dlm ctxt is marked as leaving then we don't want
 	 * to be put in someone's domain map.
 	 * Also, explicitly disallow joining at certain troublesome
@@ -705,6 +763,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 
 		spin_unlock(&dlm->spinlock);
 	}
+unlock_respond:
 	spin_unlock(&dlm_domain_lock);
 
 respond:
@@ -854,6 +913,9 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
 	join_msg.name_len = strlen(dlm->name);
 	memcpy(join_msg.domain, dlm->name, join_msg.name_len);
 
+	/* copy live node map to join message */
+	byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
+
 	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
 				    sizeof(join_msg), node, &retval);
 	if (status < 0 && status != -ENOPROTOOPT) {