Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Turbo码先生
redis
提交
6fa9b1a4
R
redis
项目概览
Turbo码先生
/
redis
与 Fork 源项目一致
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
R
redis
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
6fa9b1a4
编写于
10月 08, 2013
作者:
A
antirez
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'bettercluster' into unstable
上级
0150c70b
ae2763f5
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
371 addition
and
200 deletion
+371
-200
src/cluster.c
src/cluster.c
+328
-188
src/endianconv.h
src/endianconv.h
+10
-0
src/redis-trib.rb
src/redis-trib.rb
+3
-3
src/redis.c
src/redis.c
+3
-0
src/redis.h
src/redis.h
+26
-6
src/replication.c
src/replication.c
+1
-1
src/sentinel.c
src/sentinel.c
+0
-2
未找到文件。
src/cluster.c
浏览文件 @
6fa9b1a4
...
...
@@ -53,12 +53,31 @@ int clusterDelSlot(int slot);
int
clusterDelNodeSlots
(
clusterNode
*
node
);
int
clusterNodeSetSlotBit
(
clusterNode
*
n
,
int
slot
);
void
clusterSetMaster
(
clusterNode
*
n
);
void
clusterHandleSlaveFailover
(
void
);
int
bitmapTestBit
(
unsigned
char
*
bitmap
,
int
pos
);
void
clusterDoBeforeSleep
(
int
flags
);
/* -----------------------------------------------------------------------------
* Initialization
* -------------------------------------------------------------------------- */
/* This function is called at startup in order to set the currentEpoch
* (which is not saved on permanent storage) to the greatest configEpoch found
* in the loaded nodes (configEpoch is stored on permanent storage as soon as
* it changes for some node). */
void
clusterSetStartupEpoch
()
{
dictIterator
*
di
;
dictEntry
*
de
;
di
=
dictGetSafeIterator
(
server
.
cluster
->
nodes
);
while
((
de
=
dictNext
(
di
))
!=
NULL
)
{
clusterNode
*
node
=
dictGetVal
(
de
);
if
(
node
->
configEpoch
>
server
.
cluster
->
currentEpoch
)
server
.
cluster
->
currentEpoch
=
node
->
configEpoch
;
}
dictReleaseIterator
(
di
);
}
int
clusterLoadConfig
(
char
*
filename
)
{
FILE
*
fp
=
fopen
(
filename
,
"r"
);
char
*
line
;
...
...
@@ -117,8 +136,6 @@ int clusterLoadConfig(char *filename) {
n
->
flags
|=
REDIS_NODE_HANDSHAKE
;
}
else
if
(
!
strcasecmp
(
s
,
"noaddr"
))
{
n
->
flags
|=
REDIS_NODE_NOADDR
;
}
else
if
(
!
strcasecmp
(
s
,
"promoted"
))
{
n
->
flags
|=
REDIS_NODE_PROMOTED
;
}
else
if
(
!
strcasecmp
(
s
,
"noflags"
))
{
/* nothing to do */
}
else
{
...
...
@@ -143,8 +160,11 @@ int clusterLoadConfig(char *filename) {
if
(
atoi
(
argv
[
4
]))
n
->
ping_sent
=
time
(
NULL
);
if
(
atoi
(
argv
[
5
]))
n
->
pong_received
=
time
(
NULL
);
/* Set configEpoch for this node. */
n
->
configEpoch
=
strtoull
(
argv
[
6
],
NULL
,
10
);
/* Populate hash slots served by this instance. */
for
(
j
=
7
;
j
<
argc
;
j
++
)
{
for
(
j
=
8
;
j
<
argc
;
j
++
)
{
int
start
,
stop
;
if
(
argv
[
j
][
0
]
==
'['
)
{
...
...
@@ -189,6 +209,7 @@ int clusterLoadConfig(char *filename) {
redisAssert
(
server
.
cluster
->
myself
!=
NULL
);
redisLog
(
REDIS_NOTICE
,
"Node configuration loaded, I'm %.40s"
,
server
.
cluster
->
myself
->
name
);
clusterSetStartupEpoch
();
clusterUpdateState
();
return
REDIS_OK
;
...
...
@@ -202,13 +223,14 @@ fmterr:
*
* This function writes the node config and returns 0, on error -1
* is returned. */
int
clusterSaveConfig
(
void
)
{
int
clusterSaveConfig
(
int
do_fsync
)
{
sds
ci
=
clusterGenNodesDescription
(
REDIS_NODE_HANDSHAKE
);
int
fd
;
if
((
fd
=
open
(
server
.
cluster_configfile
,
O_WRONLY
|
O_CREAT
|
O_TRUNC
,
0644
))
==
-
1
)
goto
err
;
if
(
write
(
fd
,
ci
,
sdslen
(
ci
))
!=
(
ssize_t
)
sdslen
(
ci
))
goto
err
;
if
(
do_fsync
)
fsync
(
fd
);
close
(
fd
);
sdsfree
(
ci
);
return
0
;
...
...
@@ -218,8 +240,8 @@ err:
return
-
1
;
}
void
clusterSaveConfigOrDie
(
void
)
{
if
(
clusterSaveConfig
()
==
-
1
)
{
void
clusterSaveConfigOrDie
(
int
do_fsync
)
{
if
(
clusterSaveConfig
(
do_fsync
)
==
-
1
)
{
redisLog
(
REDIS_WARNING
,
"Fatal: can't update cluster config file."
);
exit
(
1
);
}
...
...
@@ -230,11 +252,16 @@ void clusterInit(void) {
server
.
cluster
=
zmalloc
(
sizeof
(
clusterState
));
server
.
cluster
->
myself
=
NULL
;
server
.
cluster
->
currentEpoch
=
0
;
server
.
cluster
->
state
=
REDIS_CLUSTER_FAIL
;
server
.
cluster
->
size
=
1
;
server
.
cluster
->
nodes
=
dictCreate
(
&
clusterNodesDictType
,
NULL
);
server
.
cluster
->
failover_auth_time
=
0
;
server
.
cluster
->
failover_auth_count
=
0
;
server
.
cluster
->
failover_auth_epoch
=
0
;
server
.
cluster
->
last_vote_epoch
=
0
;
server
.
cluster
->
stats_bus_messages_sent
=
0
;
server
.
cluster
->
stats_bus_messages_received
=
0
;
memset
(
server
.
cluster
->
migrating_slots_to
,
0
,
sizeof
(
server
.
cluster
->
migrating_slots_to
));
memset
(
server
.
cluster
->
importing_slots_from
,
0
,
...
...
@@ -251,7 +278,7 @@ void clusterInit(void) {
clusterAddNode
(
server
.
cluster
->
myself
);
saveconf
=
1
;
}
if
(
saveconf
)
clusterSaveConfigOrDie
();
if
(
saveconf
)
clusterSaveConfigOrDie
(
1
);
/* We need a listening TCP port for our cluster messaging needs. */
server
.
cfd_count
=
0
;
...
...
@@ -360,6 +387,7 @@ clusterNode *createClusterNode(char *nodename, int flags) {
else
getRandomHexChars
(
node
->
name
,
REDIS_CLUSTER_NAMELEN
);
node
->
ctime
=
time
(
NULL
);
node
->
configEpoch
=
0
;
node
->
flags
=
flags
;
memset
(
node
->
slots
,
0
,
sizeof
(
node
->
slots
));
node
->
numslots
=
0
;
...
...
@@ -372,6 +400,7 @@ clusterNode *createClusterNode(char *nodename, int flags) {
memset
(
node
->
ip
,
0
,
sizeof
(
node
->
ip
));
node
->
port
=
0
;
node
->
fail_reports
=
listCreate
();
node
->
voted_time
=
0
;
listSetFreeMethod
(
node
->
fail_reports
,
zfree
);
return
node
;
}
...
...
@@ -637,15 +666,13 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
* reachable nodes to flag the node as FAIL. */
if
(
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_MASTER
)
clusterSendFail
(
node
->
name
);
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
);
}
/* This function is called only if a node is marked as FAIL, but we are able
* to reach it again. It checks if there are the conditions to undo the FAIL
* state. */
void
clearNodeFailureIfNeeded
(
clusterNode
*
node
)
{
int
changes
=
0
;
time_t
now
=
time
(
NULL
);
redisAssert
(
node
->
flags
&
REDIS_NODE_FAIL
);
...
...
@@ -654,10 +681,10 @@ void clearNodeFailureIfNeeded(clusterNode *node) {
* node again. */
if
(
node
->
flags
&
REDIS_NODE_SLAVE
)
{
redisLog
(
REDIS_NOTICE
,
"Clear FAIL state for node %.40s: slave is
already reachable
."
,
"Clear FAIL state for node %.40s: slave is
reachable again
."
,
node
->
name
);
node
->
flags
&=
~
REDIS_NODE_FAIL
;
c
hanges
++
;
c
lusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
)
;
}
/* If it is a master and...
...
...
@@ -677,13 +704,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) {
"Clear FAIL state for node %.40s: is reachable again and nobody is serving its slots after some time."
,
node
->
name
);
node
->
flags
&=
~
REDIS_NODE_FAIL
;
changes
++
;
}
/* Update state and save config. */
if
(
changes
)
{
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
);
}
}
...
...
@@ -727,7 +748,6 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
if
(
flags
&
REDIS_NODE_FAIL
)
ci
=
sdscat
(
ci
,
"fail,"
);
if
(
flags
&
REDIS_NODE_HANDSHAKE
)
ci
=
sdscat
(
ci
,
"handshake,"
);
if
(
flags
&
REDIS_NODE_NOADDR
)
ci
=
sdscat
(
ci
,
"noaddr,"
);
if
(
flags
&
REDIS_NODE_PROMOTED
)
ci
=
sdscat
(
ci
,
"promoted,"
);
if
(
ci
[
sdslen
(
ci
)
-
1
]
==
','
)
ci
[
sdslen
(
ci
)
-
1
]
=
' '
;
redisLog
(
REDIS_DEBUG
,
"GOSSIP %.40s %s:%d %s"
,
...
...
@@ -850,8 +870,10 @@ int clusterProcessPacket(clusterLink *link) {
uint32_t
totlen
=
ntohl
(
hdr
->
totlen
);
uint16_t
type
=
ntohs
(
hdr
->
type
);
uint16_t
flags
=
ntohs
(
hdr
->
flags
);
uint64_t
senderCurrentEpoch
,
senderConfigEpoch
;
clusterNode
*
sender
;
server
.
cluster
->
stats_bus_messages_received
++
;
redisLog
(
REDIS_DEBUG
,
"--- Processing packet of type %d, %lu bytes"
,
type
,
(
unsigned
long
)
totlen
);
...
...
@@ -886,11 +908,23 @@ int clusterProcessPacket(clusterLink *link) {
if
(
totlen
!=
explen
)
return
1
;
}
/*
Process packets by typ
e. */
/*
Check if the sender is a known nod
e. */
sender
=
clusterLookupNode
(
hdr
->
sender
);
if
(
sender
&&
!
(
sender
->
flags
&
REDIS_NODE_HANDSHAKE
))
{
/* Update our curretEpoch if we see a newer epoch in the cluster. */
senderCurrentEpoch
=
ntohu64
(
hdr
->
currentEpoch
);
senderConfigEpoch
=
ntohu64
(
hdr
->
configEpoch
);
if
(
senderCurrentEpoch
>
server
.
cluster
->
currentEpoch
)
server
.
cluster
->
currentEpoch
=
senderCurrentEpoch
;
/* Update the sender configEpoch if it is publishing a newer one. */
if
(
senderConfigEpoch
>
sender
->
configEpoch
)
{
sender
->
configEpoch
=
senderConfigEpoch
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_FSYNC_CONFIG
);
}
}
/* Process packets by type. */
if
(
type
==
CLUSTERMSG_TYPE_PING
||
type
==
CLUSTERMSG_TYPE_MEET
)
{
int
update_config
=
0
;
redisLog
(
REDIS_DEBUG
,
"Ping packet received: %p"
,
(
void
*
)
link
->
node
);
/* Add this node if it is new for us and the msg type is MEET.
...
...
@@ -904,7 +938,7 @@ int clusterProcessPacket(clusterLink *link) {
nodeIp2String
(
node
->
ip
,
link
);
node
->
port
=
ntohs
(
hdr
->
port
);
clusterAddNode
(
node
);
update_config
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
)
;
}
/* Get info from the gossip section */
...
...
@@ -912,18 +946,12 @@ int clusterProcessPacket(clusterLink *link) {
/* Anyway reply with a PONG */
clusterSendPing
(
link
,
CLUSTERMSG_TYPE_PONG
);
/* Update config if needed */
if
(
update_config
)
clusterSaveConfigOrDie
();
}
/* PING or PONG: process config information. */
if
(
type
==
CLUSTERMSG_TYPE_PING
||
type
==
CLUSTERMSG_TYPE_PONG
||
type
==
CLUSTERMSG_TYPE_MEET
)
{
int
update_state
=
0
;
int
update_config
=
0
;
redisLog
(
REDIS_DEBUG
,
"%s packet received: %p"
,
type
==
CLUSTERMSG_TYPE_PING
?
"ping"
:
"pong"
,
(
void
*
)
link
->
node
);
...
...
@@ -936,8 +964,8 @@ int clusterProcessPacket(clusterLink *link) {
"Handshake error: we already know node %.40s, updating the address if needed."
,
sender
->
name
);
if
(
nodeUpdateAddressIfNeeded
(
sender
,
link
,
ntohs
(
hdr
->
port
)))
{
cluster
UpdateState
();
clusterSaveConfigOrDie
(
);
cluster
DoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
);
}
/* Free this node as we alrady have it. This will
* cause the link to be freed as well. */
...
...
@@ -952,7 +980,7 @@ int clusterProcessPacket(clusterLink *link) {
link
->
node
->
name
);
link
->
node
->
flags
&=
~
REDIS_NODE_HANDSHAKE
;
link
->
node
->
flags
|=
flags
&
(
REDIS_NODE_MASTER
|
REDIS_NODE_SLAVE
);
update_config
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
)
;
}
else
if
(
memcmp
(
link
->
node
->
name
,
hdr
->
sender
,
REDIS_CLUSTER_NAMELEN
)
!=
0
)
{
...
...
@@ -964,7 +992,7 @@ int clusterProcessPacket(clusterLink *link) {
link
->
node
->
ip
[
0
]
=
'\0'
;
link
->
node
->
port
=
0
;
freeClusterLink
(
link
);
update_config
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
)
;
/* FIXME: remove this node if we already have it.
*
* If we already have it but the IP is different, use
...
...
@@ -979,8 +1007,7 @@ int clusterProcessPacket(clusterLink *link) {
!
(
sender
->
flags
&
REDIS_NODE_HANDSHAKE
)
&&
nodeUpdateAddressIfNeeded
(
sender
,
link
,
ntohs
(
hdr
->
port
)))
{
update_state
=
1
;
update_config
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
);
}
/* Update our info about the node */
...
...
@@ -996,7 +1023,8 @@ int clusterProcessPacket(clusterLink *link) {
* conditions detected by clearNodeFailureIfNeeded(). */
if
(
link
->
node
->
flags
&
REDIS_NODE_PFAIL
)
{
link
->
node
->
flags
&=
~
REDIS_NODE_PFAIL
;
update_state
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
);
}
else
if
(
link
->
node
->
flags
&
REDIS_NODE_FAIL
)
{
clearNodeFailureIfNeeded
(
link
->
node
);
}
...
...
@@ -1009,9 +1037,6 @@ int clusterProcessPacket(clusterLink *link) {
{
/* Node is a master. */
if
(
sender
->
flags
&
REDIS_NODE_SLAVE
)
{
/* Slave turned into master! */
clusterNode
*
oldmaster
=
sender
->
slaveof
;
/* Reconfigure node as master. */
if
(
sender
->
slaveof
)
clusterNodeRemoveSlave
(
sender
->
slaveof
,
sender
);
...
...
@@ -1019,32 +1044,9 @@ int clusterProcessPacket(clusterLink *link) {
sender
->
flags
|=
REDIS_NODE_MASTER
;
sender
->
slaveof
=
NULL
;
/* If this node used to be our slave, and now has the
* PROMOTED flag set. We'll turn ourself into a slave
* of the new master. */
if
(
flags
&
REDIS_NODE_PROMOTED
&&
oldmaster
==
server
.
cluster
->
myself
)
{
redisLog
(
REDIS_WARNING
,
"One of my slaves took my place. Reconfiguring myself as a replica of %.40s"
,
sender
->
name
);
clusterDelNodeSlots
(
server
.
cluster
->
myself
);
clusterSetMaster
(
sender
);
}
/* If we are a slave, and this node used to be a slave
* of our master, and now has the PROMOTED flag set, we
* need to switch our replication setup over it. */
if
(
flags
&
REDIS_NODE_PROMOTED
&&
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_SLAVE
&&
server
.
cluster
->
myself
->
slaveof
==
oldmaster
)
{
redisLog
(
REDIS_WARNING
,
"One of the slaves failed over my master. Reconfiguring myself as a replica of %.40s"
,
sender
->
name
);
clusterDelNodeSlots
(
server
.
cluster
->
myself
);
clusterSetMaster
(
sender
);
}
/* Update config and state. */
update_state
=
1
;
update_config
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
)
;
}
}
else
{
/* Node is a slave. */
...
...
@@ -1060,8 +1062,8 @@ int clusterProcessPacket(clusterLink *link) {
if
(
sender
->
numslaves
)
clusterNodeResetSlaves
(
sender
);
/* Update config and state. */
update_state
=
1
;
update_config
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
)
;
}
/* Master node changed for this slave? */
...
...
@@ -1070,6 +1072,9 @@ int clusterProcessPacket(clusterLink *link) {
clusterNodeRemoveSlave
(
sender
->
slaveof
,
sender
);
clusterNodeAddSlave
(
master
,
sender
);
sender
->
slaveof
=
master
;
/* Update config. */
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
);
}
}
}
...
...
@@ -1083,55 +1088,91 @@ int clusterProcessPacket(clusterLink *link) {
changes
=
memcmp
(
sender
->
slots
,
hdr
->
myslots
,
sizeof
(
hdr
->
myslots
))
!=
0
;
if
(
changes
)
{
clusterNode
*
curmaster
,
*
newmaster
=
NULL
;
/* Here we set curmaster to this node or the node this node
* replicates to if it's a slave. In the for loop we are
* interested to check if slots are taken away from curmaster. */
if
(
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_MASTER
)
curmaster
=
server
.
cluster
->
myself
;
else
curmaster
=
server
.
cluster
->
myself
->
slaveof
;
for
(
j
=
0
;
j
<
REDIS_CLUSTER_SLOTS
;
j
++
)
{
if
(
bitmapTestBit
(
hdr
->
myslots
,
j
))
{
/*
If this slot was not served, or served by a node
*
in FAIL state, update the table with the new node
*
claiming to serve the slot
. */
/*
We rebind the slot to the new node claiming it if:
*
1) The slot was unassigned.
*
2) The new node claims it with a greater configEpoch
. */
if
(
server
.
cluster
->
slots
[
j
]
==
sender
)
continue
;
if
(
server
.
cluster
->
slots
[
j
]
==
NULL
||
server
.
cluster
->
slots
[
j
]
->
flags
&
REDIS_NODE_FAIL
)
server
.
cluster
->
slots
[
j
]
->
configEpoch
<
senderConfigEpoch
)
{
if
(
server
.
cluster
->
slots
[
j
]
==
curmaster
)
newmaster
=
sender
;
clusterDelSlot
(
j
);
clusterAddSlot
(
sender
,
j
);
update_state
=
update_config
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_FSYNC_CONFIG
);
}
}
else
{
/* This node claims to no longer handling the slot,
* however we don't change our config as this is likely
* happening because a resharding is in progress, and
* it already knows where to redirect clients. */
* however we don't change our config as this is likely:
* 1) Rehashing in progress.
* 2) Failover.
* In both cases we'll be informed about who is serving
* the slot eventually. In the meantime it's up to the
* original owner to try to redirect our clients to the
* right node. */
}
}
/* If at least one slot was reassigned from a node to another node
* with a greater configEpoch, it is possible that:
* 1) We are a master is left without slots. This means that we were
* failed over and we should turn into a replica of the new
* master.
* 2) We are a slave and our master is left without slots. We need
* to replicate to the new slots owner. */
if
(
newmaster
&&
curmaster
->
numslots
==
0
)
{
redisLog
(
REDIS_WARNING
,
"Configuration change detected. Reconfiguring myself as a replica of %.40s"
,
sender
->
name
);
clusterSetMaster
(
sender
);
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_FSYNC_CONFIG
);
}
}
}
/* Get info from the gossip section */
clusterProcessGossipSection
(
hdr
,
link
);
/* Update the cluster state if needed */
if
(
update_state
)
clusterUpdateState
();
if
(
update_config
)
clusterSaveConfigOrDie
();
}
else
if
(
type
==
CLUSTERMSG_TYPE_FAIL
&&
sender
)
{
}
else
if
(
type
==
CLUSTERMSG_TYPE_FAIL
)
{
clusterNode
*
failing
;
failing
=
clusterLookupNode
(
hdr
->
data
.
fail
.
about
.
nodename
);
if
(
failing
&&
!
(
failing
->
flags
&
(
REDIS_NODE_FAIL
|
REDIS_NODE_MYSELF
)))
{
if
(
sender
)
{
failing
=
clusterLookupNode
(
hdr
->
data
.
fail
.
about
.
nodename
);
if
(
failing
&&
!
(
failing
->
flags
&
(
REDIS_NODE_FAIL
|
REDIS_NODE_MYSELF
)))
{
redisLog
(
REDIS_NOTICE
,
"FAIL message received from %.40s about %.40s"
,
hdr
->
sender
,
hdr
->
data
.
fail
.
about
.
nodename
);
failing
->
flags
|=
REDIS_NODE_FAIL
;
failing
->
fail_time
=
time
(
NULL
);
failing
->
flags
&=
~
REDIS_NODE_PFAIL
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
);
}
}
else
{
redisLog
(
REDIS_NOTICE
,
"
FAIL message received from
%.40s about %.40s"
,
"
Ignoring FAIL message from unknonw node
%.40s about %.40s"
,
hdr
->
sender
,
hdr
->
data
.
fail
.
about
.
nodename
);
failing
->
flags
|=
REDIS_NODE_FAIL
;
failing
->
fail_time
=
time
(
NULL
);
failing
->
flags
&=
~
REDIS_NODE_PFAIL
;
clusterUpdateState
();
clusterSaveConfigOrDie
();
}
}
else
if
(
type
==
CLUSTERMSG_TYPE_PUBLISH
)
{
robj
*
channel
,
*
message
;
uint32_t
channel_len
,
message_len
;
/* Don't bother creating useless objects if there are no Pub/Sub subscribers. */
/* Don't bother creating useless objects if there are no
* Pub/Sub subscribers. */
if
(
dictSize
(
server
.
pubsub_channels
)
||
listLength
(
server
.
pubsub_patterns
))
{
channel_len
=
ntohl
(
hdr
->
data
.
publish
.
msg
.
channel_len
);
message_len
=
ntohl
(
hdr
->
data
.
publish
.
msg
.
message_len
);
...
...
@@ -1145,15 +1186,21 @@ int clusterProcessPacket(clusterLink *link) {
}
}
else
if
(
type
==
CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST
)
{
if
(
!
sender
)
return
1
;
/* We don't know that node. */
/* If we are not a master, ignore that message at all. */
if
(
!
(
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_MASTER
))
return
0
;
clusterSendFailoverAuthIfNeeded
(
sender
,
hdr
);
}
else
if
(
type
==
CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK
)
{
if
(
!
sender
)
return
1
;
/* We don't know that node. */
/* If this is a master, increment the number of acknowledges
* we received so far. */
if
(
sender
->
flags
&
REDIS_NODE_MASTER
)
/* We consider this vote only if the sender is a master serving
* a non zero number of slots, and its currentEpoch is greater or
* equal to epoch where this node started the election. */
if
(
sender
->
flags
&
REDIS_NODE_MASTER
&&
sender
->
numslots
>
0
&&
senderCurrentEpoch
>=
server
.
cluster
->
failover_auth_epoch
)
{
server
.
cluster
->
failover_auth_count
++
;
/* Maybe we reached a quorum here, set a flag to make sure
* we check ASAP. */
clusterDoBeforeSleep
(
CLUSTER_TODO_HANDLE_FAILOVER
);
}
}
else
{
redisLog
(
REDIS_WARNING
,
"Received unknown packet type: %d"
,
type
);
}
...
...
@@ -1253,17 +1300,26 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
}
}
/* Put stuff into the send buffer. */
/* Put stuff into the send buffer.
*
* It is guaranteed that this function will never have as a side effect
* the link to be invalidated, so it is safe to call this function
* from event handlers that will do stuff with the same link later. */
void
clusterSendMessage
(
clusterLink
*
link
,
unsigned
char
*
msg
,
size_t
msglen
)
{
if
(
sdslen
(
link
->
sndbuf
)
==
0
&&
msglen
!=
0
)
aeCreateFileEvent
(
server
.
el
,
link
->
fd
,
AE_WRITABLE
,
clusterWriteHandler
,
link
);
link
->
sndbuf
=
sdscatlen
(
link
->
sndbuf
,
msg
,
msglen
);
server
.
cluster
->
stats_bus_messages_sent
++
;
}
/* Send a message to all the nodes that are part of the cluster having
* a connected link. */
* a connected link.
*
* It is guaranteed that this function will never have as a side effect
* some node->link to be invalidated, so it is safe to call this function
* from event handlers that will do stuff with node links later. */
void
clusterBroadcastMessage
(
void
*
buf
,
size_t
len
)
{
dictIterator
*
di
;
dictEntry
*
de
;
...
...
@@ -1283,12 +1339,21 @@ void clusterBroadcastMessage(void *buf, size_t len) {
/* Build the message header */
void
clusterBuildMessageHdr
(
clusterMsg
*
hdr
,
int
type
)
{
int
totlen
=
0
;
clusterNode
*
master
;
/* If this node is a master, we send its slots bitmap and configEpoch.
* If this node is a slave we send the master's information instead (the
* node is flagged as slave so the receiver knows that it is NOT really
* in charge for this slots. */
master
=
(
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_SLAVE
&&
server
.
cluster
->
myself
->
slaveof
)
?
server
.
cluster
->
myself
->
slaveof
:
server
.
cluster
->
myself
;
memset
(
hdr
,
0
,
sizeof
(
*
hdr
));
hdr
->
type
=
htons
(
type
);
memcpy
(
hdr
->
sender
,
server
.
cluster
->
myself
->
name
,
REDIS_CLUSTER_NAMELEN
);
memcpy
(
hdr
->
myslots
,
server
.
cluster
->
myself
->
slots
,
sizeof
(
hdr
->
myslots
));
memcpy
(
hdr
->
myslots
,
master
->
slots
,
sizeof
(
hdr
->
myslots
));
memset
(
hdr
->
slaveof
,
0
,
REDIS_CLUSTER_NAMELEN
);
if
(
server
.
cluster
->
myself
->
slaveof
!=
NULL
)
{
memcpy
(
hdr
->
slaveof
,
server
.
cluster
->
myself
->
slaveof
->
name
,
...
...
@@ -1298,12 +1363,16 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
hdr
->
flags
=
htons
(
server
.
cluster
->
myself
->
flags
);
hdr
->
state
=
server
.
cluster
->
state
;
/* Set the currentEpoch and configEpochs. */
hdr
->
currentEpoch
=
htonu64
(
server
.
cluster
->
currentEpoch
);
hdr
->
configEpoch
=
htonu64
(
master
->
configEpoch
);
if
(
type
==
CLUSTERMSG_TYPE_FAIL
)
{
totlen
=
sizeof
(
clusterMsg
)
-
sizeof
(
union
clusterMsgData
);
totlen
+=
sizeof
(
clusterMsgDataFail
);
}
hdr
->
totlen
=
htonl
(
totlen
);
/* For PING, PONG, and MEET, fixing the totlen field is up to the caller */
/* For PING, PONG, and MEET, fixing the totlen field is up to the caller
.
*/
}
/* Send a PING or PONG packet to the specified node, making sure to add enough
...
...
@@ -1370,10 +1439,11 @@ void clusterSendPing(clusterLink *link, int type) {
clusterSendMessage
(
link
,
buf
,
totlen
);
}
/* Send a PONG packet to every connected node that's not in handshake state.
/* Send a PONG packet to every connected node that's not in handshake state
* and for which we have a valid link.
*
* In Redis Cluster p
ings are not just used
for failure detection, but also
* to carry important configuration information
s
. So broadcasting a pong is
* In Redis Cluster p
ongs are not used just
for failure detection, but also
* to carry important configuration information. So broadcasting a pong is
* useful when something changes in the configuration and we want to make
* the cluster aware ASAP (for instance after a slave promotion). */
void
clusterBroadcastPong
(
void
)
{
...
...
@@ -1384,6 +1454,7 @@ void clusterBroadcastPong(void) {
while
((
de
=
dictNext
(
di
))
!=
NULL
)
{
clusterNode
*
node
=
dictGetVal
(
de
);
if
(
!
node
->
link
)
continue
;
if
(
node
->
flags
&
(
REDIS_NODE_MYSELF
|
REDIS_NODE_HANDSHAKE
))
continue
;
clusterSendPing
(
node
->
link
,
CLUSTERMSG_TYPE_PONG
);
}
...
...
@@ -1477,14 +1548,11 @@ void clusterRequestFailoverAuth(void) {
clusterBuildMessageHdr
(
hdr
,
CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST
);
totlen
=
sizeof
(
clusterMsg
)
-
sizeof
(
union
clusterMsgData
);
hdr
->
totlen
=
htonl
(
totlen
);
hdr
->
time
=
mstime
();
clusterBroadcastMessage
(
buf
,
totlen
);
}
/* Send a FAILOVER_AUTH_ACK message to the specified node.
* Reqtime is the time field from the original failover auth request packet,
* so that the receiver is able to check the reply age. */
void
clusterSendFailoverAuth
(
clusterNode
*
node
,
uint64_t
reqtime
)
{
/* Send a FAILOVER_AUTH_ACK message to the specified node. */
void
clusterSendFailoverAuth
(
clusterNode
*
node
)
{
unsigned
char
buf
[
4096
];
clusterMsg
*
hdr
=
(
clusterMsg
*
)
buf
;
uint32_t
totlen
;
...
...
@@ -1493,47 +1561,57 @@ void clusterSendFailoverAuth(clusterNode *node, uint64_t reqtime) {
clusterBuildMessageHdr
(
hdr
,
CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK
);
totlen
=
sizeof
(
clusterMsg
)
-
sizeof
(
union
clusterMsgData
);
hdr
->
totlen
=
htonl
(
totlen
);
hdr
->
time
=
reqtime
;
clusterSendMessage
(
node
->
link
,
buf
,
totlen
);
}
/* If we believe 'node' is the "first slave" of it's master, reply with
* a FAILOVER_AUTH_GRANTED packet.
* The 'request' field points to the authorization request packet header, we
* need it in order to copy back the 'time' field in our reply.
*
* To be a first slave the sender must:
* 1) Be a slave.
* 2) Its master should be in FAIL state.
* 3) Ordering all the slaves IDs for its master by run-id, it should be the
* first (the smallest) among the ones not in FAIL / PFAIL state.
*/
/* Vote for the node asking for our vote if there are the conditions. */
void
clusterSendFailoverAuthIfNeeded
(
clusterNode
*
node
,
clusterMsg
*
request
)
{
char
first
[
REDIS_CLUSTER_NAMELEN
];
clusterNode
*
master
=
node
->
slaveof
;
uint64_t
requestCurrentEpoch
=
ntohu64
(
request
->
currentEpoch
);
uint64_t
requestConfigEpoch
=
ntohu64
(
request
->
configEpoch
);
unsigned
char
*
claimed_slots
=
request
->
myslots
;
int
j
;
/* Node is a slave? Its master is down? */
/* IF we are not a master serving at least 1 slot, we don't have the
* right to vote, as the cluster size in Redis Cluster is the number
* of masters serving at least one slot, and quorum is the cluster size + 1 */
if
(
!
(
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_MASTER
))
return
;
if
(
server
.
cluster
->
myself
->
numslots
==
0
)
return
;
/* Request epoch must be >= our currentEpoch. */
if
(
requestCurrentEpoch
<
server
.
cluster
->
currentEpoch
)
return
;
/* I already voted for this epoch? Return ASAP. */
if
(
server
.
cluster
->
last_vote_epoch
==
server
.
cluster
->
currentEpoch
)
return
;
/* Node must be a slave and its master down. */
if
(
!
(
node
->
flags
&
REDIS_NODE_SLAVE
)
||
master
==
NULL
||
!
(
master
->
flags
&
REDIS_NODE_FAIL
))
return
;
/* Iterate all the master slaves to check what's the first one. */
memset
(
first
,
0xff
,
sizeof
(
first
));
for
(
j
=
0
;
j
<
master
->
numslaves
;
j
++
)
{
clusterNode
*
slave
=
master
->
slaves
[
j
];
/* We did not voted for a slave about this master for two
* times the node timeout. This is not strictly needed for correctness
* of the algorithm but makes the base case more linear. */
if
(
server
.
unixtime
-
node
->
slaveof
->
voted_time
<
server
.
cluster_node_timeout
*
2
)
return
;
if
(
slave
->
flags
&
(
REDIS_NODE_FAIL
|
REDIS_NODE_PFAIL
))
continue
;
if
(
memcmp
(
slave
->
name
,
first
,
sizeof
(
first
))
<
0
)
{
memcpy
(
first
,
slave
->
name
,
sizeof
(
first
));
}
/* The slave requesting the vote must have a configEpoch for the claimed slots
* that is >= the one of the masters currently serving the same slots in the
* current configuration. */
for
(
j
=
0
;
j
<
REDIS_CLUSTER_SLOTS
;
j
++
)
{
if
(
bitmapTestBit
(
claimed_slots
,
j
)
==
0
)
continue
;
if
(
server
.
cluster
->
slots
[
j
]
==
NULL
||
server
.
cluster
->
slots
[
j
]
->
configEpoch
<=
requestConfigEpoch
)
continue
;
/* If we reached this point we found a slot that in our current slots
* is served by a master with a greater configEpoch than the one claimed
* by the slave requesting our vote. Refuse to vote for this slave. */
return
;
}
/* Is 'node' the first slave? */
if
(
memcmp
(
node
->
name
,
first
,
sizeof
(
first
))
!=
0
)
return
;
/* We can send the packet. */
clusterSendFailoverAuth
(
node
,
request
->
time
);
/* We can vote for this slave. */
clusterSendFailoverAuth
(
node
);
server
.
cluster
->
last_vote_epoch
=
server
.
cluster
->
currentEpoch
;
node
->
slaveof
->
voted_time
=
server
.
unixtime
;
}
/* This function is called if we are a slave node and our master serving
...
...
@@ -1541,16 +1619,37 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
*
* The gaol of this function is:
* 1) To check if we are able to perform a failover, is our data updated?
* 2) Ask reachable masters the authorization to perform the failover.
* 3) Check if there is the majority of masters agreeing we should failover.
* 4) Perform the failover informing all the other nodes.
* 2) Try to get elected by masters.
* 3) Perform the failover informing all the other nodes.
*/
void
clusterHandleSlaveFailover
(
void
)
{
time_t
data_age
=
server
.
unixtime
-
server
.
repl_down_since
;
time_t
auth_age
=
server
.
unixtime
-
server
.
cluster
->
failover_auth_time
;
time_t
data_age
;
mstime_t
auth_age
=
mstime
()
-
server
.
cluster
->
failover_auth_time
;
int
needed_quorum
=
(
server
.
cluster
->
size
/
2
)
+
1
;
int
j
;
/* Set data_age to the number of seconds we are disconnected from the master. */
if
(
server
.
repl_state
==
REDIS_REPL_CONNECTED
)
{
data_age
=
server
.
unixtime
-
server
.
master
->
lastinteraction
;
}
else
{
data_age
=
server
.
unixtime
-
server
.
repl_down_since
;
}
/* Pre conditions to run the function:
* 1) We are a slave.
* 2) Our master is flagged as FAIL.
* 3) It is serving slots. */
if
(
!
(
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_SLAVE
)
||
server
.
cluster
->
myself
->
slaveof
==
NULL
||
!
(
server
.
cluster
->
myself
->
slaveof
->
flags
&
REDIS_NODE_FAIL
)
||
server
.
cluster
->
myself
->
slaveof
->
numslots
==
0
)
return
;
/* Remove the node timeout from the data age as it is fine that we are
* disconnected from our master at least for the time it was down to be
* flagged as FAIL, that's the baseline. */
if
(
data_age
>
server
.
cluster_node_timeout
)
data_age
-=
server
.
cluster_node_timeout
;
/* Check if our data is recent enough. For now we just use a fixed
* constant of ten times the node timeout since the cluster should
* react much faster to a master down. */
...
...
@@ -1558,19 +1657,40 @@ void clusterHandleSlaveFailover(void) {
server
.
cluster_node_timeout
*
REDIS_CLUSTER_SLAVE_VALIDITY_MULT
)
return
;
/* TODO: check if we are the first slave as well? Or just rely on the
* master authorization? */
/* Ask masters if we are authorized to perform the failover. If there
* is a pending auth request that's too old, reset it. */
/* Compute the time at which we can start an election. */
if
(
server
.
cluster
->
failover_auth_time
==
0
||
auth_age
>
server
.
cluster_node_timeout
*
REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT
)
server
.
cluster_node_timeout
*
1000
*
REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT
)
{
redisLog
(
REDIS_WARNING
,
"Asking masters if I can failover..."
);
server
.
cluster
->
failover_auth_time
=
time
(
NULL
);
server
.
cluster
->
failover_auth_time
=
mstime
()
+
500
+
/* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
data_age
*
100
+
/* Add 100 milliseconds for every second of age. */
random
()
%
500
;
/* Random delay between 0 and 500 milliseconds. */
server
.
cluster
->
failover_auth_count
=
0
;
server
.
cluster
->
failover_auth_sent
=
0
;
redisLog
(
REDIS_WARNING
,
"Start of election delayed for %lld milliseconds."
,
server
.
cluster
->
failover_auth_time
-
mstime
());
return
;
}
/* Return ASAP if we can't still start the election. */
if
(
mstime
()
<
server
.
cluster
->
failover_auth_time
)
return
;
/* Return ASAP if the election is too old to be valid. */
if
(
mstime
()
-
server
.
cluster
->
failover_auth_time
>
server
.
cluster_node_timeout
*
1000
)
return
;
/* Ask for votes if needed. */
if
(
server
.
cluster
->
failover_auth_sent
==
0
)
{
server
.
cluster
->
currentEpoch
++
;
server
.
cluster
->
failover_auth_epoch
=
server
.
cluster
->
currentEpoch
;
redisLog
(
REDIS_WARNING
,
"Starting a failover election for epoch %llu."
,
server
.
cluster
->
currentEpoch
);
clusterRequestFailoverAuth
();
server
.
cluster
->
failover_auth_sent
=
1
;
clusterDoBeforeSleep
(
CLUSTER_TODO_SAVE_CONFIG
|
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_FSYNC_CONFIG
);
return
;
/* Wait for replies. */
}
...
...
@@ -1579,7 +1699,7 @@ void clusterHandleSlaveFailover(void) {
clusterNode
*
oldmaster
=
server
.
cluster
->
myself
->
slaveof
;
redisLog
(
REDIS_WARNING
,
"
Masters quorum reached: failing over my (failing)
master."
);
"
Failover election won: I'm the new
master."
);
/* We have the quorum, perform all the steps to correctly promote
* this slave to a master.
*
...
...
@@ -1588,7 +1708,6 @@ void clusterHandleSlaveFailover(void) {
server
.
cluster
->
myself
);
server
.
cluster
->
myself
->
flags
&=
~
REDIS_NODE_SLAVE
;
server
.
cluster
->
myself
->
flags
|=
REDIS_NODE_MASTER
;
server
.
cluster
->
myself
->
flags
|=
REDIS_NODE_PROMOTED
;
server
.
cluster
->
myself
->
slaveof
=
NULL
;
replicationUnsetMaster
();
...
...
@@ -1600,13 +1719,16 @@ void clusterHandleSlaveFailover(void) {
}
}
/* 3) Pong all the other nodes so that they can update the state
* accordingly and detect that we switched to master role. */
clusterBroadcastPong
();
/* 3) Update my configEpoch to the epoch of the election. */
server
.
cluster
->
myself
->
configEpoch
=
server
.
cluster
->
failover_auth_epoch
;
/* 4) Update state and save config. */
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterSaveConfigOrDie
(
1
);
/* 5) Pong all the other nodes so that they can update the state
* accordingly and detect that we switched to master role. */
clusterBroadcastPong
();
}
}
...
...
@@ -1767,30 +1889,44 @@ void clusterCron(void) {
server
.
cluster
->
myself
->
slaveof
->
port
);
}
/* If we are a slave and our master is down, but is serving slots,
* call the function that handles the failover.
* This function is called with a small delay in order to let the
* FAIL message to propagate after failure detection, this is not
* strictly required but makes 99.99% of failovers mechanically
* simpler. */
if
(
server
.
cluster
->
myself
->
flags
&
REDIS_NODE_SLAVE
&&
server
.
cluster
->
myself
->
slaveof
&&
server
.
cluster
->
myself
->
slaveof
->
flags
&
REDIS_NODE_FAIL
&&
(
server
.
unixtime
-
server
.
cluster
->
myself
->
slaveof
->
fail_time
)
>
REDIS_CLUSTER_FAILOVER_DELAY
&&
server
.
cluster
->
myself
->
slaveof
->
numslots
!=
0
)
{
clusterHandleSlaveFailover
();
if
(
update_state
)
clusterUpdateState
();
}
/* This function is called before the event handler returns to sleep for
* events. It is useful to perform operations that must be done ASAP in
* reaction to events fired but that are not safe to perform inside event
* handlers, or to perform potentially expansive tasks that we need to do
* a single time before replying to clients. */
void
clusterBeforeSleep
(
void
)
{
/* Handle failover, this is needed when it is likely that there is already
* the quorum from masters in order to react fast. */
if
(
server
.
cluster
->
todo_before_sleep
&
CLUSTER_TODO_HANDLE_FAILOVER
)
clusterHandleSlaveFailover
();
/* Update the cluster state. */
if
(
server
.
cluster
->
todo_before_sleep
&
CLUSTER_TODO_UPDATE_STATE
)
clusterUpdateState
();
/* Save the config, possibly using fsync. */
if
(
server
.
cluster
->
todo_before_sleep
&
CLUSTER_TODO_SAVE_CONFIG
)
{
int
fsync
=
server
.
cluster
->
todo_before_sleep
&
CLUSTER_TODO_FSYNC_CONFIG
;
clusterSaveConfigOrDie
(
fsync
);
}
if
(
update_state
)
clusterUpdateState
();
/* Reset our flags. */
server
.
cluster
->
todo_before_sleep
=
0
;
}
void
clusterDoBeforeSleep
(
int
flags
)
{
server
.
cluster
->
todo_before_sleep
|=
flags
;
}
/* -----------------------------------------------------------------------------
* Slots management
* -------------------------------------------------------------------------- */
/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is
z
et,
/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is
s
et,
* otherwise 0. */
int
bitmapTestBit
(
unsigned
char
*
bitmap
,
int
pos
)
{
off_t
byte
=
pos
/
8
;
...
...
@@ -1913,8 +2049,14 @@ void clusterUpdateState(void) {
}
/* If we can't reach at least half the masters, change the cluster state
* as FAIL, as we are not even able to mark nodes as FAIL in this side
* of the netsplit because of lack of majority. */
* to FAIL, as we are not even able to mark nodes as FAIL in this side
* of the netsplit because of lack of majority.
*
* TODO: when this condition is entered, we should not undo it for some
* (small) time after the majority is reachable again, to make sure that
* other nodes have enough time to inform this node of a configuration change.
* Otherwise a client with an old routing table may write to this node
* and later it may turn into a slave losing the write. */
{
int
needed_quorum
=
(
server
.
cluster
->
size
/
2
)
+
1
;
...
...
@@ -1991,7 +2133,7 @@ int verifyClusterConfigWithData(void) {
server
.
cluster
->
importing_slots_from
[
j
]
=
server
.
cluster
->
slots
[
j
];
}
}
if
(
update_config
)
clusterSaveConfigOrDie
();
if
(
update_config
)
clusterSaveConfigOrDie
(
1
);
return
REDIS_OK
;
}
...
...
@@ -2011,9 +2153,6 @@ void clusterSetMaster(clusterNode *n) {
myself
->
flags
&=
~
REDIS_NODE_MASTER
;
myself
->
flags
|=
REDIS_NODE_SLAVE
;
}
/* Clear the promoted flag anyway if we are a slave, to ensure it will
* be set only when the node turns into a master because of fail over. */
myself
->
flags
&=
~
REDIS_NODE_PROMOTED
;
myself
->
slaveof
=
n
;
replicationSetMaster
(
n
->
ip
,
n
->
port
);
}
...
...
@@ -2061,7 +2200,6 @@ sds clusterGenNodesDescription(int filter) {
if
(
node
->
flags
&
REDIS_NODE_FAIL
)
ci
=
sdscat
(
ci
,
"fail,"
);
if
(
node
->
flags
&
REDIS_NODE_HANDSHAKE
)
ci
=
sdscat
(
ci
,
"handshake,"
);
if
(
node
->
flags
&
REDIS_NODE_NOADDR
)
ci
=
sdscat
(
ci
,
"noaddr,"
);
if
(
node
->
flags
&
REDIS_NODE_PROMOTED
)
ci
=
sdscat
(
ci
,
"promoted,"
);
if
(
ci
[
sdslen
(
ci
)
-
1
]
==
','
)
ci
[
sdslen
(
ci
)
-
1
]
=
' '
;
/* Slave of... or just "-" */
...
...
@@ -2071,9 +2209,10 @@ sds clusterGenNodesDescription(int filter) {
ci
=
sdscatprintf
(
ci
,
"- "
);
/* Latency from the POV of this node, link status */
ci
=
sdscatprintf
(
ci
,
"%ld %ld %s"
,
ci
=
sdscatprintf
(
ci
,
"%ld %ld %
llu %
s"
,
(
long
)
node
->
ping_sent
,
(
long
)
node
->
pong_received
,
(
unsigned
long
long
)
node
->
configEpoch
,
(
node
->
link
||
node
->
flags
&
REDIS_NODE_MYSELF
)
?
"connected"
:
"disconnected"
);
...
...
@@ -2193,8 +2332,7 @@ void clusterCommand(redisClient *c) {
return
;
}
clusterDelNodeSlots
(
server
.
cluster
->
myself
);
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
);
addReply
(
c
,
shared
.
ok
);
}
else
if
((
!
strcasecmp
(
c
->
argv
[
1
]
->
ptr
,
"addslots"
)
||
!
strcasecmp
(
c
->
argv
[
1
]
->
ptr
,
"delslots"
))
&&
c
->
argc
>=
3
)
...
...
@@ -2244,8 +2382,7 @@ void clusterCommand(redisClient *c) {
}
}
zfree
(
slots
);
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
);
addReply
(
c
,
shared
.
ok
);
}
else
if
(
!
strcasecmp
(
c
->
argv
[
1
]
->
ptr
,
"setslot"
)
&&
c
->
argc
>=
4
)
{
/* SETSLOT 10 MIGRATING <node ID> */
...
...
@@ -2321,8 +2458,7 @@ void clusterCommand(redisClient *c) {
addReplyError
(
c
,
"Invalid CLUSTER SETSLOT action or number of arguments"
);
return
;
}
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
);
addReply
(
c
,
shared
.
ok
);
}
else
if
(
!
strcasecmp
(
c
->
argv
[
1
]
->
ptr
,
"info"
)
&&
c
->
argc
==
2
)
{
/* CLUSTER INFO */
...
...
@@ -2352,20 +2488,26 @@ void clusterCommand(redisClient *c) {
"cluster_slots_fail:%d
\r\n
"
"cluster_known_nodes:%lu
\r\n
"
"cluster_size:%d
\r\n
"
"cluster_current_epoch:%llu
\r\n
"
"cluster_stats_messages_sent:%lld
\r\n
"
"cluster_stats_messages_received:%lld
\r\n
"
,
statestr
[
server
.
cluster
->
state
],
slots_assigned
,
slots_ok
,
slots_pfail
,
slots_fail
,
dictSize
(
server
.
cluster
->
nodes
),
server
.
cluster
->
size
server
.
cluster
->
size
,
(
unsigned
long
long
)
server
.
cluster
->
currentEpoch
,
server
.
cluster
->
stats_bus_messages_sent
,
server
.
cluster
->
stats_bus_messages_received
);
addReplySds
(
c
,
sdscatprintf
(
sdsempty
(),
"$%lu
\r\n
"
,
(
unsigned
long
)
sdslen
(
info
)));
addReplySds
(
c
,
info
);
addReply
(
c
,
shared
.
crlf
);
}
else
if
(
!
strcasecmp
(
c
->
argv
[
1
]
->
ptr
,
"saveconfig"
)
&&
c
->
argc
==
2
)
{
int
retval
=
clusterSaveConfig
();
int
retval
=
clusterSaveConfig
(
1
);
if
(
retval
==
0
)
addReply
(
c
,
shared
.
ok
);
...
...
@@ -2417,8 +2559,7 @@ void clusterCommand(redisClient *c) {
return
;
}
clusterDelNode
(
n
);
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
);
addReply
(
c
,
shared
.
ok
);
}
else
if
(
!
strcasecmp
(
c
->
argv
[
1
]
->
ptr
,
"replicate"
)
&&
c
->
argc
==
3
)
{
/* CLUSTER REPLICATE <NODE ID> */
...
...
@@ -2453,8 +2594,7 @@ void clusterCommand(redisClient *c) {
/* Set the master. */
clusterSetMaster
(
n
);
clusterUpdateState
();
clusterSaveConfigOrDie
();
clusterDoBeforeSleep
(
CLUSTER_TODO_UPDATE_STATE
|
CLUSTER_TODO_SAVE_CONFIG
);
addReply
(
c
,
shared
.
ok
);
}
else
{
addReplyError
(
c
,
"Wrong CLUSTER subcommand or number of arguments"
);
...
...
src/endianconv.h
浏览文件 @
6fa9b1a4
...
...
@@ -61,4 +61,14 @@ uint64_t intrev64(uint64_t v);
#define intrev64ifbe(v) intrev64(v)
#endif
/* The functions htonu64() and ntohu64() convert the specified value to
* network byte ordering and back. In big endian systems they are no-ops. */
#if (BYTE_ORDER == BIG_ENDIAN)
#define htonu64(v) (v)
#define ntohu64(v) (v)
#else
#define htonu64(v) intrev64(v)
#define ntohu64(v) intrev64(v)
#endif
#endif
src/redis-trib.rb
浏览文件 @
6fa9b1a4
...
...
@@ -118,8 +118,8 @@ class ClusterNode
nodes
.
each
{
|
n
|
# name addr flags role ping_sent ping_recv link_status slots
split
=
n
.
split
name
,
addr
,
flags
,
role
,
ping_sent
,
ping_recv
,
link_status
=
split
[
0
..
6
]
slots
=
split
[
7
..-
1
]
name
,
addr
,
flags
,
role
,
ping_sent
,
ping_recv
,
config_epoch
,
link_status
=
split
[
0
..
6
]
slots
=
split
[
8
..-
1
]
info
=
{
:name
=>
name
,
:addr
=>
addr
,
...
...
@@ -230,7 +230,7 @@ class ClusterNode
config
=
[]
@r
.
cluster
(
"nodes"
).
each_line
{
|
l
|
s
=
l
.
split
slots
=
s
[
7
..-
1
].
select
{
|
x
|
x
[
0
..
0
]
!=
"["
}
slots
=
s
[
8
..-
1
].
select
{
|
x
|
x
[
0
..
0
]
!=
"["
}
next
if
slots
.
length
==
0
config
<<
s
[
0
]
+
":"
+
(
slots
.
sort
.
join
(
","
))
}
...
...
src/redis.c
浏览文件 @
6fa9b1a4
...
...
@@ -1203,6 +1203,9 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
/* Write the AOF buffer on disk */
flushAppendOnlyFile
(
0
);
/* Call the Redis Cluster before sleep function. */
if
(
server
.
cluster_enabled
)
clusterBeforeSleep
();
}
/* =========================== Server initialization ======================== */
...
...
src/redis.h
浏览文件 @
6fa9b1a4
...
...
@@ -368,6 +368,8 @@
* Data types
*----------------------------------------------------------------------------*/
typedef
long
long
mstime_t
;
/* millisecond time type. */
/* A redis object, that is a type able to hold a string / list / set */
/* The actual Redis Object */
...
...
@@ -581,7 +583,7 @@ typedef struct redisOpArray {
#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2
/* Undo fail if master is back. */
#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10
/* Some additional time. */
#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10
/* Slave data validity. */
#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT
1
/* Auth request retry time. */
#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT
4
/* Auth request retry time. */
#define REDIS_CLUSTER_FAILOVER_DELAY 5
/* Seconds */
struct
clusterNode
;
...
...
@@ -617,6 +619,7 @@ struct clusterNode {
time_t
ctime
;
/* Node object creation time. */
char
name
[
REDIS_CLUSTER_NAMELEN
];
/* Node name, hex string, sha1-size */
int
flags
;
/* REDIS_NODE_... */
uint64_t
configEpoch
;
/* Last configEpoch observed for this node */
unsigned
char
slots
[
REDIS_CLUSTER_SLOTS
/
8
];
/* slots handled by this node */
int
numslots
;
/* Number of slots handled by this node */
int
numslaves
;
/* Number of slave nodes, if this is a master */
...
...
@@ -625,6 +628,7 @@ struct clusterNode {
time_t
ping_sent
;
/* Unix time we sent latest ping */
time_t
pong_received
;
/* Unix time we received the pong */
time_t
fail_time
;
/* Unix time when FAIL flag was set */
time_t
voted_time
;
/* Last time we voted for a slave of this master */
char
ip
[
REDIS_IP_STR_LEN
];
/* Latest known IP address of this node */
int
port
;
/* Latest known port of this node */
clusterLink
*
link
;
/* TCP/IP link with this node */
...
...
@@ -634,6 +638,7 @@ typedef struct clusterNode clusterNode;
typedef
struct
{
clusterNode
*
myself
;
/* This node */
uint64_t
currentEpoch
;
int
state
;
/* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */
int
size
;
/* Num of master nodes with at least one slot */
dict
*
nodes
;
/* Hash table of name -> clusterNode structures */
...
...
@@ -641,10 +646,24 @@ typedef struct {
clusterNode
*
importing_slots_from
[
REDIS_CLUSTER_SLOTS
];
clusterNode
*
slots
[
REDIS_CLUSTER_SLOTS
];
zskiplist
*
slots_to_keys
;
int
failover_auth_time
;
/* Time at which we sent the AUTH request. */
int
failover_auth_count
;
/* Number of authorizations received. */
/* The following fields are used to take the slave state on elections. */
mstime_t
failover_auth_time
;
/* Time at which we'll try to get elected in ms*/
int
failover_auth_count
;
/* Number of votes received so far. */
int
failover_auth_sent
;
/* True if we already asked for votes. */
uint64_t
failover_auth_epoch
;
/* Epoch of the current election. */
/* The followign fields are uesd by masters to take state on elections. */
uint64_t
last_vote_epoch
;
/* Epoch of the last vote granted. */
int
todo_before_sleep
;
/* Things to do in clusterBeforeSleep(). */
long
long
stats_bus_messages_sent
;
/* Num of msg sent via cluster bus. */
long
long
stats_bus_messages_received
;
/* Num of msg received via cluster bus. */
}
clusterState
;
/* clusterState todo_before_sleep flags. */
#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0)
#define CLUSTER_TODO_UPDATE_STATE (1<<1)
#define CLUSTER_TODO_SAVE_CONFIG (1<<2)
#define CLUSTER_TODO_FSYNC_CONFIG (1<<3)
/* Redis cluster messages header */
/* Note that the PING, PONG and MEET messages are actually the same exact
...
...
@@ -704,9 +723,9 @@ typedef struct {
uint32_t
totlen
;
/* Total length of this message */
uint16_t
type
;
/* Message type */
uint16_t
count
;
/* Only used for some kind of messages. */
uint64_t
time
;
/* Time at which this request was sent (in milliseconds),
this field is copied in reply messages so that the
original sender knows how old the reply is
. */
uint64_t
currentEpoch
;
/* The epoch accordingly to the sending node. */
uint64_t
configEpoch
;
/* The config epoch if it's a master, or the last epoch
advertised by its master if it is a slave
. */
char
sender
[
REDIS_CLUSTER_NAMELEN
];
/* Name of the sender node */
unsigned
char
myslots
[
REDIS_CLUSTER_SLOTS
/
8
];
char
slaveof
[
REDIS_CLUSTER_NAMELEN
];
...
...
@@ -1367,6 +1386,7 @@ void clusterCron(void);
clusterNode
*
getNodeByQuery
(
redisClient
*
c
,
struct
redisCommand
*
cmd
,
robj
**
argv
,
int
argc
,
int
*
hashslot
,
int
*
ask
);
void
clusterPropagatePublish
(
robj
*
channel
,
robj
*
message
);
void
migrateCloseTimedoutSockets
(
void
);
void
clusterBeforeSleep
(
void
);
/* Sentinel */
void
initSentinelConfig
(
void
);
...
...
src/replication.c
浏览文件 @
6fa9b1a4
...
...
@@ -343,7 +343,7 @@ int masterTryPartialResynchronization(redisClient *c) {
/* Run id "?" is used by slaves that want to force a full resync. */
if
(
master_runid
[
0
]
!=
'?'
)
{
redisLog
(
REDIS_NOTICE
,
"Partial resynchronization not accepted: "
"Runid mismatch (Client asked for
'%s', I'm
'%s')"
,
"Runid mismatch (Client asked for
runid '%s', my runid is
'%s')"
,
master_runid
,
server
.
runid
);
}
else
{
redisLog
(
REDIS_NOTICE
,
"Full resync requested by slave."
);
...
...
src/sentinel.c
浏览文件 @
6fa9b1a4
...
...
@@ -43,8 +43,6 @@ extern char **environ;
/* ======================== Sentinel global state =========================== */
typedef
long
long
mstime_t
;
/* millisecond time type. */
/* Address object, used to describe an ip:port pair. */
typedef
struct
sentinelAddr
{
char
*
ip
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录