Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
别团等shy哥发育
redis
提交
4cf0cd57
R
redis
项目概览
别团等shy哥发育
/
redis
与 Fork 源项目一致
从无法访问的项目Fork
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
R
redis
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
4cf0cd57
编写于
2月 05, 2014
作者:
A
antirez
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Cluster: manual failover initial implementation.
上级
4919a13f
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
180 addition
and
5 deletion
+180
-5
src/cluster.c
src/cluster.c
+161
-4
src/cluster.h
src/cluster.h
+19
-1
未找到文件。
src/cluster.c
浏览文件 @
4cf0cd57
...
...
@@ -67,6 +67,7 @@ void clusterHandleSlaveMigration(int max_slaves);
int
bitmapTestBit
(
unsigned
char
*
bitmap
,
int
pos
);
void
clusterDoBeforeSleep
(
int
flags
);
void
clusterSendUpdate
(
clusterLink
*
link
,
clusterNode
*
node
);
void
resetManualFailover
(
void
);
/* -----------------------------------------------------------------------------
* Initialization
...
...
@@ -344,6 +345,7 @@ void clusterInit(void) {
/* The slots -> keys map is a sorted set. Init it. */
server
.
cluster
->
slots_to_keys
=
zslCreate
();
resetManualFailover
();
}
/* -----------------------------------------------------------------------------
...
...
@@ -1167,7 +1169,9 @@ int clusterProcessPacket(clusterLink *link) {
ntohl
(
hdr
->
data
.
publish
.
msg
.
message_len
);
if
(
totlen
!=
explen
)
return
1
;
}
else
if
(
type
==
CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST
||
type
==
CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK
)
{
type
==
CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK
||
type
==
CLUSTERMSG_TYPE_MFSTART
)
{
uint32_t
explen
=
sizeof
(
clusterMsg
)
-
sizeof
(
union
clusterMsgData
);
if
(
totlen
!=
explen
)
return
1
;
...
...
@@ -1194,6 +1198,17 @@ int clusterProcessPacket(clusterLink *link) {
/* Update the replication offset info for this node. */
sender
->
repl_offset
=
ntohu64
(
hdr
->
offset
);
sender
->
repl_offset_time
=
mstime
();
/* If we are a slave performing a manual failover and our master
* sent its offset while already paused, populate the MF state. */
if
(
server
.
cluster
->
mf_end
&&
nodeIsSlave
(
myself
)
&&
myself
->
slaveof
==
sender
&&
hdr
->
mflags
[
0
]
&
CLUSTERMSG_FLAG0_PAUSED
&&
server
.
cluster
->
mf_master_offset
==
0
)
{
server
.
cluster
->
mf_master_offset
=
sender
->
repl_offset
;
redisLog
(
REDIS_WARNING
,
"Received replication offset for paused master manual failover: %lld"
,
server
.
cluster
->
mf_master_offset
);
}
}
/* Process packets by type. */
...
...
@@ -1464,6 +1479,18 @@ int clusterProcessPacket(clusterLink *link) {
* we check ASAP. */
clusterDoBeforeSleep
(
CLUSTER_TODO_HANDLE_FAILOVER
);
}
}
else
if
(
type
==
CLUSTERMSG_TYPE_MFSTART
)
{
/* This message is acceptable only if I'm a master and the sender
* is one of my slaves. */
if
(
!
sender
||
sender
->
slaveof
!=
myself
)
return
1
;
/* Manual failover requested from slaves. Initialize the state
* accordingly. */
resetManualFailover
();
server
.
cluster
->
mf_end
=
mstime
()
+
REDIS_CLUSTER_MF_TIMEOUT
;
server
.
cluster
->
mf_slave
=
sender
;
pauseClients
(
mstime
()
+
(
REDIS_CLUSTER_MF_TIMEOUT
*
2
));
redisLog
(
REDIS_WARNING
,
"Manual failover requested by slave %.40s."
,
sender
->
name
);
}
else
if
(
type
==
CLUSTERMSG_TYPE_UPDATE
)
{
clusterNode
*
n
;
/* The node the update is about. */
uint64_t
reportedConfigEpoch
=
ntohu64
(
hdr
->
data
.
update
.
nodecfg
.
configEpoch
);
...
...
@@ -1651,6 +1678,10 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
offset
=
server
.
master_repl_offset
;
hdr
->
offset
=
htonu64
(
offset
);
/* Set the message flags. */
if
(
nodeIsMaster
(
myself
)
&&
server
.
cluster
->
mf_end
)
hdr
->
mflags
[
0
]
|=
CLUSTERMSG_FLAG0_PAUSED
;
/* Compute the message length for certain messages. For other messages
* this is up to the caller. */
if
(
type
==
CLUSTERMSG_TYPE_FAIL
)
{
...
...
@@ -1883,6 +1914,19 @@ void clusterSendFailoverAuth(clusterNode *node) {
clusterSendMessage
(
node
->
link
,
buf
,
totlen
);
}
/* Send a MFSTART message to the specified node. */
void
clusterSendMFStart
(
clusterNode
*
node
)
{
unsigned
char
buf
[
sizeof
(
clusterMsg
)];
clusterMsg
*
hdr
=
(
clusterMsg
*
)
buf
;
uint32_t
totlen
;
if
(
!
node
->
link
)
return
;
clusterBuildMessageHdr
(
hdr
,
CLUSTERMSG_TYPE_MFSTART
);
totlen
=
sizeof
(
clusterMsg
)
-
sizeof
(
union
clusterMsgData
);
hdr
->
totlen
=
htonl
(
totlen
);
clusterSendMessage
(
node
->
link
,
buf
,
totlen
);
}
/* Vote for the node asking for our vote if there are the conditions. */
void
clusterSendFailoverAuthIfNeeded
(
clusterNode
*
node
,
clusterMsg
*
request
)
{
clusterNode
*
master
=
node
->
slaveof
;
...
...
@@ -1975,11 +2019,11 @@ void clusterHandleSlaveFailover(void) {
/* Pre conditions to run the function:
* 1) We are a slave.
* 2) Our master is flagged as FAIL.
* 2) Our master is flagged as FAIL
, or this is a manual failover
.
* 3) It is serving slots. */
if
(
nodeIsMaster
(
myself
)
||
myself
->
slaveof
==
NULL
||
!
nodeFailed
(
myself
->
slaveof
)
||
(
!
nodeFailed
(
myself
->
slaveof
)
&&
server
.
cluster
->
mf_end
==
0
)
||
myself
->
slaveof
->
numslots
==
0
)
return
;
/* Set data_age to the number of seconds we are disconnected from
...
...
@@ -2019,6 +2063,11 @@ void clusterHandleSlaveFailover(void) {
* less updated replication offset, are penalized. */
server
.
cluster
->
failover_auth_time
+=
server
.
cluster
->
failover_auth_rank
*
1000
;
/* However if this is a manual failover, no delay is needed. */
if
(
server
.
cluster
->
mf_end
)
{
server
.
cluster
->
failover_auth_time
=
mstime
();
server
.
cluster
->
failover_auth_rank
=
0
;
}
redisLog
(
REDIS_WARNING
,
"Start of election delayed for %lld milliseconds "
"(rank #%d, offset %lld)."
,
...
...
@@ -2035,7 +2084,9 @@ void clusterHandleSlaveFailover(void) {
/* It is possible that we received more updated offsets from other
* slaves for the same master since we computed our election delay.
* Update the delay if our rank changed. */
if
(
server
.
cluster
->
failover_auth_sent
==
0
)
{
if
(
server
.
cluster
->
failover_auth_sent
==
0
&&
server
.
cluster
->
mf_end
==
0
)
{
int
newrank
=
clusterGetSlaveRank
();
if
(
newrank
>
server
.
cluster
->
failover_auth_rank
)
{
long
long
added_delay
=
...
...
@@ -2102,6 +2153,9 @@ void clusterHandleSlaveFailover(void) {
/* 5) Pong all the other nodes so that they can update the state
* accordingly and detect that we switched to master role. */
clusterBroadcastPong
(
CLUSTER_BROADCAST_ALL
);
/* 6) If there was a manual failover in progress, clear the state. */
resetManualFailover
();
}
}
...
...
@@ -2190,6 +2244,79 @@ void clusterHandleSlaveMigration(int max_slaves) {
}
}
/* -----------------------------------------------------------------------------
* CLUSTER manual failover
*
* This are the important steps performed by slaves during a manual failover:
* 1) User send CLUSTER FAILOVER command. The failover state is initialized
* setting mf_end to the millisecond unix time at which we'll abort the
* attempt.
* 2) Slave sends a MFSTART message to the master requesting to pause clients
* for two times the manual failover timeout REDIS_CLUSTER_MF_TIMEOUT.
* When master is paused for manual failover, it also starts to flag
* packets with CLUSTERMSG_FLAG0_PAUSED.
* 3) Slave waits for master to send its replication offset flagged as PAUSED.
* 4) If slave received the offset from the master, and its offset matches,
* mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform
* the failover as usually, with the difference that the vote request
* will be modified to force masters to vote for a slave that has a
* working master.
*
* From the point of view of the master things are simpler: when a
* PAUSE_CLIENTS packet is received the master sets mf_end as well and
* the sender in mf_slave. During the time limit for the manual failover
* the master will just send PINGs more often to this slave, flagged with
* the PAUSED flag, so that the slave will set mf_master_offset when receiving
* a packet from the master with this flag set.
*
* The gaol of the manual failover is to perform a fast failover without
* data loss due to the asynchronous master-slave replication.
* -------------------------------------------------------------------------- */
/* Reset the manual failover state. This works for both masters and slavesa
* as all the state about manual failover is cleared.
*
* The function can be used both to initialize the manual failover state at
* startup or to abort a manual failover in progress. */
void
resetManualFailover
(
void
)
{
if
(
server
.
cluster
->
mf_end
&&
clientsArePaused
())
{
server
.
clients_pause_end_time
=
0
;
clientsArePaused
();
/* Just use the side effect of the function. */
}
server
.
cluster
->
mf_end
=
0
;
/* No manual failover in progress. */
server
.
cluster
->
mf_can_start
=
0
;
server
.
cluster
->
mf_slave
=
NULL
;
server
.
cluster
->
mf_master_offset
=
0
;
}
/* If a manual failover timed out, abort it. */
void
manualFailoverCheckTimeout
(
void
)
{
if
(
server
.
cluster
->
mf_end
<
mstime
())
{
redisLog
(
REDIS_WARNING
,
"Manual failover timed out."
);
resetManualFailover
();
}
}
/* This function is called from the cluster cron function in order to go
* forward with a manual failover state machine. */
void
clusterHandleManualFailover
(
void
)
{
/* Return ASAP if no manual failover is in progress. */
if
(
server
.
cluster
->
mf_end
==
0
)
return
;
/* If mf_can_start is non-zero, the failover was alrady triggered so the
* next steps are performed by clusterHandleSlaveFailover(). */
if
(
server
.
cluster
->
mf_can_start
)
return
;
if
(
server
.
cluster
->
mf_master_offset
==
0
)
return
;
/* Wait for offset... */
if
(
server
.
cluster
->
mf_master_offset
==
replicationGetSlaveOffset
())
{
/* Our replication offset matches the master replication offset
* announced after clients were paused. We can start the failover. */
server
.
cluster
->
mf_can_start
=
1
;
redisLog
(
REDIS_WARNING
,
"All master replication stream processed, manual failover can start."
);
}
}
/* -----------------------------------------------------------------------------
* CLUSTER cron job
* -------------------------------------------------------------------------- */
...
...
@@ -2351,6 +2478,17 @@ void clusterCron(void) {
continue
;
}
/* If we are a master and one of the slaves requested a manual
* failover, ping it continuously. */
if
(
server
.
cluster
->
mf_end
&&
nodeIsMaster
(
myself
)
&&
server
.
cluster
->
mf_slave
==
node
&&
node
->
link
)
{
clusterSendPing
(
node
->
link
,
CLUSTERMSG_TYPE_PING
);
continue
;
}
/* Check only if we have an active ping for this instance. */
if
(
node
->
ping_sent
==
0
)
continue
;
...
...
@@ -2383,7 +2521,11 @@ void clusterCron(void) {
replicationSetMaster
(
myself
->
slaveof
->
ip
,
myself
->
slaveof
->
port
);
}
/* Abourt a manual failover if the timeout is reached. */
manualFailoverCheckTimeout
();
if
(
nodeIsSlave
(
myself
))
{
clusterHandleManualFailover
();
clusterHandleSlaveFailover
();
/* If there are orphaned slaves, and we are a slave among the masters
* with the max number of non-failing slaves, consider migrating to
...
...
@@ -2700,6 +2842,7 @@ void clusterSetMaster(clusterNode *n) {
myself
->
slaveof
=
n
;
clusterNodeAddSlave
(
n
,
myself
);
replicationSetMaster
(
n
->
ip
,
n
->
port
);
resetManualFailover
();
}
/* -----------------------------------------------------------------------------
...
...
@@ -3152,6 +3295,20 @@ void clusterCommand(redisClient *c) {
addReplyBulkCString
(
c
,
ni
);
sdsfree
(
ni
);
}
}
else
if
(
!
strcasecmp
(
c
->
argv
[
1
]
->
ptr
,
"failover"
)
&&
c
->
argc
==
2
)
{
if
(
nodeIsMaster
(
myself
))
{
addReplyError
(
c
,
"You should send CLUSTER FAILOVER to a slave"
);
return
;
}
else
if
(
myself
->
slaveof
==
NULL
||
nodeFailed
(
myself
->
slaveof
)
||
myself
->
slaveof
->
link
==
NULL
)
{
addReplyError
(
c
,
"Master is down or failed, "
"please use CLUSTER FAILOVER FORCE"
);
return
;
}
resetManualFailover
();
server
.
cluster
->
mf_end
=
mstime
()
+
REDIS_CLUSTER_MF_TIMEOUT
;
clusterSendMFStart
(
myself
->
slaveof
);
}
else
{
addReplyError
(
c
,
"Wrong CLUSTER subcommand or number of arguments"
);
}
...
...
src/cluster.h
浏览文件 @
4cf0cd57
...
...
@@ -22,6 +22,8 @@
#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4
/* Auth request retry time. */
#define REDIS_CLUSTER_FAILOVER_DELAY 5
/* Seconds */
#define REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER 1
#define REDIS_CLUSTER_MF_TIMEOUT 5000
/* Milliseconds to do a manual failover. */
#define REDIS_CLUSTER_MF_PAUSE_MULT 2
/* Master pause manual failover mult. */
struct
clusterNode
;
...
...
@@ -100,6 +102,16 @@ typedef struct clusterState {
int
failover_auth_sent
;
/* True if we already asked for votes. */
int
failover_auth_rank
;
/* This slave rank for current auth request. */
uint64_t
failover_auth_epoch
;
/* Epoch of the current election. */
/* Manual failover state in common. */
mstime_t
mf_end
;
/* Manual failover time limit (ms unixtime).
It is zero if there is no MF in progress. */
/* Manual failover state of master. */
clusterNode
*
mf_slave
;
/* Slave performing the manual failover. */
/* Manual failover state of slave. */
long
long
mf_master_offset
;
/* Master offset the slave needs to start MF
or zero if stil not received. */
int
mf_can_start
;
/* If non-zero signal that the manual failover
can start requesting masters vote. */
/* The followign fields are uesd by masters to take state on elections. */
uint64_t
last_vote_epoch
;
/* Epoch of the last vote granted. */
int
todo_before_sleep
;
/* Things to do in clusterBeforeSleep(). */
...
...
@@ -127,6 +139,7 @@ typedef struct clusterState {
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5
/* May I failover? */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6
/* Yes, you have my vote */
#define CLUSTERMSG_TYPE_UPDATE 7
/* Another node slots configuration */
#define CLUSTERMSG_TYPE_MFSTART 8
/* Pause clients for manual failover */
/* Initially we don't know our "name", but we'll find it once we connect
* to the first node, using the getsockname() function. Then we'll use this
...
...
@@ -180,6 +193,7 @@ union clusterMsgData {
}
update
;
};
typedef
struct
{
uint32_t
totlen
;
/* Total length of this message */
uint16_t
type
;
/* Message type */
...
...
@@ -197,12 +211,16 @@ typedef struct {
uint16_t
port
;
/* Sender TCP base port */
uint16_t
flags
;
/* Sender node flags */
unsigned
char
state
;
/* Cluster state from the POV of the sender */
unsigned
char
notused2
[
3
];
/* Reserved for future use. For alignment
. */
unsigned
char
mflags
[
3
];
/* Message flags: CLUSTERMSG_FLAG[012]_..
. */
union
clusterMsgData
data
;
}
clusterMsg
;
#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData))
/* Message flags better specify the packet content or are used to
* provide some information about the node state. */
#define CLUSTERMSG_FLAG0_PAUSED (1<<0)
/* Master paused for manual failover. */
/* ---------------------- API exported outside cluster.c -------------------- */
clusterNode
*
getNodeByQuery
(
redisClient
*
c
,
struct
redisCommand
*
cmd
,
robj
**
argv
,
int
argc
,
int
*
hashslot
,
int
*
ask
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录