Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
taosdata
TDengine
提交
f7ad45fe
T
TDengine
项目概览
taosdata
/
TDengine
1 年多 前同步成功
通知
1185
Star
22016
Fork
4786
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
TDengine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f7ad45fe
编写于
10月 29, 2020
作者:
陶建辉(Jeff)
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
race condition for timer and forward
上级
cbebf0c7
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
148 addition
and
102 deletion
+148
-102
src/sync/src/syncMain.c
src/sync/src/syncMain.c
+148
-102
未找到文件。
src/sync/src/syncMain.c
浏览文件 @
f7ad45fe
...
...
@@ -20,6 +20,7 @@
#include "tlog.h"
#include "tutil.h"
#include "ttimer.h"
#include "tref.h"
#include "tsocket.h"
#include "tglobal.h"
#include "taoserror.h"
...
...
@@ -43,6 +44,7 @@ char tsNodeFqdn[TSDB_FQDN_LEN];
static
ttpool_h
tsTcpPool
;
static
void
*
syncTmrCtrl
=
NULL
;
static
void
*
vgIdHash
;
static
int
tsSyncRefId
=
-
1
;
// local functions
static
void
syncProcessSyncRequest
(
char
*
pMsg
,
SSyncPeer
*
pPeer
);
...
...
@@ -54,13 +56,13 @@ static int syncProcessPeerMsg(void *param, void *buffer);
static
void
syncProcessIncommingConnection
(
int
connFd
,
uint32_t
sourceIp
);
static
void
syncRemovePeer
(
SSyncPeer
*
pPeer
);
static
void
syncAddArbitrator
(
SSyncNode
*
pNode
);
static
void
syncAddNodeRef
(
SSyncNode
*
pNode
);
static
void
syncDecNodeRef
(
SSyncNode
*
pNode
);
static
void
syncFreeNode
(
void
*
);
static
void
syncRemoveConfirmedFwdInfo
(
SSyncNode
*
pNode
);
static
void
syncMonitorFwdInfos
(
void
*
param
,
void
*
tmrId
);
static
void
syncProcessFwdAck
(
SSyncNode
*
pNode
,
SFwdInfo
*
pFwdInfo
,
int32_t
code
);
static
void
syncSaveFwdInfo
(
SSyncNode
*
pNode
,
uint64_t
version
,
void
*
mhandle
);
static
void
syncRestartPeer
(
SSyncPeer
*
pPeer
);
static
int32_t
syncForwardToPeerImpl
(
SSyncNode
*
pNode
,
void
*
data
,
void
*
mhandle
,
int
qtyp
);
static
SSyncPeer
*
syncAddPeer
(
SSyncNode
*
pNode
,
const
SNodeInfo
*
pInfo
);
char
*
syncRole
[]
=
{
...
...
@@ -106,6 +108,12 @@ int32_t syncInit() {
return
-
1
;
}
tsSyncRefId
=
taosOpenRef
(
200
,
syncFreeNode
);
if
(
tsSyncRefId
<
0
)
{
syncCleanUp
();
return
-
1
;
}
tstrncpy
(
tsNodeFqdn
,
tsLocalFqdn
,
sizeof
(
tsNodeFqdn
));
sInfo
(
"sync module initialized successfully"
);
...
...
@@ -128,6 +136,8 @@ void syncCleanUp() {
vgIdHash
=
NULL
;
}
taosCloseRef
(
tsSyncRefId
);
sInfo
(
"sync module is cleaned up"
);
}
...
...
@@ -159,6 +169,12 @@ void *syncStart(const SSyncInfo *pInfo) {
pNode
->
quorum
=
pCfg
->
quorum
;
if
(
pNode
->
quorum
>
pNode
->
replica
)
pNode
->
quorum
=
pNode
->
replica
;
int
ret
=
taosAddRef
(
tsSyncRefId
,
pNode
);
if
(
ret
<
0
)
{
syncFreeNode
(
pNode
);
return
NULL
;
}
for
(
int
i
=
0
;
i
<
pCfg
->
replica
;
++
i
)
{
const
SNodeInfo
*
pNodeInfo
=
pCfg
->
nodeInfo
+
i
;
pNode
->
peerInfo
[
i
]
=
syncAddPeer
(
pNode
,
pNodeInfo
);
...
...
@@ -167,8 +183,6 @@ void *syncStart(const SSyncInfo *pInfo) {
}
}
syncAddNodeRef
(
pNode
);
if
(
pNode
->
selfIndex
<
0
)
{
sInfo
(
"vgId:%d, this node is not configured"
,
pNode
->
vgId
);
terrno
=
TSDB_CODE_SYN_INVALID_CONFIG
;
...
...
@@ -210,7 +224,9 @@ void syncStop(void *param) {
SSyncNode
*
pNode
=
param
;
SSyncPeer
*
pPeer
;
if
(
pNode
==
NULL
)
return
;
int
ret
=
taosAcquireRef
(
tsSyncRefId
,
pNode
);
if
(
ret
<
0
)
return
;
sInfo
(
"vgId:%d, cleanup sync"
,
pNode
->
vgId
);
pthread_mutex_lock
(
&
(
pNode
->
mutex
));
...
...
@@ -228,14 +244,17 @@ void syncStop(void *param) {
pthread_mutex_unlock
(
&
(
pNode
->
mutex
));
syncDecNodeRef
(
pNode
);
taosReleaseRef
(
tsSyncRefId
,
pNode
);
taosRemoveRef
(
tsSyncRefId
,
pNode
);
}
int32_t
syncReconfig
(
void
*
param
,
const
SSyncCfg
*
pNewCfg
)
{
SSyncNode
*
pNode
=
param
;
int
i
,
j
;
if
(
pNode
==
NULL
)
return
TSDB_CODE_SYN_INVALID_CONFIG
;
int
ret
=
taosAcquireRef
(
tsSyncRefId
,
pNode
);
if
(
ret
<
0
)
return
TSDB_CODE_SYN_INVALID_CONFIG
;
sInfo
(
"vgId:%d, reconfig, role:%s replica:%d old:%d"
,
pNode
->
vgId
,
syncRole
[
nodeRole
],
pNewCfg
->
replica
,
pNode
->
replica
);
...
...
@@ -298,105 +317,63 @@ int32_t syncReconfig(void *param, const SSyncCfg *pNewCfg) {
syncRole
[
nodeRole
]);
syncBroadcastStatus
(
pNode
);
taosReleaseRef
(
tsSyncRefId
,
pNode
);
return
0
;
}
int32_t
syncForwardToPeer
(
void
*
param
,
void
*
data
,
void
*
mhandle
,
int
qtype
)
{
SSyncNode
*
pNode
=
param
;
SSyncPeer
*
pPeer
;
SSyncHead
*
pSyncHead
;
SWalHead
*
pWalHead
=
data
;
int
fwdLen
;
int
code
=
0
;
if
(
pNode
==
NULL
)
return
0
;
int
ret
=
taosAcquireRef
(
tsSyncRefId
,
pNode
);
if
(
ret
<
0
)
return
0
;
if
(
nodeRole
==
TAOS_SYNC_ROLE_SLAVE
&&
pWalHead
->
version
!=
nodeVersion
+
1
)
{
sError
(
"vgId:%d, received ver:%"
PRIu64
", inconsistent with last ver:%"
PRIu64
", restart connection"
,
pNode
->
vgId
,
pWalHead
->
version
,
nodeVersion
);
for
(
int
i
=
0
;
i
<
pNode
->
replica
;
++
i
)
{
pPeer
=
pNode
->
peerInfo
[
i
];
syncRestartConnection
(
pPeer
);
}
return
TSDB_CODE_SYN_INVALID_VERSION
;
}
// always update version
nodeVersion
=
pWalHead
->
version
;
sDebug
(
"vgId:%d, replica:%d nodeRole:%s qtype:%d ver:%"
PRIu64
,
pNode
->
vgId
,
pNode
->
replica
,
syncRole
[
nodeRole
],
qtype
,
pWalHead
->
version
);
int32_t
code
=
syncForwardToPeerImpl
(
pNode
,
data
,
mhandle
,
qtype
);
if
(
pNode
->
replica
==
1
||
nodeRole
!=
TAOS_SYNC_ROLE_MASTER
)
return
0
;
// only pkt from RPC or CQ can be forwarded
if
(
qtype
!=
TAOS_QTYPE_RPC
&&
qtype
!=
TAOS_QTYPE_CQ
)
return
0
;
// a hacker way to improve the performance
pSyncHead
=
(
SSyncHead
*
)(((
char
*
)
pWalHead
)
-
sizeof
(
SSyncHead
));
pSyncHead
->
type
=
TAOS_SMSG_FORWARD
;
pSyncHead
->
pversion
=
0
;
pSyncHead
->
len
=
sizeof
(
SWalHead
)
+
pWalHead
->
len
;
fwdLen
=
pSyncHead
->
len
+
sizeof
(
SSyncHead
);
// include the WAL and SYNC head
pthread_mutex_lock
(
&
(
pNode
->
mutex
));
for
(
int
i
=
0
;
i
<
pNode
->
replica
;
++
i
)
{
pPeer
=
pNode
->
peerInfo
[
i
];
if
(
pPeer
==
NULL
||
pPeer
->
peerFd
<
0
)
continue
;
if
(
pPeer
->
role
!=
TAOS_SYNC_ROLE_SLAVE
&&
pPeer
->
sstatus
!=
TAOS_SYNC_STATUS_CACHE
)
continue
;
if
(
pNode
->
quorum
>
1
&&
code
==
0
)
{
syncSaveFwdInfo
(
pNode
,
pWalHead
->
version
,
mhandle
);
code
=
1
;
}
int
retLen
=
write
(
pPeer
->
peerFd
,
pSyncHead
,
fwdLen
);
if
(
retLen
==
fwdLen
)
{
sDebug
(
"%s, forward is sent, ver:%"
PRIu64
" contLen:%d"
,
pPeer
->
id
,
pWalHead
->
version
,
pWalHead
->
len
);
}
else
{
sError
(
"%s, failed to forward, ver:%"
PRIu64
" retLen:%d"
,
pPeer
->
id
,
pWalHead
->
version
,
retLen
);
syncRestartConnection
(
pPeer
);
}
}
pthread_mutex_unlock
(
&
(
pNode
->
mutex
));
taosReleaseRef
(
tsSyncRefId
,
pNode
);
return
code
;
}
void
syncConfirmForward
(
void
*
param
,
uint64_t
version
,
int32_t
code
)
{
SSyncNode
*
pNode
=
param
;
if
(
pNode
==
NULL
)
return
;
if
(
pNode
->
quorum
<=
1
)
return
;
SSyncPeer
*
pPeer
=
pNode
->
pMaster
;
if
(
pPeer
==
NULL
)
return
;
int
ret
=
taosAcquireRef
(
tsSyncRefId
,
pNode
)
;
if
(
ret
<
0
)
return
;
char
msg
[
sizeof
(
SSyncHead
)
+
sizeof
(
SFwdRsp
)]
=
{
0
};
SSyncPeer
*
pPeer
=
pNode
->
pMaster
;
if
(
pPeer
&&
pNode
->
quorum
>
1
)
{
char
msg
[
sizeof
(
SSyncHead
)
+
sizeof
(
SFwdRsp
)]
=
{
0
};
SSyncHead
*
pHead
=
(
SSyncHead
*
)
msg
;
pHead
->
type
=
TAOS_SMSG_FORWARD_RSP
;
pHead
->
len
=
sizeof
(
SFwdRsp
);
SSyncHead
*
pHead
=
(
SSyncHead
*
)
msg
;
pHead
->
type
=
TAOS_SMSG_FORWARD_RSP
;
pHead
->
len
=
sizeof
(
SFwdRsp
);
SFwdRsp
*
pFwdRsp
=
(
SFwdRsp
*
)(
msg
+
sizeof
(
SSyncHead
));
pFwdRsp
->
version
=
version
;
pFwdRsp
->
code
=
code
;
SFwdRsp
*
pFwdRsp
=
(
SFwdRsp
*
)(
msg
+
sizeof
(
SSyncHead
));
pFwdRsp
->
version
=
version
;
pFwdRsp
->
code
=
code
;
int
msgLen
=
sizeof
(
SSyncHead
)
+
sizeof
(
SFwdRsp
);
int
retLen
=
write
(
pPeer
->
peerFd
,
msg
,
msgLen
);
int
msgLen
=
sizeof
(
SSyncHead
)
+
sizeof
(
SFwdRsp
);
int
retLen
=
write
(
pPeer
->
peerFd
,
msg
,
msgLen
);
if
(
retLen
==
msgLen
)
{
sDebug
(
"%s, forward-rsp is sent, ver:%"
PRIu64
,
pPeer
->
id
,
version
);
}
else
{
sDebug
(
"%s, failed to send forward ack, restart"
,
pPeer
->
id
);
syncRestartConnection
(
pPeer
);
if
(
retLen
==
msgLen
)
{
sDebug
(
"%s, forward-rsp is sent, ver:%"
PRIu64
,
pPeer
->
id
,
version
);
}
else
{
sDebug
(
"%s, failed to send forward ack, restart"
,
pPeer
->
id
);
syncRestartConnection
(
pPeer
);
}
}
taosReleaseRef
(
tsSyncRefId
,
pNode
);
}
void
syncRecover
(
void
*
param
)
{
SSyncNode
*
pNode
=
param
;
SSyncPeer
*
pPeer
;
int
ret
=
taosAcquireRef
(
tsSyncRefId
,
pNode
);
if
(
ret
<
0
)
return
;
// to do: add a few lines to check if recover is OK
// if take this node to unsync state, the whole system may not work
...
...
@@ -414,17 +391,24 @@ void syncRecover(void *param) {
}
pthread_mutex_unlock
(
&
(
pNode
->
mutex
));
taosReleaseRef
(
tsSyncRefId
,
pNode
);
}
int
syncGetNodesRole
(
void
*
param
,
SNodesRole
*
pNodesRole
)
{
SSyncNode
*
pNode
=
param
;
int
ret
=
taosAcquireRef
(
tsSyncRefId
,
pNode
);
if
(
ret
<
0
)
return
-
1
;
pNodesRole
->
selfIndex
=
pNode
->
selfIndex
;
for
(
int
i
=
0
;
i
<
pNode
->
replica
;
++
i
)
{
pNodesRole
->
nodeId
[
i
]
=
pNode
->
peerInfo
[
i
]
->
nodeId
;
pNodesRole
->
role
[
i
]
=
pNode
->
peerInfo
[
i
]
->
role
;
}
taosReleaseRef
(
tsSyncRefId
,
pNode
);
return
0
;
}
...
...
@@ -457,22 +441,20 @@ static void syncAddArbitrator(SSyncNode *pNode) {
pNode
->
peerInfo
[
TAOS_SYNC_MAX_REPLICA
]
=
syncAddPeer
(
pNode
,
&
nodeInfo
);
}
static
void
syncAddNodeRef
(
SSyncNode
*
pNode
)
{
atomic_add_fetch_8
(
&
pNode
->
refCount
,
1
);
}
static
void
syncFreeNode
(
void
*
param
)
{
SSyncNode
*
pNode
=
param
;
static
void
syncDecNodeRef
(
SSyncNode
*
pNode
)
{
if
(
atomic_sub_fetch_8
(
&
pNode
->
refCount
,
1
)
==
0
)
{
pthread_mutex_destroy
(
&
pNode
->
mutex
);
taosTFree
(
pNode
->
pRecv
);
taosTFree
(
pNode
->
pSyncFwds
);
taosTFree
(
pNode
);
}
pthread_mutex_destroy
(
&
pNode
->
mutex
);
taosTFree
(
pNode
->
pRecv
);
taosTFree
(
pNode
->
pSyncFwds
);
taosTFree
(
pNode
);
}
void
syncAddPeerRef
(
SSyncPeer
*
pPeer
)
{
atomic_add_fetch_8
(
&
pPeer
->
refCount
,
1
);
}
int
syncDecPeerRef
(
SSyncPeer
*
pPeer
)
{
if
(
atomic_sub_fetch_8
(
&
pPeer
->
refCount
,
1
)
==
0
)
{
syncDecNodeRef
(
pPeer
->
pSyncNode
);
taosReleaseRef
(
tsSyncRefId
,
pPeer
->
pSyncNode
);
sDebug
(
"%s, resource is freed"
,
pPeer
->
id
);
taosTFree
(
pPeer
->
watchFd
);
...
...
@@ -529,7 +511,7 @@ static SSyncPeer *syncAddPeer(SSyncNode *pNode, const SNodeInfo *pInfo) {
taosTmrReset
(
syncCheckPeerConnection
,
checkMs
,
pPeer
,
syncTmrCtrl
,
&
pPeer
->
timer
);
}
syncAddNodeRef
(
pNode
);
taosAcquireRef
(
tsSyncRefId
,
pNode
);
return
pPeer
;
}
...
...
@@ -1122,7 +1104,6 @@ static void syncProcessBrokenLink(void *param) {
SSyncPeer
*
pPeer
=
param
;
SSyncNode
*
pNode
=
pPeer
->
pSyncNode
;
syncAddNodeRef
(
pNode
);
pthread_mutex_lock
(
&
(
pNode
->
mutex
));
sDebug
(
"%s, TCP link is broken(%s)"
,
pPeer
->
id
,
strerror
(
errno
));
...
...
@@ -1133,7 +1114,6 @@ static void syncProcessBrokenLink(void *param) {
}
pthread_mutex_unlock
(
&
(
pNode
->
mutex
));
syncDecNodeRef
(
pNode
);
}
static
void
syncSaveFwdInfo
(
SSyncNode
*
pNode
,
uint64_t
version
,
void
*
mhandle
)
{
...
...
@@ -1202,22 +1182,88 @@ static void syncProcessFwdAck(SSyncNode *pNode, SFwdInfo *pFwdInfo, int32_t code
static
void
syncMonitorFwdInfos
(
void
*
param
,
void
*
tmrId
)
{
SSyncNode
*
pNode
=
param
;
int
ret
=
taosAcquireRef
(
tsSyncRefId
,
pNode
);
if
(
ret
<
0
)
return
;
SSyncFwds
*
pSyncFwds
=
pNode
->
pSyncFwds
;
if
(
pSyncFwds
==
NULL
)
return
;
uint64_t
time
=
taosGetTimestampMs
();
if
(
pSyncFwds
)
{;
uint64_t
time
=
taosGetTimestampMs
();
if
(
pSyncFwds
->
fwds
>
0
)
{
pthread_mutex_lock
(
&
(
pNode
->
mutex
));
for
(
int
i
=
0
;
i
<
pSyncFwds
->
fwds
;
++
i
)
{
SFwdInfo
*
pFwdInfo
=
pSyncFwds
->
fwdInfo
+
(
pSyncFwds
->
first
+
i
)
%
tsMaxFwdInfo
;
if
(
time
-
pFwdInfo
->
time
<
2000
)
break
;
syncProcessFwdAck
(
pNode
,
pFwdInfo
,
TSDB_CODE_RPC_NETWORK_UNAVAIL
);
if
(
pSyncFwds
->
fwds
>
0
)
{
pthread_mutex_lock
(
&
(
pNode
->
mutex
));
for
(
int
i
=
0
;
i
<
pSyncFwds
->
fwds
;
++
i
)
{
SFwdInfo
*
pFwdInfo
=
pSyncFwds
->
fwdInfo
+
(
pSyncFwds
->
first
+
i
)
%
tsMaxFwdInfo
;
if
(
time
-
pFwdInfo
->
time
<
2000
)
break
;
syncProcessFwdAck
(
pNode
,
pFwdInfo
,
TSDB_CODE_RPC_NETWORK_UNAVAIL
);
}
syncRemoveConfirmedFwdInfo
(
pNode
);
pthread_mutex_unlock
(
&
(
pNode
->
mutex
));
}
syncRemoveConfirmedFwdInfo
(
pNode
);
pthread_mutex_unlock
(
&
(
pNode
->
mutex
));
pNode
->
pFwdTimer
=
taosTmrStart
(
syncMonitorFwdInfos
,
300
,
pNode
,
syncTmrCtrl
);
}
}
pNode
->
pFwdTimer
=
taosTmrStart
(
syncMonitorFwdInfos
,
300
,
pNode
,
syncTmrCtrl
);
static
int32_t
syncForwardToPeerImpl
(
SSyncNode
*
pNode
,
void
*
data
,
void
*
mhandle
,
int
qtype
)
{
SSyncPeer
*
pPeer
;
SSyncHead
*
pSyncHead
;
SWalHead
*
pWalHead
=
data
;
int
fwdLen
;
int32_t
code
=
0
;
if
(
nodeRole
==
TAOS_SYNC_ROLE_SLAVE
&&
pWalHead
->
version
!=
nodeVersion
+
1
)
{
sError
(
"vgId:%d, received ver:%"
PRIu64
", inconsistent with last ver:%"
PRIu64
", restart connection"
,
pNode
->
vgId
,
pWalHead
->
version
,
nodeVersion
);
for
(
int
i
=
0
;
i
<
pNode
->
replica
;
++
i
)
{
pPeer
=
pNode
->
peerInfo
[
i
];
syncRestartConnection
(
pPeer
);
}
return
TSDB_CODE_SYN_INVALID_VERSION
;
}
// always update version
nodeVersion
=
pWalHead
->
version
;
sDebug
(
"vgId:%d, replica:%d nodeRole:%s qtype:%d ver:%"
PRIu64
,
pNode
->
vgId
,
pNode
->
replica
,
syncRole
[
nodeRole
],
qtype
,
pWalHead
->
version
);
if
(
pNode
->
replica
==
1
||
nodeRole
!=
TAOS_SYNC_ROLE_MASTER
)
return
0
;
// only pkt from RPC or CQ can be forwarded
if
(
qtype
!=
TAOS_QTYPE_RPC
&&
qtype
!=
TAOS_QTYPE_CQ
)
return
0
;
// a hacker way to improve the performance
pSyncHead
=
(
SSyncHead
*
)(((
char
*
)
pWalHead
)
-
sizeof
(
SSyncHead
));
pSyncHead
->
type
=
TAOS_SMSG_FORWARD
;
pSyncHead
->
pversion
=
0
;
pSyncHead
->
len
=
sizeof
(
SWalHead
)
+
pWalHead
->
len
;
fwdLen
=
pSyncHead
->
len
+
sizeof
(
SSyncHead
);
// include the WAL and SYNC head
pthread_mutex_lock
(
&
(
pNode
->
mutex
));
for
(
int
i
=
0
;
i
<
pNode
->
replica
;
++
i
)
{
pPeer
=
pNode
->
peerInfo
[
i
];
if
(
pPeer
==
NULL
||
pPeer
->
peerFd
<
0
)
continue
;
if
(
pPeer
->
role
!=
TAOS_SYNC_ROLE_SLAVE
&&
pPeer
->
sstatus
!=
TAOS_SYNC_STATUS_CACHE
)
continue
;
if
(
pNode
->
quorum
>
1
&&
code
==
0
)
{
syncSaveFwdInfo
(
pNode
,
pWalHead
->
version
,
mhandle
);
code
=
1
;
}
int
retLen
=
write
(
pPeer
->
peerFd
,
pSyncHead
,
fwdLen
);
if
(
retLen
==
fwdLen
)
{
sDebug
(
"%s, forward is sent, ver:%"
PRIu64
" contLen:%d"
,
pPeer
->
id
,
pWalHead
->
version
,
pWalHead
->
len
);
}
else
{
sError
(
"%s, failed to forward, ver:%"
PRIu64
" retLen:%d"
,
pPeer
->
id
,
pWalHead
->
version
,
retLen
);
syncRestartConnection
(
pPeer
);
}
}
pthread_mutex_unlock
(
&
(
pNode
->
mutex
));
return
code
;
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录