提交 07d234af 编写于 作者: S Shuduo Sang

Merge branch 'develop' into hotfix/sangshuduo/TD-3143-taosdemo-windows

......@@ -225,7 +225,13 @@ SHOW MNODES;
## <a class="anchor" id="arbitrator"></a>Arbitrator的使用
如果副本数为偶数,当一个vnode group里一半vnode不工作时,是无法从中选出master的。同理,一半mnode不工作时,是无法选出mnode的master的,因为存在“split brain”问题。为解决这个问题,TDengine引入了Arbitrator的概念。Arbitrator模拟一个vnode或mnode在工作,但只简单的负责网络连接,不处理任何数据插入或访问。只要包含Arbitrator在内,超过半数的vnode或mnode工作,那么该vnode group或mnode组就可以正常的提供数据插入或查询服务。比如对于副本数为2的情形,如果一个节点A离线,但另外一个节点B正常,而且能连接到Arbitrator,那么节点B就能正常工作。
如果副本数为偶数,当一个 vnode group 里一半 vnode 不工作时,是无法从中选出 master 的。同理,一半 mnode 不工作时,是无法选出 mnode 的 master 的,因为存在“split brain”问题。为解决这个问题,TDengine 引入了 Arbitrator 的概念。Arbitrator 模拟一个 vnode 或 mnode 在工作,但只简单的负责网络连接,不处理任何数据插入或访问。只要包含 Arbitrator 在内,超过半数的 vnode 或 mnode 工作,那么该 vnode group 或 mnode 组就可以正常的提供数据插入或查询服务。比如对于副本数为 2 的情形,如果一个节点 A 离线,但另外一个节点 B 正常,而且能连接到 Arbitrator,那么节点 B 就能正常工作。
TDengine提供一个执行程序,名为 tarbitrator,找任何一台Linux服务器运行它即可。请点击[安装包下载](https://www.taosdata.com/cn/all-downloads/),在TDengine Arbitrator Linux一节中,选择适合的版本下载并安装。该程序对系统资源几乎没有要求,只需要保证有网络连接即可。该应用的命令行参数`-p`可以指定其对外服务的端口号,缺省是6042。配置每个taosd实例时,可以在配置文件taos.cfg里将参数arbitrator设置为Arbitrator的End Point。如果该参数配置了,当副本数为偶数时,系统将自动连接配置的Arbitrator。如果副本数为奇数,即使配置了Arbitrator,系统也不会去建立连接。
总之,在目前版本下,TDengine 建议在双副本环境要配置 Arbitrator,以提升系统的可用性。
Arbitrator 的执行程序名为 tarbitrator。该程序对系统资源几乎没有要求,只需要保证有网络连接,找任何一台 Linux 服务器运行它即可。以下简要描述安装配置的步骤:
1. 请点击 [安装包下载](https://www.taosdata.com/cn/all-downloads/),在 TDengine Arbitrator Linux 一节中,选择合适的版本下载并安装。
2. 该应用的命令行参数 `-p` 可以指定其对外服务的端口号,缺省是 6042。
3. 修改每个 taosd 实例的配置文件,在 taos.cfg 里将参数 arbitrator 设置为 tarbitrator 程序所对应的 End Point。(如果该参数配置了,当副本数为偶数时,系统将自动连接配置的 Arbitrator。如果副本数为奇数,即使配置了 Arbitrator,系统也不会去建立连接。)
4. 在配置文件中配置了的 Arbitrator,会出现在 `SHOW DNODES;` 指令的返回结果中,对应的 role 列的值会是“arb”。
......@@ -198,6 +198,14 @@ void dnodeCleanupVnodes() {
static void dnodeProcessStatusRsp(SRpcMsg *pMsg) {
if (pMsg->code != TSDB_CODE_SUCCESS) {
dError("status rsp is received, error:%s", tstrerror(pMsg->code));
if (pMsg->code == TSDB_CODE_MND_DNODE_NOT_EXIST) {
char clusterId[TSDB_CLUSTER_ID_LEN];
dnodeGetClusterId(clusterId);
if (clusterId[0] != '\0') {
dError("exit zombie dropped dnode");
exit(EXIT_FAILURE);
}
}
taosTmrReset(dnodeSendStatusMsg, tsStatusInterval * 1000, NULL, tsDnodeTmr, &tsStatusTimer);
return;
}
......
此差异已折叠。
......@@ -35,16 +35,19 @@ import os
import signal
import traceback
import resource
from guppy import hpy
# from guppy import hpy
import gc
from crash_gen.service_manager import ServiceManager, TdeInstance
from crash_gen.misc import Logging, Status, CrashGenError, Dice, Helper, Progress
from crash_gen.db import DbConn, MyTDSql, DbConnNative, DbManager
import crash_gen.settings
import taos
import requests
crash_gen.settings.init()
# Require Python 3
if sys.version_info[0] < 3:
raise Exception("Must be using Python 3")
......@@ -259,6 +262,7 @@ class ThreadCoordinator:
self._execStats = ExecutionStats()
self._runStatus = Status.STATUS_RUNNING
self._initDbs()
self._stepStartTime = None # Track how long it takes to execute each step
def getTaskExecutor(self):
return self._te
......@@ -394,6 +398,10 @@ class ThreadCoordinator:
try:
self._syncAtBarrier() # For now just cross the barrier
Progress.emit(Progress.END_THREAD_STEP)
if self._stepStartTime :
stepExecTime = time.time() - self._stepStartTime
Progress.emitStr('{:.3f}s/{}'.format(stepExecTime, DbConnNative.totalRequests))
DbConnNative.resetTotalRequests() # reset to zero
except threading.BrokenBarrierError as err:
self._execStats.registerFailure("Aborted due to worker thread timeout")
Logging.error("\n")
......@@ -433,6 +441,7 @@ class ThreadCoordinator:
# Then we move on to the next step
Progress.emit(Progress.BEGIN_THREAD_STEP)
self._stepStartTime = time.time()
self._releaseAllWorkerThreads(transitionFailed)
if hasAbortedTask or transitionFailed : # abnormal ending, workers waiting at "gate"
......@@ -691,7 +700,7 @@ class AnyState:
def canDropDb(self):
# If user requests to run up to a number of DBs,
# we'd then not do drop_db operations any more
if gConfig.max_dbs > 0 :
if gConfig.max_dbs > 0 or gConfig.use_shadow_db :
return False
return self._info[self.CAN_DROP_DB]
......@@ -699,6 +708,8 @@ class AnyState:
return self._info[self.CAN_CREATE_FIXED_SUPER_TABLE]
def canDropFixedSuperTable(self):
if gConfig.use_shadow_db: # duplicate writes to shaddow DB, in which case let's disable dropping s-table
return False
return self._info[self.CAN_DROP_FIXED_SUPER_TABLE]
def canAddData(self):
......@@ -1037,7 +1048,7 @@ class Database:
_clsLock = threading.Lock() # class wide lock
_lastInt = 101 # next one is initial integer
_lastTick = 0
_lastLaggingTick = 0 # lagging tick, for unsequenced insersions
_lastLaggingTick = 0 # lagging tick, for out-of-sequence (oos) data insertions
def __init__(self, dbNum: int, dbc: DbConn): # TODO: remove dbc
self._dbNum = dbNum # we assign a number to databases, for our testing purpose
......@@ -1093,21 +1104,24 @@ class Database:
t3 = datetime.datetime(2012, 1, 1) # default "keep" is 10 years
t4 = datetime.datetime.fromtimestamp(
t3.timestamp() + elSec2) # see explanation above
Logging.debug("Setting up TICKS to start from: {}".format(t4))
Logging.info("Setting up TICKS to start from: {}".format(t4))
return t4
@classmethod
def getNextTick(cls):
def getNextTick(cls):
'''
Fetch a timestamp tick, with some random factor, may not be unique.
'''
with cls._clsLock: # prevent duplicate tick
if cls._lastLaggingTick==0 or cls._lastTick==0 : # not initialized
# 10k at 1/20 chance, should be enough to avoid overlaps
tick = cls.setupLastTick()
cls._lastTick = tick
cls._lastLaggingTick = tick + datetime.timedelta(0, -10000)
cls._lastLaggingTick = tick + datetime.timedelta(0, -60*2) # lagging behind 2 minutes, should catch up fast
# if : # should be quite a bit into the future
if Dice.throw(20) == 0: # 1 in 20 chance, return lagging tick
cls._lastLaggingTick += datetime.timedelta(0, 1) # Go back in time 100 seconds
if gConfig.mix_oos_data and Dice.throw(20) == 0: # if asked to do so, and 1 in 20 chance, return lagging tick
cls._lastLaggingTick += datetime.timedelta(0, 1) # pick the next sequence from the lagging tick sequence
return cls._lastLaggingTick
else: # regular
# add one second to it
......@@ -1334,7 +1348,8 @@ class Task():
elif self._isErrAcceptable(errno2, err.__str__()):
self.logDebug("[=] Acceptable Taos library exception: errno=0x{:X}, msg: {}, SQL: {}".format(
errno2, err, wt.getDbConn().getLastSql()))
print("_", end="", flush=True)
# print("_", end="", flush=True)
Progress.emit(Progress.ACCEPTABLE_ERROR)
self._err = err
else: # not an acceptable error
errMsg = "[=] Unexpected Taos library exception ({}): errno=0x{:X}, msg: {}, SQL: {}".format(
......@@ -1563,8 +1578,11 @@ class TaskCreateDb(StateTransitionTask):
# numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N
numReplica = gConfig.max_replicas # fixed, always
repStr = "replica {}".format(numReplica)
self.execWtSql(wt, "create database {} {}"
.format(self._db.getName(), repStr) )
updatePostfix = "update 1" if gConfig.verify_data else "" # allow update only when "verify data" is active
dbName = self._db.getName()
self.execWtSql(wt, "create database {} {} {} ".format(dbName, repStr, updatePostfix ) )
if dbName == "db_0" and gConfig.use_shadow_db:
self.execWtSql(wt, "create database {} {} {} ".format("db_s", repStr, updatePostfix ) )
class TaskDropDb(StateTransitionTask):
@classmethod
......@@ -1774,13 +1792,13 @@ class TdSuperTable:
]) # TODO: add more from 'top'
if aggExpr not in ['stddev(speed)']: #TODO: STDDEV not valid for super tables?!
sql = "select {} from {}.{}".format(aggExpr, self._dbName, self.getName())
if Dice.throw(3) == 0: # 1 in X chance
sql = sql + ' GROUP BY color'
Progress.emit(Progress.QUERY_GROUP_BY)
# Logging.info("Executing GROUP-BY query: " + sql)
ret.append(SqlQuery(sql))
# if aggExpr not in ['stddev(speed)']: # STDDEV not valid for super tables?! (Done in TD-1049)
sql = "select {} from {}.{}".format(aggExpr, self._dbName, self.getName())
if Dice.throw(3) == 0: # 1 in X chance
sql = sql + ' GROUP BY color'
Progress.emit(Progress.QUERY_GROUP_BY)
# Logging.info("Executing GROUP-BY query: " + sql)
ret.append(SqlQuery(sql))
return ret
......@@ -1988,7 +2006,7 @@ class TaskAddData(StateTransitionTask):
numRecords = self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS
fullTableName = db.getName() + '.' + regTableName
sql = "insert into {} values ".format(fullTableName)
sql = "INSERT INTO {} VALUES ".format(fullTableName)
for j in range(numRecords): # number of records per table
nextInt = db.getNextInt()
nextTick = db.getNextTick()
......@@ -2016,12 +2034,24 @@ class TaskAddData(StateTransitionTask):
# print("_w" + str(nextInt % 100), end="", flush=True) # Trace what was written
try:
sql = "insert into {} values ('{}', {}, '{}');".format( # removed: tags ('{}', {})
sql = "INSERT INTO {} VALUES ('{}', {}, '{}');".format( # removed: tags ('{}', {})
fullTableName,
# ds.getFixedSuperTableName(),
# ds.getNextBinary(), ds.getNextFloat(),
nextTick, nextInt, nextColor)
dbc.execute(sql)
# Quick hack, attach an update statement here. TODO: create an "update" task
if (not gConfig.use_shadow_db) and Dice.throw(5) == 0: # 1 in N chance, plus not using shaddow DB
nextInt = db.getNextInt()
nextColor = db.getNextColor()
sql = "INSERt INTO {} VALUES ('{}', {}, '{}');".format( # "INSERt" means "update" here
fullTableName,
nextTick, nextInt, nextColor)
# sql = "UPDATE {} set speed={}, color='{}' WHERE ts='{}'".format(
# fullTableName, db.getNextInt(), db.getNextColor(), nextTick)
dbc.execute(sql)
except: # Any exception at all
if gConfig.verify_data:
self.unlockTable(fullTableName)
......@@ -2070,7 +2100,8 @@ class TaskAddData(StateTransitionTask):
random.shuffle(tblSeq) # now we have random sequence
for i in tblSeq:
if (i in self.activeTable): # wow already active
print("x", end="", flush=True) # concurrent insertion
# print("x", end="", flush=True) # concurrent insertion
Progress.emit(Progress.CONCURRENT_INSERTION)
else:
self.activeTable.add(i) # marking it active
......@@ -2373,6 +2404,11 @@ class MainExec:
'--larger-data',
action='store_true',
help='Write larger amount of data during write operations (default: false)')
parser.add_argument(
'-m',
'--mix-oos-data',
action='store_false',
help='Mix out-of-sequence data into the test data stream (default: true)')
parser.add_argument(
'-n',
'--dynamic-db-table-names',
......@@ -2414,6 +2450,11 @@ class MainExec:
'--verify-data',
action='store_true',
help='Verify data written in a number of places by reading back (default: false)')
parser.add_argument(
'-w',
'--use-shadow-db',
action='store_true',
help='Use a shaddow database to verify data integrity (default: false)')
parser.add_argument(
'-x',
'--continue-on-exception',
......@@ -2422,6 +2463,11 @@ class MainExec:
global gConfig
gConfig = parser.parse_args()
crash_gen.settings.gConfig = gConfig # TODO: fix this hack, consolidate this global var
# Sanity check for arguments
if gConfig.use_shadow_db and gConfig.max_dbs>1 :
raise CrashGenError("Cannot combine use-shadow-db with max-dbs of more than 1")
Logging.clsInit(gConfig)
......
......@@ -18,6 +18,8 @@ import datetime
import traceback
# from .service_manager import TdeInstance
import crash_gen.settings
class DbConn:
TYPE_NATIVE = "native-c"
TYPE_REST = "rest-api"
......@@ -244,7 +246,7 @@ class MyTDSql:
self._conn.close() # TODO: very important, cursor close does NOT close DB connection!
self._cursor.close()
def _execInternal(self, sql):
def _execInternal(self, sql):
startTime = time.time()
# Logging.debug("Executing SQL: " + sql)
ret = self._cursor.execute(sql)
......@@ -257,6 +259,27 @@ class MyTDSql:
cls.longestQuery = sql
cls.longestQueryTime = queryTime
cls.lqStartTime = startTime
# Now write to the shadow database
if crash_gen.settings.gConfig.use_shadow_db:
if sql[:11] == "INSERT INTO":
if sql[:16] == "INSERT INTO db_0":
sql2 = "INSERT INTO db_s" + sql[16:]
self._cursor.execute(sql2)
else:
raise CrashGenError("Did not find db_0 in INSERT statement: {}".format(sql))
else: # not an insert statement
pass
if sql[:12] == "CREATE TABLE":
if sql[:17] == "CREATE TABLE db_0":
sql2 = sql.replace('db_0', 'db_s')
self._cursor.execute(sql2)
else:
raise CrashGenError("Did not find db_0 in CREATE TABLE statement: {}".format(sql))
else: # not an insert statement
pass
return ret
def query(self, sql):
......@@ -302,12 +325,18 @@ class DbConnNative(DbConn):
_lock = threading.Lock()
# _connInfoDisplayed = False # TODO: find another way to display this
totalConnections = 0 # Not private
totalRequests = 0
def __init__(self, dbTarget):
super().__init__(dbTarget)
self._type = self.TYPE_NATIVE
self._conn = None
# self._cursor = None
# self._cursor = None
@classmethod
def resetTotalRequests(cls):
with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!!
cls.totalRequests = 0
def openByType(self): # Open connection
# global gContainer
......@@ -356,6 +385,8 @@ class DbConnNative(DbConn):
Logging.debug("[SQL] Executing SQL: {}".format(sql))
self._lastSql = sql
nRows = self._tdSql.execute(sql)
cls = self.__class__
cls.totalRequests += 1
Logging.debug(
"[SQL] Execution Result, nRows = {}, SQL = {}".format(
nRows, sql))
......@@ -369,6 +400,8 @@ class DbConnNative(DbConn):
Logging.debug("[SQL] Executing SQL: {}".format(sql))
self._lastSql = sql
nRows = self._tdSql.query(sql)
cls = self.__class__
cls.totalRequests += 1
Logging.debug(
"[SQL] Query Result, nRows = {}, SQL = {}".format(
nRows, sql))
......
......@@ -176,11 +176,13 @@ class Progress:
SERVICE_START_NAP = 7
CREATE_TABLE_ATTEMPT = 8
QUERY_GROUP_BY = 9
CONCURRENT_INSERTION = 10
ACCEPTABLE_ERROR = 11
tokens = {
STEP_BOUNDARY: '.',
BEGIN_THREAD_STEP: '[',
END_THREAD_STEP: '] ',
BEGIN_THREAD_STEP: ' [',
END_THREAD_STEP: ']',
SERVICE_HEART_BEAT: '.Y.',
SERVICE_RECONNECT_START: '<r.',
SERVICE_RECONNECT_SUCCESS: '.r>',
......@@ -188,8 +190,14 @@ class Progress:
SERVICE_START_NAP: '_zz',
CREATE_TABLE_ATTEMPT: 'c',
QUERY_GROUP_BY: 'g',
CONCURRENT_INSERTION: 'x',
ACCEPTABLE_ERROR: '_',
}
@classmethod
def emit(cls, token):
print(cls.tokens[token], end="", flush=True)
@classmethod
def emitStr(cls, str):
print('({})'.format(str), end="", flush=True)
from __future__ import annotations
import argparse
gConfig: argparse.Namespace
def init():
global gConfig
gConfig = []
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册