From a48c13aaf1427072013b5d4fe758ab23617e614e Mon Sep 17 00:00:00 2001
From: hjxilinx <hjxilinx@163.com>
Date: Mon, 25 Nov 2019 18:06:37 +0800
Subject: [PATCH] [tbase-1225]

---
 src/client/src/tscServer.c | 26 +++++++++++++++++++-------
 src/util/src/tglobalcfg.c  |  2 +-
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/client/src/tscServer.c b/src/client/src/tscServer.c
index 40db4d934f..6a66b860d7 100644
--- a/src/client/src/tscServer.c
+++ b/src/client/src/tscServer.c
@@ -59,6 +59,22 @@ void tscPrintMgmtIp() {
 }
 #endif
 
+/*
+ * For each management node, try at least twice in case of a poor network connection.
+ * If the client first connects to a non-management node, the first attempt may fail due to
+ * poor network quality, and the second attempt may then receive a response carrying a redirection command.
+ * That retry would then not be executed, since only *two* retries are allowed when the cluster has a single management node.
+ * Therefore, we need to multiply the number of retries by a factor of 2 to fix this problem.
+ */
+static int32_t tscGetMgmtConnMaxRetryTimes() {
+  int32_t factor = 2;  // number of attempts allowed per management node
+#ifdef CLUSTER
+  return tscMgmtIpList.numOfIps * factor;  // scale by the number of management IPs in the list
+#else
+  return 1*factor;  // non-cluster build: counted as a single management node
+#endif
+}
+
 void tscProcessHeartBeatRsp(void *param, TAOS_RES *tres, int code) {
   STscObj *pObj = (STscObj *)param;
   if (pObj == NULL) return;
@@ -134,18 +150,17 @@ void tscProcessActivityTimer(void *handle, void *tmrId) {
   tscProcessSql(pObj->pHb);
 }
 
-//TODO HANDLE error from mgmt
 void tscGetConnToMgmt(SSqlObj *pSql, uint8_t *pCode) {
   STscObj *pTscObj = pSql->pTscObj;
 #ifdef CLUSTER
-  if (pSql->retry < tscMgmtIpList.numOfIps) {
+  if (pSql->retry < tscGetMgmtConnMaxRetryTimes()) {
     *pCode = 0;
     pSql->retry++;
     pSql->index = pSql->index % tscMgmtIpList.numOfIps;
     if (pSql->cmd.command > TSDB_SQL_READ && pSql->index == 0) pSql->index = 1;
     void *thandle = taosGetConnFromCache(tscConnCache, tscMgmtIpList.ip[pSql->index], TSC_MGMT_VNODE, pTscObj->user);
 #else
-  if (pSql->retry < 1) {
+  if (pSql->retry < tscGetMgmtConnMaxRetryTimes()) {
     *pCode = 0;
     pSql->retry++;
     void *thandle = taosGetConnFromCache(tscConnCache, tsServerIp, TSC_MGMT_VNODE, pTscObj->user);
@@ -444,16 +459,13 @@ void *tscProcessMsgFromServer(char *msg, void *ahandle, void *thandle) {
     }
   } else {
     uint16_t rspCode = pMsg->content[0];
-#ifdef CLUSTER
     
+#ifdef CLUSTER
     
     if (rspCode == TSDB_CODE_REDIRECT) {
       tscTrace("%p it shall be redirected!", pSql);
       taosAddConnIntoCache(tscConnCache, thandle, pSql->ip, pSql->vnode, pObj->user);
       pSql->thandle = NULL;
-      
-      // reset the retry times for a new mgmt node
-      pSql->retry = 0;
 
       if (pCmd->command > TSDB_SQL_MGMT) {
         tscProcessMgmtRedirect(pSql, pMsg->content + 1);
diff --git a/src/util/src/tglobalcfg.c b/src/util/src/tglobalcfg.c
index 6991e6b8b6..0dd0e4e2ba 100644
--- a/src/util/src/tglobalcfg.c
+++ b/src/util/src/tglobalcfg.c
@@ -510,7 +510,7 @@ static void doInitGlobalConfig() {
                      0, TSDB_MAX_VNODES, 0, TSDB_CFG_UTYPE_NONE);
   tsInitConfigOption(cfg++, "tables", &tsSessionsPerVnode, TSDB_CFG_VTYPE_INT,
                      TSDB_CFG_CTYPE_B_CONFIG | TSDB_CFG_CTYPE_B_SHOW,
-                     4, 220000, 0, TSDB_CFG_UTYPE_NONE);
+                     TSDB_MIN_TABLES_PER_VNODE, TSDB_MAX_TABLES_PER_VNODE, 0, TSDB_CFG_UTYPE_NONE);
   tsInitConfigOption(cfg++, "cache", &tsCacheBlockSize, TSDB_CFG_VTYPE_INT,
                      TSDB_CFG_CTYPE_B_CONFIG | TSDB_CFG_CTYPE_B_SHOW,
                      100, 1048576, 0, TSDB_CFG_UTYPE_BYTE);
-- 
GitLab