/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#define _DEFAULT_SOURCE
#include "os.h"
#include "tp.h"
#include "taosmsg.h"
#include "taoserror.h"
#include "tglobal.h"
#include "tqueue.h"
#include "ttimer.h"
#include "dnode.h"
#include "vnodeStatus.h"

// Upper bounds on the per-vnode write queue: number of queued messages and total queued bytes.
#define MAX_QUEUED_MSG_NUM 100000
// Parenthesized so the expansion stays a single operand inside larger expressions (e.g. `x / MAX_QUEUED_MSG_SIZE`).
#define MAX_QUEUED_MSG_SIZE (1024 * 1024 * 1024)  // 1GB

// Submit statistics, updated atomically in vnodeProcessSubmitMsg and read-and-reset in vnodeGetStatisInfo.
// tsSubmitReqSucNum: submit requests that were inserted successfully and carried a response buffer.
static int64_t tsSubmitReqSucNum = 0;
// tsSubmitRowNum: sum of numOfRows reported in submit responses.
static int64_t tsSubmitRowNum = 0;
// tsSubmitRowSucNum: sum of affectedRows reported in submit responses.
static int64_t tsSubmitRowSucNum = 0;

// Timer handle defined outside this file; used to schedule flow-control retries.
extern void *  tsDnodeTmr;
// Handler table indexed by WAL message type; filled in vnodeInitWrite, unregistered slots stay NULL.
static int32_t (*vnodeProcessWriteMsgFp[TSDB_MSG_TYPE_MAX])(SVnodeObj *, void *pCont, SRspRet *);
// Per-message-type handlers stored in the table above.
static int32_t vnodeProcessSubmitMsg(SVnodeObj *pVnode, void *pCont, SRspRet *);
static int32_t vnodeProcessCreateTableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *);
static int32_t vnodeProcessDropTableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *);
static int32_t vnodeProcessAlterTableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *);
static int32_t vnodeProcessDropStableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *);
static int32_t vnodeProcessUpdateTagValMsg(SVnodeObj *pVnode, void *pCont, SRspRet *);
// Internal helpers defined later in this file.
static int32_t vnodePerformFlowCtrl(SVWriteMsg *pWrite);
static int32_t vnodeCheckWal(SVnodeObj *pVnode);

int32_t vnodeInitWrite(void) {
  vnodeProcessWriteMsgFp[TSDB_MSG_TYPE_SUBMIT]          = vnodeProcessSubmitMsg;
  vnodeProcessWriteMsgFp[TSDB_MSG_TYPE_MD_CREATE_TABLE] = vnodeProcessCreateTableMsg;
  vnodeProcessWriteMsgFp[TSDB_MSG_TYPE_MD_DROP_TABLE]   = vnodeProcessDropTableMsg;
  vnodeProcessWriteMsgFp[TSDB_MSG_TYPE_MD_ALTER_TABLE]  = vnodeProcessAlterTableMsg;
  vnodeProcessWriteMsgFp[TSDB_MSG_TYPE_MD_DROP_STABLE]  = vnodeProcessDropStableMsg;
  vnodeProcessWriteMsgFp[TSDB_MSG_TYPE_UPDATE_TAG_VAL]  = vnodeProcessUpdateTagValMsg;

  return 0;
}

// Cleanup counterpart of vnodeInitWrite. Currently a no-op.
void vnodeCleanupWrite(void) {}

// Applies one write message to a vnode: assigns/validates its version, forwards it to the
// sync peers, appends it to the WAL and finally dispatches it to the per-type handler.
//
// vparam - the vnode (SVnodeObj *)
// wparam - the WAL head of the message (SWalHead *); its version field may be rewritten here
// qtype  - queue type the message came from (TAOS_QTYPE_*)
// rparam - the queued write message (SVWriteMsg *), may be NULL
//
// Returns the (non-negative) result of syncForwardToPeer on success, 0 when the message is
// older than the vnode version and is skipped, or an error code otherwise.
int32_t vnodeProcessWrite(void *vparam, void *wparam, int32_t qtype, void *rparam) {
  int32_t    code = 0;
  SVnodeObj *pVnode = vparam;
  SWalHead * pHead = wparam;
  SVWriteMsg*pWrite = rparam;

  // the handler fills a response only when a write message is attached ...
  SRspRet *pRspRet = NULL;
  if (pWrite != NULL) pRspRet = &pWrite->rspRet;
  // if wal and forward write , no need response
  if( qtype == TAOS_QTYPE_WAL || qtype == TAOS_QTYPE_FWD) {
    pRspRet = NULL;
  }   

  // reject message types that have no registered handler
  if (vnodeProcessWriteMsgFp[pHead->msgType] == NULL) {
    vError("vgId:%d, msg:%s not processed since no handle, qtype:%s hver:%" PRIu64, pVnode->vgId,
           taosMsg[pHead->msgType], qtypeStr[qtype], pHead->version);
    return TSDB_CODE_VND_MSG_NOT_PROCESSED;
  }

  vTrace("vgId:%d, msg:%s will be processed in vnode, qtype:%s hver:%" PRIu64 " vver:%" PRIu64, pVnode->vgId,
         taosMsg[pHead->msgType], qtypeStr[qtype], pHead->version, pVnode->version);

  if (pHead->version == 0) {  // from client or CQ
    // new writes are accepted only by a vnode that is ready and acts as master
    if (!vnodeInReadyStatus(pVnode)) {
      vDebug("vgId:%d, msg:%s not processed since vstatus:%d, qtype:%s hver:%" PRIu64, pVnode->vgId,
             taosMsg[pHead->msgType], pVnode->status, qtypeStr[qtype], pHead->version);
      return TSDB_CODE_APP_NOT_READY;  // it may be in deleting or closing state
    }

    if (pVnode->role != TAOS_SYNC_ROLE_MASTER) {
      vDebug("vgId:%d, msg:%s not processed since replica:%d role:%s, qtype:%s hver:%" PRIu64, pVnode->vgId,
             taosMsg[pHead->msgType], pVnode->syncCfg.replica, syncRole[pVnode->role], qtypeStr[qtype], pHead->version);
      return TSDB_CODE_APP_NOT_READY;
    }

    // assign version
    pHead->version = pVnode->version + 1;
  } else {  // from wal or forward
    // for data from WAL or forward, version may be smaller
    if (pHead->version <= pVnode->version) return 0;
  }

  // forward to peers, even it is WAL/FWD, it shall be called to update version in sync
  int32_t syncCode = 0;
  // every message type other than SUBMIT is forwarded with the force flag set
  bool    force = (pWrite == NULL ? false : pWrite->walHead.msgType != TSDB_MSG_TYPE_SUBMIT);
  syncCode = syncForwardToPeer(pVnode->sync, pHead, pWrite, qtype, force);
  if (syncCode < 0) {
    // undo the version assignment so the head looks unprocessed again
    pHead->version = 0;
    return syncCode;
  }

  // write into WAL
  if (!(tsShortcutFlag & TSDB_SHORTCUT_NR_VNODE_WAL_WRITE)) {
    code = walWrite(pVnode->wal, pHead);
  }
  if (code < 0) {
    // NOTE(review): presumably takes back the count added for the peer forward above - confirm against the sync module
    if (syncCode > 0 && pWrite) atomic_sub_fetch_32(&pWrite->processedCount, 1);
    vError("vgId:%d, hver:%" PRIu64 " vver:%" PRIu64 " code:0x%x", pVnode->vgId, pHead->version, pVnode->version, code);
    pHead->version = 0;
    return code;
  }

  // the vnode version advances once the message is in the WAL, before it is applied locally
  pVnode->version = pHead->version;

  // write data locally
  code = (*vnodeProcessWriteMsgFp[pHead->msgType])(pVnode, pHead->cont, pRspRet);
  if (code < 0) {
    if (syncCode > 0 && pWrite) atomic_sub_fetch_32(&pWrite->processedCount, 1);
    return code;
  }

  return syncCode;
}

// Checks whether the vnode can accept a write right now.
// The checks run in a fixed order and the first failing one decides the result:
// write permission, pending removal due to balancing, missing tsdb handle, vnode full.
// Returns TSDB_CODE_SUCCESS when all of them pass.
static int32_t vnodeCheckWrite(SVnodeObj *pVnode) {
  int32_t result = TSDB_CODE_SUCCESS;

  if ((pVnode->accessState & TSDB_VN_WRITE_ACCCESS) == 0) {
    vDebug("vgId:%d, no write auth, refCount:%d pVnode:%p", pVnode->vgId, pVnode->refCount, pVnode);
    result = TSDB_CODE_VND_NO_WRITE_AUTH;
  } else if (pVnode->dbReplica != pVnode->syncCfg.replica &&
             pVnode->syncCfg.nodeInfo[pVnode->syncCfg.replica - 1].nodeId == dnodeGetDnodeId()) {
    // replica counts disagree and this dnode is the last entry of the sync config
    vDebug("vgId:%d, vnode is balancing and will be dropped, dbReplica:%d vgReplica:%d, refCount:%d pVnode:%p",
           pVnode->vgId, pVnode->dbReplica, pVnode->syncCfg.replica, pVnode->refCount, pVnode);
    result = TSDB_CODE_VND_IS_BALANCING;
  } else if (pVnode->tsdb == NULL) {
    // tsdb may be in reset state
    vDebug("vgId:%d, tsdb is null, refCount:%d pVnode:%p", pVnode->vgId, pVnode->refCount, pVnode);
    result = TSDB_CODE_APP_NOT_READY;
  } else if (pVnode->isFull) {
    vDebug("vgId:%d, vnode is full, refCount:%d", pVnode->vgId, pVnode->refCount);
    result = TSDB_CODE_VND_IS_FULL;
  }

  return result;
}

// Handler for TSDB_MSG_TYPE_SUBMIT: inserts the submitted rows into tsdb.
//
// pVnode - target vnode
// pCont  - message content passed on to tsdbInsertData
// pRet   - optional response slot; when given, a SShellSubmitRspMsg buffer is allocated for it
//
// Returns TSDB_CODE_SUCCESS, or terrno when tsdbInsertData fails.
static int32_t vnodeProcessSubmitMsg(SVnodeObj *pVnode, void *pCont, SRspRet *pRet) {
  vTrace("vgId:%d, submit msg is processed", pVnode->vgId);

  const bool isTopicMaster = (pVnode->dbType == TSDB_DB_TYPE_TOPIC) && (pVnode->role == TAOS_SYNC_ROLE_MASTER);
  if (isTopicMaster) {
    tpUpdateTs(pVnode->vgId, &pVnode->sequence, pCont);
  }

  // the insert result is written into the response buffer, if the caller wants one
  SShellSubmitRspMsg *pSubmitRsp = NULL;
  tsem_t **           ppSem = NULL;
  if (pRet != NULL) {
    pRet->len = sizeof(SShellSubmitRspMsg);
    pRet->rsp = rpcMallocCont(pRet->len);
    pSubmitRsp = pRet->rsp;
    ppSem = &pRet->psem;
  }

  int32_t result = TSDB_CODE_SUCCESS;
  if (tsdbInsertData(pVnode->tsdb, pCont, pSubmitRsp, ppSem) < 0) {
    result = terrno;
  } else if (pSubmitRsp != NULL) {
    atomic_fetch_add_64(&tsSubmitReqSucNum, 1);
  }

  // row statistics are only available through the response buffer
  if (pSubmitRsp == NULL) {
    return result;
  }

  atomic_fetch_add_64(&tsSubmitRowNum, ntohl(pSubmitRsp->numOfRows));
  atomic_fetch_add_64(&tsSubmitRowSucNum, ntohl(pSubmitRsp->affectedRows));
  return result;
}

// Hands the current WAL file size to tsdbCheckWal, unless isCommiting is set.
// Returns 0 when the check is skipped, otherwise whatever tsdbCheckWal returns.
static int32_t vnodeCheckWal(SVnodeObj *pVnode) {
  if (pVnode->isCommiting != 0) {
    return 0;
  }

  // the size is scaled down by 2^20 (presumably bytes to MB - confirm against walGetFSize)
  uint32_t scaledWalSize = (uint32_t)(walGetFSize(pVnode->wal) >> 20);
  return tsdbCheckWal(pVnode->tsdb, scaledWalSize);
}

// Handler for TSDB_MSG_TYPE_MD_CREATE_TABLE: builds a table config from the message and
// creates the table in tsdb. pRet is not used.
// Returns TSDB_CODE_SUCCESS, or terrno when the config cannot be built or the table cannot be created.
static int32_t vnodeProcessCreateTableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *pRet) {
  STableCfg *pTableCfg = tsdbCreateTableCfgFromMsg((SMDCreateTableMsg *)pCont);
  if (pTableCfg == NULL) {
    ASSERT(terrno != 0);
    return terrno;
  }

  int result = TSDB_CODE_SUCCESS;
  if (tsdbCreateTable(pVnode->tsdb, pTableCfg) < 0) {
    result = terrno;
    ASSERT(result != 0);
  }

  // lazy check: look at the WAL only when the low 15 bits of the message counter wrap to zero
  const bool timeToCheckWal = (((++pVnode->tblMsgVer) & 32767) == 0);
  if (timeToCheckWal) {
    vnodeCheckWal(pVnode);
  }

  tsdbClearTableCfg(pTableCfg);
  return result;
}

// Handler for TSDB_MSG_TYPE_MD_DROP_TABLE: drops the table identified by the message's uid/tid.
// pRet is not used. Returns TSDB_CODE_SUCCESS, or terrno when tsdbDropTable fails.
static int32_t vnodeProcessDropTableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *pRet) {
  SMDDropTableMsg *pDropMsg = pCont;

  vDebug("vgId:%d, table:%s, start to drop", pVnode->vgId, pDropMsg->tableFname);

  // ids are byte-swapped before use (message fields are presumably in network byte order)
  STableId tableId = {.uid = htobe64(pDropMsg->uid), .tid = htonl(pDropMsg->tid)};

  return (tsdbDropTable(pVnode->tsdb, tableId) < 0) ? terrno : TSDB_CODE_SUCCESS;
}

// Handler for TSDB_MSG_TYPE_MD_ALTER_TABLE. pCont and pRet are not used.
// The message is only logged here and always reported as TSDB_CODE_SUCCESS.
static int32_t vnodeProcessAlterTableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *pRet) {
  // TODO: disposed in tsdb (no table change is applied by this handler)
  vDebug("vgId:%d, alter table msg is received", pVnode->vgId);

  return TSDB_CODE_SUCCESS;
}

// Handler for TSDB_MSG_TYPE_MD_DROP_STABLE: drops the super table identified by the message's uid.
// pRet is not used. Returns TSDB_CODE_SUCCESS, or terrno when tsdbDropTable fails.
static int32_t vnodeProcessDropStableMsg(SVnodeObj *pVnode, void *pCont, SRspRet *pRet) {
  SDropSTableMsg *pDropMsg = pCont;

  vDebug("vgId:%d, stable:%s, start to drop", pVnode->vgId, pDropMsg->tableFname);

  // a super table is addressed by uid only; tid is fixed to -1
  STableId stableId = {.uid = htobe64(pDropMsg->uid), .tid = -1};

  int32_t result = TSDB_CODE_SUCCESS;
  if (tsdbDropTable(pVnode->tsdb, stableId) < 0) {
    result = terrno;
  }

  vDebug("vgId:%d, stable:%s, drop stable result:%s", pVnode->vgId, pDropMsg->tableFname, tstrerror(result));
  return result;
}

// Handler for TSDB_MSG_TYPE_UPDATE_TAG_VAL: passes the message to tsdbUpdateTableTagValue.
// pRet is not used. Returns TSDB_CODE_SUCCESS, or terrno when the update fails.
static int32_t vnodeProcessUpdateTagValMsg(SVnodeObj *pVnode, void *pCont, SRspRet *pRet) {
  SUpdateTableTagValMsg *pUpdateMsg = (SUpdateTableTagValMsg *)pCont;

  return (tsdbUpdateTableTagValue(pVnode->tsdb, pUpdateMsg) < 0) ? terrno : TSDB_CODE_SUCCESS;
}

// Wraps a WAL record into a newly allocated write-queue item.
//
// pVnode  - vnode the message belongs to; its refCount is incremented on success
// pHead   - WAL head followed by pHead->len bytes of content; both are copied into the item
// qtype   - queue type stored in the item (TAOS_QTYPE_*)
// pRpcMsg - originating RPC message, copied into the item when not NULL
//
// Returns the new item, or NULL with terrno set to TSDB_CODE_WAL_SIZE_LIMIT (length out of
// range) or TSDB_CODE_VND_OUT_OF_MEMORY (allocation failure).
static SVWriteMsg *vnodeBuildVWriteMsg(SVnodeObj *pVnode, SWalHead *pHead, int32_t qtype, SRpcMsg *pRpcMsg) {
  // a negative length would pass the upper-bound check below, shrink the allocation and then be
  // converted to a huge size_t by memcpy
  if (pHead->len < 0) {
    vError("vgId:%d, wal len:%d is invalid, hver:%" PRIu64, pVnode->vgId, pHead->len, pHead->version);
    terrno = TSDB_CODE_WAL_SIZE_LIMIT;
    return NULL;
  }

  if (pHead->len > TSDB_MAX_WAL_SIZE) {
    vError("vgId:%d, wal len:%d exceeds limit, hver:%" PRIu64, pVnode->vgId, pHead->len, pHead->version);
    terrno = TSDB_CODE_WAL_SIZE_LIMIT;
    return NULL;
  }

  // the item carries the fixed part plus the variable-length message content
  int32_t size = sizeof(SVWriteMsg) + pHead->len;
  SVWriteMsg *pWrite = taosAllocateQitem(size);
  if (pWrite == NULL) {
    terrno = TSDB_CODE_VND_OUT_OF_MEMORY;
    return NULL;
  }

  if (pRpcMsg != NULL) {
    pWrite->rpcMsg = *pRpcMsg;
  }

  memcpy(&pWrite->walHead, pHead, sizeof(SWalHead) + pHead->len);
  pWrite->pVnode = pVnode;
  pWrite->qtype = qtype;

  // the item holds a reference on the vnode until it is freed
  atomic_add_fetch_32(&pVnode->refCount, 1);

  return pWrite;
}

// Puts a prepared write message into the vnode's write queue.
//
// Ownership: on success the message belongs to the queue. On every failure path the message
// is freed and the vnode reference it holds is released, so the caller must not touch pWrite
// after a non-success return.
//
// Returns TSDB_CODE_SUCCESS when queued; the vnodeCheckWrite error for rejected RPC writes;
// TSDB_CODE_VND_NO_DISKSPACE; TSDB_CODE_APP_NOT_READY; or -1 when a forwarded message is
// dropped because the queue limits are exceeded.
static int32_t vnodeWriteToWQueueImp(SVWriteMsg *pWrite) {
  SVnodeObj *pVnode = pWrite->pVnode;

  // only client (RPC) writes are subject to the write-permission checks
  if (pWrite->qtype == TAOS_QTYPE_RPC) {
    int32_t code = vnodeCheckWrite(pVnode);
    if (code != TSDB_CODE_SUCCESS) {
      vError("vgId:%d, failed to write into vwqueue since %s", pVnode->vgId, tstrerror(code));
      taosFreeQitem(pWrite);
      vnodeRelease(pVnode);
      return code;
    }
  }

  if (tsAvailDataDirGB <= tsMinimalDataDirGB) {
    vError("vgId:%d, failed to write into vwqueue since no diskspace, avail:%fGB", pVnode->vgId, tsAvailDataDirGB);
    taosFreeQitem(pWrite);
    vnodeRelease(pVnode);
    return TSDB_CODE_VND_NO_DISKSPACE;
  }

  if (!vnodeInReadyOrUpdatingStatus(pVnode)) {
    vError("vgId:%d, failed to write into vwqueue, vstatus is %s, refCount:%d pVnode:%p", pVnode->vgId,
           vnodeStatus[pVnode->status], pVnode->refCount, pVnode);
    taosFreeQitem(pWrite);
    vnodeRelease(pVnode);
    return TSDB_CODE_APP_NOT_READY;
  }

  // account for the message first, then decide whether the queue is over its limits
  int32_t queued = atomic_add_fetch_32(&pVnode->queuedWMsg, 1);
  int64_t queuedSize = atomic_add_fetch_64(&pVnode->queuedWMsgSize, pWrite->walHead.len);

  if (queued > MAX_QUEUED_MSG_NUM || queuedSize > MAX_QUEUED_MSG_SIZE) {
    if (pWrite->qtype == TAOS_QTYPE_FWD) {
      // forwarded messages are dropped instead of delayed: undo the accounting, then free the
      // message and release its vnode reference like every other rejection path
      (void)atomic_sub_fetch_32(&pVnode->queuedWMsg, 1);
      (void)atomic_sub_fetch_64(&pVnode->queuedWMsgSize, pWrite->walHead.len);
      taosFreeQitem(pWrite);
      vnodeRelease(pVnode);

      return -1;
    }

    // other messages are throttled: 3ms plus 10ms per full multiple of the limit, capped at 100ms
    int32_t ms = (queued / MAX_QUEUED_MSG_NUM) * 10 + 3;
    if (ms > 100) ms = 100;
    vDebug("vgId:%d, too many msg:%d in vwqueue, flow control %dms", pVnode->vgId, queued, ms);
    taosMsleep(ms);
  }

  vTrace("vgId:%d, write into vwqueue, refCount:%d queued:%d size:%" PRId64, pVnode->vgId, pVnode->refCount,
         pVnode->queuedWMsg, pVnode->queuedWMsgSize);

  taosWriteQitem(pVnode->wqueue, pWrite->qtype, pWrite);
  return TSDB_CODE_SUCCESS;
}

// Entry point for queueing a write: wraps the WAL record into a queue item and either
// enqueues it directly or leaves it to the flow-control path.
//
// vparam - the vnode (SVnodeObj *)
// wparam - WAL head of the message (SWalHead *)
// qtype  - queue type (TAOS_QTYPE_*)
// rparam - originating RPC message (SRpcMsg *), may be NULL
//
// Returns TSDB_CODE_APP_NOT_READY for RPC writes to a vnode that is not a ready master,
// terrno when the item cannot be built, 0 when flow control took over the item, and
// otherwise the result of vnodeWriteToWQueueImp.
int32_t vnodeWriteToWQueue(void *vparam, void *wparam, int32_t qtype, void *rparam) {
  SVnodeObj *pVnode = vparam;

  const bool fromClient = (qtype == TAOS_QTYPE_RPC);
  // it may be in deleting or closing state, or not the master
  if (fromClient && (!vnodeInReadyStatus(pVnode) || pVnode->role != TAOS_SYNC_ROLE_MASTER)) {
    return TSDB_CODE_APP_NOT_READY;
  }

  SVWriteMsg *pWrite = vnodeBuildVWriteMsg(vparam, wparam, qtype, rparam);
  if (pWrite == NULL) {
    assert(terrno != 0);
    return terrno;
  }

  // a non-zero result means the item was handed to the flow-control timer, which will
  // enqueue or answer it later; the caller is told the request was accepted
  if (vnodePerformFlowCtrl(pWrite) != 0) {
    return 0;
  }

  return vnodeWriteToWQueueImp(pWrite);
}

// Disposes of a write message that has left the write queue: updates the vnode's queue
// counters (when a vnode is given), frees the item and releases the vnode.
void vnodeFreeFromWQueue(void *vparam, SVWriteMsg *pWrite) {
  SVnodeObj *pVnode = vparam;

  if (pVnode != NULL) {
    // take the message out of the queue accounting before it is freed
    const int32_t remainingMsgs = atomic_sub_fetch_32(&pVnode->queuedWMsg, 1);
    const int64_t remainingSize = atomic_sub_fetch_64(&pVnode->queuedWMsgSize, pWrite->walHead.len);

    vTrace("vgId:%d, msg:%p, app:%p, free from vwqueue, queued:%d size:%" PRId64, pVnode->vgId, pWrite,
           pWrite->rpcMsg.ahandle, remainingMsgs, remainingSize);
  }

  taosFreeQitem(pWrite);
  vnodeRelease(pVnode);
}

// Timer callback scheduled by vnodePerformFlowCtrl for a write message that was held back.
// Each invocation counts as one retry (tracked in pWrite->processedCount):
//  - after 100 retries the message is dropped and an error response is sent;
//  - otherwise flow control is evaluated again; if it lets the message through, the
//    message is written into the write queue (a non-zero result means the message is
//    still held back and the callback will run again).
//
// param - the held-back write message (SVWriteMsg *)
// tmrId - timer id, not used
static void vnodeFlowCtrlMsgToWQueue(void *param, void *tmrId) {
  SVWriteMsg *pWrite = param;
  SVnodeObj * pVnode = pWrite->pVnode;
  // error reported on give-up: IS_FLOWCTRL when no flow-control level is set, IS_SYNCING otherwise
  int32_t     code = TSDB_CODE_VND_IS_SYNCING;

  if (pVnode->flowctrlLevel <= 0) code = TSDB_CODE_VND_IS_FLOWCTRL;

  pWrite->processedCount++;
  if (pWrite->processedCount >= 100) {
    vError("vgId:%d, msg:%p, failed to process since %s, retry:%d", pVnode->vgId, pWrite, tstrerror(code),
           pWrite->processedCount);
    // keep the rpc handle: it is needed for the response after the message is freed
    void *handle = pWrite->rpcMsg.handle;
    taosFreeQitem(pWrite);
    vnodeRelease(pVnode);
    SRpcMsg rpcRsp = {.handle = handle, .code = code};
    rpcSendResponse(&rpcRsp);
  } else {
    code = vnodePerformFlowCtrl(pWrite);
    if (code == 0) {
      vDebug("vgId:%d, msg:%p, write into vwqueue after flowctrl, retry:%d", pVnode->vgId, pWrite,
             pWrite->processedCount);
      // the retry counter is cleared before the message enters the write queue
      pWrite->processedCount = 0;
      // vnodeWriteToWQueueImp frees pWrite when it fails, so save the handle first
      void *handle = pWrite->rpcMsg.handle;
      code = vnodeWriteToWQueueImp(pWrite);
      if (code != TSDB_CODE_SUCCESS) {
        SRpcMsg rpcRsp = {.handle = handle, .code = code};
        rpcSendResponse(&rpcRsp);
      }
    }
  }
}

// Decides whether an RPC write has to be throttled before it enters the write queue.
//
// Nothing is done (return 0) for non-RPC messages, or while the queue is below both limits
// and the vnode's flowctrlLevel is not positive. Otherwise:
//  - tsEnableFlowCtrl == 0: the calling thread sleeps 2^(flowctrlLevel + 2) ms, capped at
//    100 ms, and 0 is returned;
//  - else: a 100 ms timer is armed to run vnodeFlowCtrlMsgToWQueue with this message and
//    TSDB_CODE_VND_ACTION_IN_PROGRESS is returned.
static int32_t vnodePerformFlowCtrl(SVWriteMsg *pWrite) {
  if (pWrite->qtype != TAOS_QTYPE_RPC) {
    return 0;
  }

  SVnodeObj *pVnode = pWrite->pVnode;
  const bool queueHasRoom =
      pVnode->queuedWMsg < MAX_QUEUED_MSG_NUM && pVnode->queuedWMsgSize < MAX_QUEUED_MSG_SIZE;
  if (queueHasRoom && pVnode->flowctrlLevel <= 0) {
    return 0;
  }

  if (tsEnableFlowCtrl != 0) {
    // defer the message; the timer callback retries it later
    void *unUsedTimerId = NULL;
    taosTmrReset(vnodeFlowCtrlMsgToWQueue, 100, pWrite, tsDnodeTmr, &unUsedTimerId);

    vTrace("vgId:%d, msg:%p, app:%p, perform flowctrl, retry:%d", pVnode->vgId, pWrite, pWrite->rpcMsg.ahandle,
           pWrite->processedCount);
    return TSDB_CODE_VND_ACTION_IN_PROGRESS;
  }

  // flow control by blocking the caller for a short, level-dependent time
  int32_t delayMs = (int32_t)pow(2, pVnode->flowctrlLevel + 2);
  if (delayMs > 100) {
    delayMs = 100;
  }
  vTrace("vgId:%d, msg:%p, app:%p, perform flowctrl for %d ms", pVnode->vgId, pWrite, pWrite->rpcMsg.ahandle, delayMs);
  taosMsleep(delayMs);
  return 0;
}

// Blocks until the vnode's queued write message counter drops to zero, polling every 10 ms.
// If any waiting was needed, an additional 900 ms sleep follows before returning.
void vnodeWaitWriteCompleted(SVnodeObj *pVnode) {
  bool hadToWait = false;

  while (pVnode->queuedWMsg > 0) {
    vTrace("vgId:%d, queued wmsg num:%d", pVnode->vgId, pVnode->queuedWMsg);
    taosMsleep(10);
    hadToWait = true;
  }

  if (hadToWait) {
    taosMsleep(900);
  }
}

// Returns the submit statistics accumulated since the previous call and resets them.
// Each counter is fetched and zeroed with its own atomic exchange, so the three values are
// not one consistent snapshot with respect to each other.
SVnodeStatisInfo vnodeGetStatisInfo(void) {
  SVnodeStatisInfo info = {0};
  info.submitReqSucNum = atomic_exchange_64(&tsSubmitReqSucNum, 0);
  info.submitRowNum = atomic_exchange_64(&tsSubmitRowNum, 0);
  info.submitRowSucNum = atomic_exchange_64(&tsSubmitRowSucNum, 0);

  return info;
}
