Commit 3a4a91a0 authored by: S slguan

remove unused files

Parent 021d4b42
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(TDengine)
IF ((TD_LINUX_64) OR (TD_LINUX_32 AND TD_ARM))
INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/inc)
INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/dnode/inc)
INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/mnode/detail/inc)
INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/vnode/detail/inc)
INCLUDE_DIRECTORIES(${TD_COMMUNITY_DIR}/src/client/inc)
INCLUDE_DIRECTORIES(${TD_OS_DIR}/inc)
INCLUDE_DIRECTORIES(inc)
AUX_SOURCE_DIRECTORY(./src SRC)
LIST(REMOVE_ITEM SRC ./src/vnodeFileUtil.c)
LIST(REMOVE_ITEM SRC ./src/taosGrant.c)
ADD_LIBRARY(vnode ${SRC})
IF (TD_CLUSTER)
TARGET_LINK_LIBRARIES(vnode vcluster)
ELSEIF (TD_LITE)
TARGET_LINK_LIBRARIES(vnode vlite)
ENDIF ()
ENDIF ()
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODE_H
#define TDENGINE_VNODE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "os.h"
#include "tglobalcfg.h"
#include "tidpool.h"
#include "tlog.h"
#include "tmempool.h"
#include "trpc.h"
#include "tsclient.h"
#include "taosdef.h"
#include "tsocket.h"
#include "ttime.h"
#include "ttimer.h"
#include "tutil.h"
#include "vnodeCache.h"
#include "vnodeFile.h"
#include "vnodePeer.h"
#include "vnodeShell.h"
#define TSDB_FILE_HEADER_LEN 512
#define TSDB_FILE_HEADER_VERSION_SIZE 32
#define TSDB_CACHE_POS_BITS 13
#define TSDB_CACHE_POS_MASK 0x1FFF
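/*
 * Illustrative note (an assumption, not taken from the original source): the two
 * macros above suggest that a cache location can be packed into a single integer,
 * with the block slot in the high bits and the in-block position in the low 13 bits:
 *
 *   int packed  = (slot << TSDB_CACHE_POS_BITS) | (pos & TSDB_CACHE_POS_MASK);
 *   int slotOut = packed >> TSDB_CACHE_POS_BITS;
 *   int posOut  = packed & TSDB_CACHE_POS_MASK;
 */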
#define TSDB_ACTION_INSERT 0
#define TSDB_ACTION_IMPORT 1
#define TSDB_ACTION_DELETE 2
#define TSDB_ACTION_UPDATE 3
#define TSDB_ACTION_MAX 4
enum _data_source {
TSDB_DATA_SOURCE_METER,
TSDB_DATA_SOURCE_VNODE,
TSDB_DATA_SOURCE_SHELL,
TSDB_DATA_SOURCE_QUEUE,
TSDB_DATA_SOURCE_LOG,
};
enum _sync_cmd {
TSDB_SYNC_CMD_FILE,
TSDB_SYNC_CMD_CACHE,
TSDB_SYNC_CMD_CREATE,
TSDB_SYNC_CMD_REMOVE,
};
typedef struct {
int64_t offset : 48;
int64_t length : 16;
} SMeterObjHeader;
typedef struct {
int64_t len;
char data[];
} SData;
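/*
 * Usage sketch (illustrative only): SData uses a C99 flexible array member, so a
 * buffer that holds n bytes of column data is typically allocated in one piece:
 *
 *   SData *pData = (SData *)malloc(sizeof(SData) + n);
 *   pData->len = n;
 *   memcpy(pData->data, src, n);   // src and n are hypothetical names
 */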
#pragma pack(push, 8)
typedef struct {
SVnodeStatisticInfo vnodeStatistic;
int vnode;
SVnodeCfg cfg;
// SDiskDesc tierDisk[TSDB_MAX_TIER];
SVPeerDesc vpeers[TSDB_VNODES_SUPPORT];
SVnodePeer * peerInfo[TSDB_VNODES_SUPPORT];
char selfIndex;
char vnodeStatus;
char accessState; // Vnode access state, Readable/Writable
char syncStatus;
char commitInProcess;
pthread_t commitThread;
TSKEY firstKey; // minimum key uncommitted, it may be smaller than
// commitFirstKey
TSKEY commitFirstKey; // minimum key for a commit file, it shall be
// xxxx00000, calculated from fileId
TSKEY commitLastKey; // maximum key for a commit file, it shall be xxxx99999,
// calculated from fileId
int commitFileId;
TSKEY lastCreate;
TSKEY lastRemove;
TSKEY lastKey; // last key for the whole vnode, updated by every insert
// operation
uint64_t version;
int streamRole;
int numOfStreams;
void *streamTimer;
TSKEY lastKeyOnFile; // maximum key on the last file, it shall be xxxx99999
int fileId;
int badFileId;
int numOfFiles;
int maxFiles;
int maxFile1;
int maxFile2;
int nfd; // temp head file FD
int hfd; // head file FD
int lfd; // last file FD
int tfd; // temp last file FD
int dfd; // data file FD
int64_t dfSize;
int64_t lfSize;
uint64_t * fmagic; // hold magic number for each file
char cfn[TSDB_FILENAME_LEN];
char nfn[TSDB_FILENAME_LEN];
char lfn[TSDB_FILENAME_LEN]; // last file name
char tfn[TSDB_FILENAME_LEN]; // temp last file name
pthread_mutex_t vmutex;
int logFd;
char * pMem;
char * pWrite;
pthread_mutex_t logMutex;
char logFn[TSDB_FILENAME_LEN];
char logOFn[TSDB_FILENAME_LEN];
int64_t mappingSize;
int64_t mappingThreshold;
void * commitTimer;
void ** meterList;
void * pCachePool;
void * pQueue;
pthread_t thread;
int peersOnline;
int shellConns;
int meterConns;
struct _qinfo *pQInfoList;
TAOS * dbConn;
SMeterObjHeader *meterIndex;
} SVnodeObj;
#pragma pack(pop)
typedef struct SColumn {
short colId;
short bytes;
char type;
} SColumn;
typedef struct _meter_obj {
uint64_t uid;
char meterId[TSDB_TABLE_ID_LEN];
int sid;
short vnode;
short numOfColumns;
short bytesPerPoint;
short maxBytes;
int32_t pointsPerBlock;
int32_t pointsPerFileBlock;
int freePoints;
TSKEY lastKey; // updated by insert operation
TSKEY lastKeyOnFile; // last key on file, updated by commit action
TSKEY timeStamp; // delete or added time
uint64_t commitCount;
int32_t sversion;
short sqlLen;
char searchAlgorithm : 4;
char compAlgorithm : 4;
char status; // 0: ok, 1: stop stream computing
char reserved[16];
int state;
int numOfQueries;
char * pSql;
void * pStream;
void * pCache;
SColumn *schema;
} SMeterObj;
typedef struct {
char type;
char pversion; // protocol version
char action; // insert, import, delete, update
int32_t sversion; // only for insert
int32_t sid;
int32_t len;
uint64_t lastVersion; // latest version
char cont[];
} SVMsgHeader;
struct tSQLBinaryExpr;
typedef struct SColumnInfoEx {
SColumnInfo data;
int16_t colIdx;
int16_t colIdxInBuf;
/*
* 0: denotes whether it is required in the first round of data block scan
* 1: denotes whether it is required in the secondary scan
*/
int16_t req[2];
} SColumnInfoEx;
struct SColumnFilterElem;
typedef bool (*__filter_func_t)(struct SColumnFilterElem *pFilter, char *val1, char *val2);
typedef struct SColumnFilterElem {
int16_t bytes; // column length
__filter_func_t fp;
SColumnFilterInfo filterInfo;
} SColumnFilterElem;
typedef struct SSingleColumnFilterInfo {
SColumnInfoEx info;
int32_t numOfFilters;
SColumnFilterElem *pFilters;
char * pData;
} SSingleColumnFilterInfo;
typedef struct SQuery {
short numOfCols;
SOrderVal order;
char keyIsMet; // if key is met, it will be set
char over;
int fileId; // only for query in file
int hfd; // only for query in file, head file handle
int dfd; // only for query in file, data file handle
int lfd; // only for query in file, last file handle
SCompBlock *pBlock; // only for query in file
SField ** pFields;
int numOfBlocks; // only for query in file
int blockBufferSize; // length of pBlock buffer
int currentSlot;
int firstSlot;
/*
* the two parameters are used to handle data missing caused by an import operation,
* i.e., when the commit slot is the first slot and commitPoint != 0
*/
int32_t commitSlot; // which slot is committed,
int32_t commitPoint; // starting point for next commit
int slot;
int pos;
TSKEY key;
int compBlockLen; // only for import
int64_t blockId;
TSKEY skey;
TSKEY ekey;
int64_t intervalTime;
int64_t slidingTime; // sliding time for sliding window query
char intervalTimeUnit; // interval data type, used for daytime revise
int8_t precision;
int16_t numOfOutputCols;
int16_t interpoType;
int16_t checkBufferInLoop; // check if the buffer is full during scan each block
SLimitVal limit;
int32_t rowSize;
SSqlGroupbyExpr * pGroupbyExpr;
SSqlFunctionExpr * pSelectExpr;
SColumnInfoEx * colList;
int32_t numOfFilterCols;
SSingleColumnFilterInfo *pFilterInfo;
int64_t * defaultVal;
TSKEY lastKey;
// buffer info
int64_t pointsRead; // the number of points returned
int64_t pointsToRead; // maximum number of points to read
int64_t pointsOffset; // the number of points offset to save read data
SData **sdata;
SData * tsData; // timestamp column/primary key column
} SQuery;
typedef struct {
char spi;
char encrypt;
char secret[TSDB_KEY_LEN];
char cipheringKey[TSDB_KEY_LEN];
} SConnSec;
typedef struct {
char * buffer;
char * offset;
int trans;
int bufferSize;
pthread_mutex_t qmutex;
} STranQueue;
// internal globals
extern int tsMeterSizeOnFile;
extern void * tsQueryQhandle;
extern int tsVnodePeers;
extern int tsMaxVnode;
extern int tsMaxQueues;
extern int tsOpenVnodes;
extern SVnodeObj *vnodeList;
extern void * vnodeTmrCtrl;
// read API
extern int (*vnodeSearchKeyFunc[])(char *pValue, int num, TSKEY key, int order);
void *vnodeQueryOnSingleTable(SMeterObj **pMeterObj, SSqlGroupbyExpr *pGroupbyExpr, SSqlFunctionExpr *sqlExprs,
SQueryMeterMsg *pQueryMsg, int *code);
void *vnodeQueryOnMultiMeters(SMeterObj **pMeterObj, SSqlGroupbyExpr *pGroupbyExpr, SSqlFunctionExpr *pSqlExprs,
SQueryMeterMsg *pQueryMsg, int *code);
// assistant/tool functions
SSqlGroupbyExpr *vnodeCreateGroupbyExpr(SQueryMeterMsg *pQuery, int32_t *code);
SSqlFunctionExpr *vnodeCreateSqlFunctionExpr(SQueryMeterMsg *pQuery, int32_t *code);
bool vnodeValidateExprColumnInfo(SQueryMeterMsg *pQueryMsg, SSqlFuncExprMsg *pExprMsg);
bool vnodeIsValidVnodeCfg(SVnodeCfg *pCfg);
int32_t vnodeGetResultSize(void *handle, int32_t *numOfRows);
int32_t vnodeCopyQueryResultToMsg(void *handle, char *data, int32_t numOfRows);
int64_t vnodeGetOffsetVal(void *thandle);
bool vnodeHasRemainResults(void *handle);
int vnodeRetrieveQueryResult(void *handle, int *pNum, char *argv[]);
int vnodeSaveQueryResult(void *handle, char *data, int32_t* size);
int vnodeRetrieveQueryInfo(void *handle, int *numOfRows, int *rowSize, int16_t *timePrec);
void vnodeFreeQInfo(void *, bool);
void vnodeFreeQInfoInQueue(void *param);
bool vnodeIsQInfoValid(void *param);
void vnodeDecRefCount(void *param);
void vnodeAddRefCount(void *param);
int32_t vnodeConvertQueryMeterMsg(SQueryMeterMsg *pQuery);
void vnodeQueryData(SSchedMsg *pMsg);
// meter API
int vnodeOpenMetersVnode(int vnode);
void vnodeCloseMetersVnode(int vnode);
int vnodeCreateMeterObj(SMeterObj *pNew, SConnSec *pSec);
int vnodeRemoveMeterObj(int vnode, int sid);
int vnodeInsertPoints(SMeterObj *pObj, char *cont, int contLen, char source, void *, int sversion, int *numOfPoints, TSKEY now);
int vnodeImportPoints(SMeterObj *pObj, char *cont, int contLen, char source, void *, int sversion, int *numOfPoints, TSKEY now);
int vnodeInsertBufferedPoints(int vnode);
int vnodeSaveAllMeterObjToFile(int vnode);
int vnodeSaveMeterObjToFile(SMeterObj *pObj);
int vnodeSaveVnodeCfg(int vnode, SVnodeCfg *pCfg, SVPeerDesc *pDesc);
int vnodeSaveVnodeInfo(int vnode);
// cache API
void *vnodeOpenCachePool(int vnode);
void vnodeCloseCachePool(int vnode);
void *vnodeAllocateCacheInfo(SMeterObj *pObj);
void vnodeFreeCacheInfo(SMeterObj *pObj);
void vnodeSetCommitQuery(SMeterObj *pObj, SQuery *pQuery);
int vnodeInsertPointToCache(SMeterObj *pObj, char *pData);
int vnodeQueryFromCache(SMeterObj *pObj, SQuery *pQuery);
uint64_t vnodeGetPoolCount(SVnodeObj *pVnode);
void vnodeUpdateCommitInfo(SMeterObj *pObj, int slot, int pos, uint64_t count);
void vnodeCommitOver(SVnodeObj *pVnode);
TSKEY vnodeGetFirstKey(int vnode);
int vnodeSyncRetrieveCache(int vnode, int fd);
int vnodeSyncRestoreCache(int vnode, int fd);
pthread_t vnodeCreateCommitThread(SVnodeObj *pVnode);
void vnodeCancelCommit(SVnodeObj *pVnode);
void vnodeCloseStream(SVnodeObj *pVnode);
void vnodeProcessCommitTimer(void *param, void *tmrId);
void vnodeSearchPointInCache(SMeterObj *pObj, SQuery *pQuery);
int vnodeAllocateCacheBlock(SMeterObj *pObj);
int vnodeFreeCacheBlock(SCacheBlock *pCacheBlock);
int vnodeIsCacheCommitted(SMeterObj *pObj);
// file API
int vnodeInitFile(int vnode);
int vnodeQueryFromFile(SMeterObj *pObj, SQuery *pQuery);
void *vnodeCommitToFile(void *param);
void *vnodeCommitMultiToFile(SVnodeObj *pVnode, int ssid, int esid);
int vnodeSyncRetrieveFile(int vnode, int fd, uint32_t fileId, uint64_t *fmagic);
int vnodeSyncRestoreFile(int vnode, int sfd);
int vnodeWriteBlockToFile(SMeterObj *pObj, SCompBlock *pBlock, SData *data[], SData *cdata[], int pointsRead);
int vnodeSearchPointInFile(SMeterObj *pObj, SQuery *pQuery);
int vnodeReadCompBlockToMem(SMeterObj *pObj, SQuery *pQuery, SData *sdata[]);
int vnodeOpenCommitFiles(SVnodeObj *pVnode, int noTempLast);
void vnodeCloseCommitFiles(SVnodeObj *pVnode);
int vnodeReadLastBlockToMem(SMeterObj *pObj, SCompBlock *pBlock, SData *sdata[]);
// vnode API
void vnodeUpdateStreamRole(SVnodeObj *pVnode);
int vnodeInitPeer(int numOfThreads);
void vnodeCleanUpPeer();
int vnodeOpenPeerVnode(int vnode);
void vnodeClosePeerVnode(int vnode);
void *vnodeGetMeterPeerConnection(SMeterObj *pObj, int index);
int vnodeForwardToPeer(SMeterObj *pObj, char *msg, int msgLen, char action, int sversion);
void vnodeCloseAllSyncFds(int vnode);
void vnodeConfigVPeers(int vnode, int numOfPeers, SVPeerDesc peerDesc[]);
void vnodeStartSyncProcess(SVnodeObj *pVnode);
void vnodeCancelSync(int vnode);
void vnodeListPeerStatus(char *buffer);
void vnodeCheckOwnStatus(SVnodeObj *pVnode);
int vnodeSaveMeterObjToFile(SMeterObj *pObj);
int vnodeRecoverFromPeer(SVnodeObj *pVnode, int fileId);
// vnodes API
int vnodeInitVnodes();
int vnodeInitStore();
void vnodeCleanUpVnodes();
int vnodeRemoveVnode(int vnode);
int vnodeCreateVnode(int vnode, SVnodeCfg *pCfg, SVPeerDesc *pDesc);
void vnodeOpenStreams(void *param, void *tmrId);
void vnodeCreateStream(SMeterObj *pObj);
void vnodeRemoveStream(SMeterObj *pObj);
// shell API
int vnodeInitShell();
void vnodeCleanUpShell();
int vnodeOpenShellVnode(int vnode);
void vnodeCloseShellVnode(int vnode);
// meter mgmt
int vnodeInitMeterMgmt();
void vnodeCleanUpMeterMgmt();
int vnodeOpenMeterMgmtVnode(int vnode);
int vnodeOpenMeterMgmtStoreVnode(int vnode);
void vnodeCloseMeterMgmtVnode(int vnode);
int vnodeCreateMeterMgmt(SMeterObj *pObj, SConnSec *pSec);
void vnodeRemoveMeterMgmt(SMeterObj *pObj);
SConnSec *vnodeGetMeterSec(int vnode, int sid);
int vnodeCreateMeterObjFile(int vnode);
// mgmt
void vnodeCleanUpMgmt();
int vnodeRetrieveMissedCreateMsg(int vnode, int fd, uint64_t stime);
int vnodeRestoreMissedCreateMsg(int vnode, int fd);
int vnodeRetrieveMissedRemoveMsg(int vid, int fd, uint64_t stime);
int vnodeRestoreMissedRemoveMsg(int vnode, int fd);
int vnodeProcessBufferedCreateMsgs(int vnode);
void vnodeSendVpeerCfgMsg(int vnode);
int vnodeSendMeterCfgMsg(int vnode, int sid);
int vnodeMgmtConns();
void vnodeRemoveFile(int vnode, int fileId);
// commit
int vnodeInitCommit(int vnode);
void vnodeCleanUpCommit(int vnode);
int vnodeRenewCommitLog(int vnode);
void vnodeRemoveCommitLog(int vnode);
int vnodeWriteToCommitLog(SMeterObj *pObj, char action, char *cont, int contLen, int sversion);
extern int (*vnodeProcessAction[])(SMeterObj *, char *, int, char, void *, int, int *, TSKEY);
extern int (*pCompFunc[])(const char *const input, int inputSize, const int elements, char *const output,
int outputSize, char algorithm, char *const buffer, int bufferSize);
extern int (*pDecompFunc[])(const char *const input, int compressedSize, const int elements, char *const output,
int outputSize, char algorithm, char *const buffer, int bufferSize);
// global variable and APIs provided by mgmt
extern char mgmtStatus;
extern char tsMgmtDirectory[];
extern const int16_t vnodeFileVersion;
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODE_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODECACHE_H
#define TDENGINE_VNODECACHE_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
short notFree;
short numOfPoints;
int slot;
int index;
int64_t blockId;
struct _meter_obj *pMeterObj;
char * offset[];
} SCacheBlock;
typedef struct {
int64_t blocks;
int maxBlocks;
int numOfBlocks;
int unCommittedBlocks;
int32_t currentSlot;
int32_t commitSlot; // which slot is committed
int32_t commitPoint; // starting point for next commit
SCacheBlock **cacheBlocks; // cache block list, circular list
} SCacheInfo;
typedef struct {
int vnode;
char ** pMem;
int64_t freeSlot;
pthread_mutex_t vmutex;
uint64_t count; // kind of transaction ID
int64_t notFreeSlots;
int64_t threshold;
char commitInProcess;
int cacheBlockSize;
int cacheNumOfBlocks;
} SCachePool;
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODECACHE_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODEDATAFILTERFUNC_H
#define TDENGINE_VNODEDATAFILTERFUNC_H
#ifdef __cplusplus
extern "C" {
#endif
#include "vnode.h"
__filter_func_t *vnodeGetRangeFilterFuncArray(int32_t type);
__filter_func_t *vnodeGetValueFilterFuncArray(int32_t type);
bool vnodeSupportPrefilter(int32_t type);
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODEDATAFILTERFUNC_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODEFILE_H
#define TDENGINE_VNODEFILE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "tchecksum.h"
#define TSDB_VNODE_DELIMITER 0xF00AFA0F
typedef struct { int64_t compInfoOffset; } SCompHeader;
typedef struct {
short colId;
short bytes;
int32_t numOfNullPoints;
int32_t type : 8;
int32_t offset : 24;
int32_t len; // data length
int64_t sum;
int64_t max;
int64_t min;
int16_t maxIndex;
int16_t minIndex;
char reserved[20];
} SField;
typedef struct {
int64_t last : 1;
int64_t offset : 63;
int32_t algorithm : 8; // compression algorithm can be changed
int32_t numOfPoints : 24; // how many points have been written into this block
int32_t sversion;
int32_t len; // total length of this data block
uint16_t numOfCols;
char reserved[16];
TSKEY keyFirst; // time stamp for the first point
TSKEY keyLast; // time stamp for the last point
} SCompBlock;
typedef struct {
SCompBlock *compBlock;
SField * fields;
} SCompBlockFields;
typedef struct {
uint64_t uid;
int64_t last : 1;
int64_t numOfBlocks : 62;
uint32_t delimiter; // delimiter for recovery
TSCKSUM checksum;
SCompBlock compBlocks[]; // comp block list
} SCompInfo;
typedef struct {
int64_t tempHeadOffset;
int64_t compInfoOffset;
int64_t oldCompBlockOffset;
int64_t oldNumOfBlocks;
int64_t newNumOfBlocks;
int64_t finalNumOfBlocks;
int64_t oldCompBlockLen;
int64_t newCompBlockLen;
int64_t finalCompBlockLen;
int64_t committedPoints;
int commitSlot;
int32_t last : 1;
int32_t changed : 1;
int32_t commitPos : 30;
int64_t commitCount;
SCompBlock lastBlock;
} SMeterInfo;
typedef struct { int64_t totalStorage; } SVnodeHeadInfo;
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODEFILE_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODEQUERYIMPL_H
#define TDENGINE_VNODEQUERYIMPL_H
#ifdef __cplusplus
extern "C" {
#endif
#include "os.h"
#include "hash.h"
#include "hashfunc.h"
#define GET_QINFO_ADDR(x) ((char*)(x)-offsetof(SQInfo, query))
#define Q_STATUS_EQUAL(p, s) (((p) & (s)) != 0)
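/*
 * Illustrative note: GET_QINFO_ADDR recovers the enclosing SQInfo from a pointer to
 * its embedded `query` member, e.g. (a sketch, assuming pQuery points into an SQInfo):
 *
 *   SQInfo *pQInfo = (SQInfo *)GET_QINFO_ADDR(pQuery);
 */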
/*
* the output buffer page size is set to 16KB
* The page size should be sufficient for at least one output result or intermediate result.
* Some intermediate results may be extremely large, such as top/bottom(100) query.
*/
#define DEFAULT_INTERN_BUF_SIZE 16384L
#define INIT_ALLOCATE_DISK_PAGES 60L
#define DEFAULT_DATA_FILE_MAPPING_PAGES 2L
#define DEFAULT_DATA_FILE_MMAP_WINDOW_SIZE (DEFAULT_DATA_FILE_MAPPING_PAGES * DEFAULT_INTERN_BUF_SIZE)
#define IO_ENGINE_MMAP 0
#define IO_ENGINE_SYNC 1
#define DEFAULT_IO_ENGINE IO_ENGINE_SYNC
/**
* check if the primary timestamp column is loaded by default; otherwise, the
* program will be forced to load the primary column explicitly.
*/
#define PRIMARY_TSCOL_LOADED(query) ((query)->colList[0].data.colId == PRIMARYKEY_TIMESTAMP_COL_INDEX)
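/*
 * Usage sketch (illustrative only, not taken from the original source):
 *
 *   if (!PRIMARY_TSCOL_LOADED(pQuery)) {
 *     // the primary timestamp column must be loaded explicitly before use
 *   }
 */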
typedef enum {
/*
* the query routine will be invoked again if this status is set;
* used to transfer from QUERY_RESBUF_FULL
*/
QUERY_NOT_COMPLETED = 0x1u,
/*
* the output buffer is full, so the next query round will be employed;
* in this case, we need to set the appropriate start scan point for
* the next round.
*
* this status only exists for group-by clauses and
* diff/add/division/multiply queries.
*/
QUERY_RESBUF_FULL = 0x2u,
/*
* the query is over
* 1. this status is used for single-row result queries, e.g.,
* count/sum/first/last/avg, etc.
* 2. when the query range on the timestamp is satisfied, it is also
* marked as QUERY_COMPLETED
*/
QUERY_COMPLETED = 0x4u,
/*
* all data has been scanned, so the current search is stopped;
* finally, this status is transferred to QUERY_COMPLETED
*/
QUERY_NO_DATA_TO_CHECK = 0x8u,
} vnodeQueryStatus;
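/*
 * Minimal usage sketch (an illustration, not part of the original header): the
 * status values are bit flags, so they are typically combined and tested with the
 * Q_STATUS_EQUAL macro defined above:
 *
 *   setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
 *   if (Q_STATUS_EQUAL(pQuery->over, QUERY_COMPLETED | QUERY_NO_DATA_TO_CHECK)) {
 *     // the scan can stop here
 *   }
 */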
typedef struct SPointInterpoSupporter {
int32_t numOfCols;
char** pPrevPoint;
char** pNextPoint;
} SPointInterpoSupporter;
typedef struct SBlockInfo {
TSKEY keyFirst;
TSKEY keyLast;
int32_t numOfCols;
int32_t size;
} SBlockInfo;
typedef struct SMeterDataBlockInfoEx {
SCompBlockFields pBlock;
SMeterDataInfo* pMeterDataInfo;
int32_t blockIndex;
int32_t groupIdx; /* the number of groups is less than the total number of meters */
} SMeterDataBlockInfoEx;
typedef enum {
DISK_DATA_LOAD_FAILED = -0x1,
DISK_DATA_LOADED = 0x0,
DISK_DATA_DISCARDED = 0x01,
} vnodeDiskLoadStatus;
#define IS_MASTER_SCAN(runtime) (((runtime)->scanFlag & 1u) == MASTER_SCAN)
#define IS_SUPPLEMENT_SCAN(runtime) ((runtime)->scanFlag == SUPPLEMENTARY_SCAN)
#define SET_SUPPLEMENT_SCAN_FLAG(runtime) ((runtime)->scanFlag = SUPPLEMENTARY_SCAN)
#define SET_MASTER_SCAN_FLAG(runtime) ((runtime)->scanFlag = MASTER_SCAN)
typedef int (*__block_search_fn_t)(char* data, int num, int64_t key, int order);
static FORCE_INLINE SMeterObj* getMeterObj(void* hashHandle, int32_t sid) {
return *(SMeterObj**)taosHashGet(hashHandle, (const char*)&sid, sizeof(sid));
}
bool isQueryKilled(SQuery* pQuery);
bool isFixedOutputQuery(SQuery* pQuery);
bool isPointInterpoQuery(SQuery* pQuery);
bool isSumAvgRateQuery(SQuery *pQuery);
bool isTopBottomQuery(SQuery* pQuery);
bool isFirstLastRowQuery(SQuery* pQuery);
bool isTSCompQuery(SQuery* pQuery);
bool notHasQueryTimeRange(SQuery* pQuery);
bool needSupplementaryScan(SQuery* pQuery);
bool onDemandLoadDatablock(SQuery* pQuery, int16_t queryRangeSet);
void setQueryStatus(SQuery* pQuery, int8_t status);
bool doRevisedResultsByLimit(SQInfo* pQInfo);
void truncateResultByLimit(SQInfo* pQInfo, int64_t* final, int32_t* interpo);
void initCtxOutputBuf(SQueryRuntimeEnv* pRuntimeEnv);
void resetCtxOutputBuf(SQueryRuntimeEnv* pRuntimeEnv);
void forwardCtxOutputBuf(SQueryRuntimeEnv* pRuntimeEnv, int64_t output);
bool needPrimaryTimestampCol(SQuery* pQuery, SBlockInfo* pBlockInfo);
void vnodeScanAllData(SQueryRuntimeEnv* pRuntimeEnv);
int32_t vnodeQueryResultInterpolate(SQInfo* pQInfo, tFilePage** pDst, tFilePage** pDataSrc, int32_t numOfRows,
int32_t* numOfInterpo);
void copyResToQueryResultBuf(STableQuerySupportObj* pSupporter, SQuery* pQuery);
void doSkipResults(SQueryRuntimeEnv* pRuntimeEnv);
void doFinalizeResult(SQueryRuntimeEnv* pRuntimeEnv);
int64_t getNumOfResult(SQueryRuntimeEnv* pRuntimeEnv);
void forwardQueryStartPosition(SQueryRuntimeEnv* pRuntimeEnv);
bool normalizedFirstQueryRange(bool dataInDisk, bool dataInCache, STableQuerySupportObj* pSupporter,
SPointInterpoSupporter* pPointInterpSupporter, int64_t* key);
void pointInterpSupporterInit(SQuery* pQuery, SPointInterpoSupporter* pInterpoSupport);
void pointInterpSupporterDestroy(SPointInterpoSupporter* pPointInterpSupport);
void pointInterpSupporterSetData(SQInfo* pQInfo, SPointInterpoSupporter* pPointInterpSupport);
int64_t loadRequiredBlockIntoMem(SQueryRuntimeEnv* pRuntimeEnv, SPositionInfo* position);
void disableFunctForSuppleScan(STableQuerySupportObj* pSupporter, int32_t order);
void enableFunctForMasterScan(SQueryRuntimeEnv* pRuntimeEnv, int32_t order);
int32_t mergeMetersResultToOneGroups(STableQuerySupportObj* pSupporter);
void copyFromWindowResToSData(SQInfo* pQInfo, SWindowResult* result);
SBlockInfo getBlockInfo(SQueryRuntimeEnv *pRuntimeEnv);
SBlockInfo getBlockBasicInfo(SQueryRuntimeEnv *pRuntimeEnv, void* pBlock, int32_t type);
SCacheBlock* getCacheDataBlock(SMeterObj* pMeterObj, SQueryRuntimeEnv* pRuntimeEnv, int32_t slot);
void stableApplyFunctionsOnBlock(STableQuerySupportObj* pSupporter, SMeterDataInfo* pMeterDataInfo,
SBlockInfo* pBlockInfo, SField* pFields, __block_search_fn_t searchFn);
int32_t vnodeFilterQualifiedMeters(SQInfo* pQInfo, int32_t vid, tSidSet* pSidSet, SMeterDataInfo* pMeterDataInfo,
int32_t* numOfMeters, SMeterDataInfo*** pReqMeterDataInfo);
int32_t vnodeGetVnodeHeaderFileIndex(int32_t* fid, SQueryRuntimeEnv* pRuntimeEnv, int32_t order);
int32_t createDataBlocksInfoEx(SMeterDataInfo** pMeterDataInfo, int32_t numOfMeters,
SMeterDataBlockInfoEx** pDataBlockInfoEx, int32_t numOfCompBlocks,
int32_t* nAllocBlocksInfoSize, int64_t addr);
void freeMeterBlockInfoEx(SMeterDataBlockInfoEx* pDataBlockInfoEx, int32_t len);
void setExecutionContext(STableQuerySupportObj* pSupporter, SMeterQueryInfo* pMeterQueryInfo, int32_t meterIdx,
int32_t groupIdx, TSKEY nextKey);
int32_t setAdditionalInfo(STableQuerySupportObj *pSupporter, int32_t meterIdx, SMeterQueryInfo *pMeterQueryInfo);
void doGetAlignedIntervalQueryRangeImpl(SQuery* pQuery, int64_t pKey, int64_t keyFirst, int64_t keyLast,
int64_t* actualSkey, int64_t* actualEkey, int64_t* skey, int64_t* ekey);
int64_t getQueryStartPositionInCache(SQueryRuntimeEnv* pRuntimeEnv, int32_t* slot, int32_t* pos, bool ignoreQueryRange);
int32_t getDataBlocksForMeters(STableQuerySupportObj* pSupporter, SQuery* pQuery, int32_t numOfMeters,
const char* filePath, SMeterDataInfo** pMeterDataInfo, uint32_t* numOfBlocks);
int32_t LoadDatablockOnDemand(SCompBlock* pBlock, SField** pFields, uint8_t* blkStatus, SQueryRuntimeEnv* pRuntimeEnv,
int32_t fileIdx, int32_t slotIdx, __block_search_fn_t searchFn, bool onDemand);
int32_t vnodeGetHeaderFile(SQueryRuntimeEnv* pRuntimeEnv, int32_t fileIndex);
/**
* Create SMeterQueryInfo.
* One SMeterQueryInfo is created for each table during a super table query
*
* @param skey
* @param ekey
* @return
*/
SMeterQueryInfo* createMeterQueryInfo(STableQuerySupportObj* pSupporter, int32_t sid, TSKEY skey, TSKEY ekey);
/**
* Destroy meter query info
* @param pMeterQInfo
* @param numOfCols
*/
void destroyMeterQueryInfo(SMeterQueryInfo* pMeterQueryInfo, int32_t numOfCols);
/**
* change the meter query info for the supplementary scan
* @param pMeterQueryInfo
* @param skey
* @param ekey
*/
void changeMeterQueryInfoForSuppleQuery(SQuery* pQuery, SMeterQueryInfo* pMeterQueryInfo,
TSKEY skey, TSKEY ekey);
/**
* add a newly allocated disk page to the meter query info
* the newly allocated disk page is used to keep the intermediate (interval) results
* @param pQuery
* @param pMeterQueryInfo
* @param pSupporter
*/
tFilePage* addDataPageForMeterQueryInfo(SQuery* pQuery, SMeterQueryInfo* pMeterQueryInfo,
STableQuerySupportObj* pSupporter);
/**
* restore the query range data from SMeterQueryInfo to runtime environment
*
* @param pRuntimeEnv
* @param pMeterQueryInfo
*/
void restoreIntervalQueryRange(SQueryRuntimeEnv* pRuntimeEnv, SMeterQueryInfo* pMeterQueryInfo);
/**
* set the interval query range for the interval query, when handling a data(cache) block
*
* @param pMeterQueryInfo
* @param pSupporter
* @param key
*/
void setIntervalQueryRange(SMeterQueryInfo* pMeterQueryInfo, STableQuerySupportObj* pSupporter, int64_t key);
/**
* set the meter data information
* @param pMeterDataInfo
* @param pMeterObj current query meter object
* @param meterIdx meter index in the sid list
* @param groupId group index to which the meter belongs
*/
void setMeterDataInfo(SMeterDataInfo* pMeterDataInfo, SMeterObj* pMeterObj, int32_t meterIdx, int32_t groupId);
void vnodeSetTagValueInParam(tSidSet* pSidSet, SQueryRuntimeEnv* pRuntimeEnv, SMeterSidExtInfo* pMeterInfo);
void vnodeCheckIfDataExists(SQueryRuntimeEnv* pRuntimeEnv, SMeterObj* pMeterObj, bool* dataInDisk, bool* dataInCache);
void displayInterResult(SData** pdata, SQuery* pQuery, int32_t numOfRows);
void vnodePrintQueryStatistics(STableQuerySupportObj* pSupporter);
void clearTimeWindowResBuf(SQueryRuntimeEnv* pRuntimeEnv, SWindowResult* pOneOutputRes);
void copyTimeWindowResBuf(SQueryRuntimeEnv* pRuntimeEnv, SWindowResult* dst, const SWindowResult* src);
int32_t initWindowResInfo(SWindowResInfo* pWindowResInfo, SQueryRuntimeEnv* pRuntimeEnv, int32_t size,
int32_t threshold, int16_t type);
void cleanupTimeWindowInfo(SWindowResInfo* pWindowResInfo, SQueryRuntimeEnv* pRuntimeEnv);
void resetTimeWindowInfo(SQueryRuntimeEnv* pRuntimeEnv, SWindowResInfo* pWindowResInfo);
void clearFirstNTimeWindow(SQueryRuntimeEnv *pRuntimeEnv, int32_t num);
void clearClosedTimeWindow(SQueryRuntimeEnv* pRuntimeEnv);
int32_t numOfClosedTimeWindow(SWindowResInfo* pWindowResInfo);
void closeTimeWindow(SWindowResInfo* pWindowResInfo, int32_t slot);
void closeAllTimeWindow(SWindowResInfo* pWindowResInfo);
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODEQUERYIMPL_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODEREAD_H
#define TDENGINE_VNODEREAD_H
#ifdef __cplusplus
extern "C" {
#endif
#include "os.h"
#include "qresultBuf.h"
#include "qinterpolation.h"
#include "vnodeTagMgmt.h"
/*
* used to keep the first point position, consisting of the position in the
* block, the block id, and the file id
*/
typedef struct {
int32_t pos;
int32_t slot;
int32_t fileId;
} SPositionInfo;
typedef struct SLoadDataBlockInfo {
int32_t fileListIndex; /* index of this file in files list of this vnode */
int32_t fileId;
int32_t slotIdx;
int32_t sid;
bool tsLoaded; // if timestamp column of current block is loaded or not
} SLoadDataBlockInfo;
typedef struct SLoadCompBlockInfo {
int32_t sid; /* meter sid */
int32_t fileId;
int32_t fileListIndex;
} SLoadCompBlockInfo;
/*
* the header file info for one vnode
*/
typedef struct SHeaderFileInfo {
int32_t fileID; // file id
} SHeaderFileInfo;
typedef struct SQueryCostSummary {
double cacheTimeUs;
double fileTimeUs;
int64_t numOfFiles; // opened files during query
int64_t numOfTables; // number of queried tables
int64_t numOfSeek; // number of seek operation
int64_t readDiskBlocks; // accessed disk block
int64_t skippedFileBlocks; // skipped blocks
int64_t blocksInCache; // accessed cache blocks
int64_t readField; // field size
int64_t totalFieldSize; // total read fields size
double loadFieldUs; // total elapsed time to read fields info
int64_t totalBlockSize; // read data blocks
double loadBlocksUs; // total elapsed time to read data blocks
int64_t totalGenData; // in-memory generated data
int64_t readCompInfo; // read compblock info
int64_t totalCompInfoSize; // total comp block size
double loadCompInfoUs; // total elapsed time to read comp block info
int64_t tmpBufferInDisk; // size of buffer for intermediate result
} SQueryCostSummary;
typedef struct SPosInfo {
int16_t pageId;
int16_t rowId;
} SPosInfo;
typedef struct STimeWindow {
TSKEY skey;
TSKEY ekey;
} STimeWindow;
typedef struct SWindowStatus {
bool closed;
} SWindowStatus;
typedef struct SWindowResult {
uint16_t numOfRows;
SPosInfo pos; // Position of current result in disk-based output buffer
SResultInfo* resultInfo; // For each result column, there is a resultInfo
STimeWindow window; // The time window that current result covers.
SWindowStatus status;
} SWindowResult;
/*
* header file info; kept to avoid iterating the directory, since the data is
* acquired in the query preparation function
*/
typedef struct SQueryFilesInfo {
SHeaderFileInfo* pFileInfo;
uint32_t numOfFiles; // the total available number of files for this virtual node during query execution
int32_t current; // the memory mapped header file; NOTE: only one header file can be mmapped at a time
int32_t vnodeId;
int32_t headerFd; // header file fd
int64_t headerFileSize;
int32_t dataFd;
int32_t lastFd;
char headerFilePath[PATH_MAX]; // current opened header file name
char dataFilePath[PATH_MAX]; // current opened data file name
char lastFilePath[PATH_MAX]; // current opened last file path
char dbFilePathPrefix[PATH_MAX];
} SQueryFilesInfo;
typedef struct SWindowResInfo {
SWindowResult* pResult; // reference to SQuerySupporter->pResult
void* hashList; // hash list for quick access
int16_t type; // data type for hash key
int32_t capacity; // max capacity
int32_t curIndex; // current start active index
int32_t size;
int64_t startTime; // start time of the first time window for sliding query
int64_t prevSKey; // previous (not completed) sliding window start key
int64_t threshold; // threshold for returning completed results.
} SWindowResInfo;
typedef struct SQueryRuntimeEnv {
SPositionInfo startPos; /* the start position, used for secondary/third iteration */
SPositionInfo endPos; /* the last access position in query, served as the start pos of reversed order query */
SPositionInfo nextPos; /* start position of the next scan */
SData* colDataBuffer[TSDB_MAX_COLUMNS];
SResultInfo* resultInfo; // todo refactor to merge with SWindowResInfo
uint8_t blockStatus; // indicates whether the data block is loaded and whether it is the first/last/internal block
int32_t unzipBufSize;
SData* primaryColBuffer;
char* unzipBuffer;
char* secondaryUnzipBuffer;
SQuery* pQuery;
SMeterObj* pMeterObj;
SQLFunctionCtx* pCtx;
SLoadDataBlockInfo loadBlockInfo; /* record current block load information */
SLoadCompBlockInfo loadCompBlockInfo; /* record current compblock information in SQuery */
SQueryFilesInfo vnodeFileInfo;
int16_t numOfRowsPerPage;
int16_t offset[TSDB_MAX_COLUMNS];
uint16_t scanFlag; // denotes reversed scan of data or not
SInterpolationInfo interpoInfo;
SData** pInterpoBuf;
SWindowResInfo windowResInfo;
STSBuf* pTSBuf;
STSCursor cur;
SQueryCostSummary summary;
bool stableQuery; // is super table query or not
SDiskbasedResultBuf* pResultBuf; // query result buffer based on blocked-wised disk file
/*
* Temporarily holds the in-memory cache block info while scanning cache blocks.
* Here we do not use the cache block info from pMeterObj, simply because it may change at any time
* during the query by the submit/insert handling threads.
* So we keep a copy of the support structure as well as the cache block data itself.
*/
SCacheBlock cacheBlock;
} SQueryRuntimeEnv;
/* intermediate position during a multi-meter query involving intervals */
typedef struct SMeterQueryInfo {
int64_t lastKey;
int64_t skey;
int64_t ekey;
int32_t numOfRes;
int16_t queryRangeSet; // denote if the query range is set, only available for interval query
int64_t tag;
STSCursor cur;
int32_t sid; // for retrieving the page id list
SWindowResInfo windowResInfo;
} SMeterQueryInfo;
typedef struct SMeterDataInfo {
uint64_t offsetInHeaderFile;
int32_t numOfBlocks;
int32_t start; // start block index
SCompBlock* pBlock;
int32_t meterOrderIdx;
SMeterObj* pMeterObj;
int32_t groupIdx; // group id in meter list
SMeterQueryInfo* pMeterQInfo;
} SMeterDataInfo;
typedef struct STableQuerySupportObj {
void* pMetersHashTable; // meter table hash list
SMeterSidExtInfo** pMeterSidExtInfo;
int32_t numOfMeters;
/*
* multi-meter query result set.
* In multi-meter queries, the result is temporarily stored in this structure instead of
* being put directly into the output buffer, since we have no idea how many
* rows may be generated by a specific subgroup. Once the query on all subgroups has been executed,
* the result is copied to the output buffer. This attribute is not used during single-meter query processing.
*/
SQueryRuntimeEnv runtimeEnv;
int64_t rawSKey;
int64_t rawEKey;
int32_t subgroupIdx;
int32_t offset; /* offset in group result set of subgroup */
tSidSet* pSidSet;
/*
* the position in the whole meter list at which the query is currently executing.
* when the index reaches the last one in the list, the query is completed.
* We may later refactor to remove this attribute by using another flag to denote
* whether a multi-meter query is completed or not.
*/
int32_t meterIdx;
int32_t numOfGroupResultPages;
int32_t groupResultSize;
SMeterDataInfo* pMeterDataInfo;
TSKEY* tsList;
} STableQuerySupportObj;
typedef struct _qinfo {
uint64_t signature;
int32_t refCount; // QInfo reference count, when the value is 0, it can be released safely
char user[TSDB_TABLE_ID_LEN + 1];
char sql[TSDB_SHOW_SQL_LEN];
uint8_t stream;
uint16_t port;
uint32_t ip;
uint64_t startTime;
int64_t useconds;
int killed;
struct _qinfo *prev, *next;
SQuery query;
int totalPoints;
int pointsRead;
int pointsReturned;
int pointsInterpo;
int code;
char bufIndex;
char changed;
char over;
SMeterObj* pObj;
sem_t dataReady;
STableQuerySupportObj* pTableQuerySupporter;
int (*fp)(SMeterObj*, SQuery*);
} SQInfo;
int32_t vnodeQueryTablePrepare(SQInfo* pQInfo, SMeterObj* pMeterObj, STableQuerySupportObj* pSMultiMeterObj,
void* param);
void vnodeQueryFreeQInfoEx(SQInfo* pQInfo);
bool vnodeParametersSafetyCheck(SQuery* pQuery);
int32_t vnodeSTableQueryPrepare(SQInfo* pQInfo, SQuery* pQuery, void* param);
/**
* decrease the numOfQueries of each queried table, so that the
* remove/close operation can be executed
* @param pQInfo
*/
void vnodeDecMeterRefcnt(SQInfo* pQInfo);
/* SQL query handler in dnode */
void vnodeSingleTableQuery(SSchedMsg* pMsg);
/*
* handle multi-meter query process
*/
void vnodeMultiMeterQuery(SSchedMsg* pMsg);
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODEREAD_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODESHELL_H
#define TDENGINE_VNODESHELL_H
#ifdef __cplusplus
extern "C" {
#endif
#include "os.h"
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODESHELL_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODESTORE_H
#define TDENGINE_VNODESTORE_H
#ifdef __cplusplus
extern "C" {
#endif
void vnodeProcessDataFromVnode(SIntMsg *msg, void *tcpHandle);
void vnodeCalcOpenVnodes();
bool vnodeRemoveDataFileFromLinkFile(char* linkFile, char* de_name);
int vnodeInitInfo();
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODESTORE_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TBASE_MNODE_SUPER_TABLE_QUERY_H
#define TBASE_MNODE_SUPER_TABLE_QUERY_H
#include "os.h"
#include "mnode.h"
#include "qast.h"
int32_t mgmtDoJoin(SSuperTableMetaMsg* pSuperTableMetaMsg, tQueryResultset* pRes);
void mgmtReorganizeMetersInMetricMeta(SSuperTableMetaMsg* pInfo, int32_t index, tQueryResultset* pRes);
#endif
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODESYSTEM_H
#define TDENGINE_VNODESYSTEM_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODESYSTEM_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODETAGMGMT_H
#define TDENGINE_VNODETAGMGMT_H
#ifdef __cplusplus
extern "C" {
#endif
/*
* @version 0.1
* @date 2018/01/02
* @author liaohj
* management of the tag values of tables.
* In queries, the client needs the vnode to aggregate results according to tag
* values; the grouping operation is done here.
* Note:
* 1. we implement a quick sort algorithm here; it may be removed later.
*/
typedef int32_t (*__ext_compar_fn_t)(const void *p1, const void *p2, void *param);
tSidSet *tSidSetCreate(struct SMeterSidExtInfo **pMeterSidExtInfo, int32_t numOfMeters, SSchema *pSchema,
int32_t numOfTags, SColIndexEx *colList, int32_t numOfOrderCols);
int32_t *calculateSubGroup(void **pSids, int32_t numOfMeters, int32_t *numOfSubset, tOrderDescriptor *pOrderDesc,
__ext_compar_fn_t compareFn);
void tSidSetDestroy(tSidSet **pSets);
void tSidSetSort(tSidSet *pSets);
int32_t meterSidComparator(const void *s1, const void *s2, void *param);
int32_t doCompare(char *f1, char *f2, int32_t type, int32_t size);
void tQSortEx(void **pMeterSids, size_t size, int32_t start, int32_t end, void *param, __ext_compar_fn_t compareFn);
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODETAGMGMT_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef TDENGINE_VNODE_UTIL_H
#define TDENGINE_VNODE_UTIL_H
#ifdef __cplusplus
extern "C" {
#endif
/* get the qinfo struct address from the query struct address */
#define GET_COLUMN_BYTES(query, colidx) \
((query)->colList[(query)->pSelectExpr[colidx].pBase.colInfo.colIdxInBuf].data.bytes)
#define GET_COLUMN_TYPE(query, colidx) \
((query)->colList[(query)->pSelectExpr[colidx].pBase.colInfo.colIdxInBuf].data.type)
#define QUERY_IS_ASC_QUERY(q) (GET_FORWARD_DIRECTION_FACTOR((q)->order.order) == QUERY_ASC_FORWARD_STEP)
#define EXTRA_BYTES 2 // for possible compression deflation
#define GET_COL_DATA_POS(query, index, step) ((query)->pos + (index)*(step))
int vnodeGetEid(int days);
int vnodeCheckFileIntegrity(FILE *fp);
void vnodeCreateFileHeader(FILE *fp);
void vnodeCreateFileHeaderFd(int fd);
void vnodeGetHeadFileHeaderInfo(int fd, SVnodeHeadInfo *pHeadInfo);
void vnodeUpdateHeadFileHeader(int fd, SVnodeHeadInfo *pHeadInfo);
/**
* check if two schema is identical or not
* This function does not check if a schema is valid or not
*
* @param pSSchemaFirst
* @param numOfCols1
* @param pSSchemaSecond
* @param numOfCols2
* @return
*/
bool vnodeMeterSchemaIdentical(SColumn *pSchema1, int32_t numOfCols1, SColumn *pSchema2, int32_t numOfCols2);
/**
* free SFields in SQuery
* vnodeFreeFields must be called before free(pQuery->pBlock);
* @param pQuery
*/
void vnodeFreeFields(SQuery *pQuery);
void vnodeUpdateFilterColumnIndex(SQuery* pQuery);
void vnodeUpdateQueryColumnIndex(SQuery* pQuery, SMeterObj* pMeterObj);
int32_t vnodeCreateFilterInfo(void* pQInfo, SQuery *pQuery);
bool vnodeFilterData(SQuery* pQuery, int32_t* numOfActualRead, int32_t index);
bool vnodeDoFilterData(SQuery* pQuery, int32_t elemPos);
bool vnodeIsProjectionQuery(SSqlFunctionExpr *pExpr, int32_t numOfOutput);
int32_t vnodeIncQueryRefCount(SQueryMeterMsg *pQueryMsg, SMeterSidExtInfo **pSids, SMeterObj **pMeterObjList,
int32_t *numOfInc);
void vnodeDecQueryRefCount(SQueryMeterMsg *pQueryMsg, SMeterObj **pMeterObjList, int32_t numOfInc);
int32_t vnodeSetMeterState(SMeterObj* pMeterObj, int32_t state);
void vnodeClearMeterState(SMeterObj* pMeterObj, int32_t state);
bool vnodeIsMeterState(SMeterObj* pMeterObj, int32_t state);
void vnodeSetMeterDeleting(SMeterObj* pMeterObj);
int32_t vnodeSetMeterInsertImportStateEx(SMeterObj* pObj, int32_t st);
bool vnodeIsSafeToDeleteMeter(SVnodeObj* pVnode, int32_t sid);
void vnodeFreeColumnInfo(SColumnInfo* pColumnInfo);
bool isGroupbyNormalCol(SSqlGroupbyExpr* pExpr);
#ifdef __cplusplus
}
#endif
#endif // TDENGINE_VNODE_UTIL_H
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "taosmsg.h"
#include "vnode.h"
#include "vnodeCache.h"
#include "vnodeUtil.h"
#include "vnodeStatus.h"
void vnodeSearchPointInCache(SMeterObj *pObj, SQuery *pQuery);
void vnodeProcessCommitTimer(void *param, void *tmrId);
void *vnodeOpenCachePool(int vnode) {
SCachePool *pCachePool;
SVnodeCfg * pCfg = &vnodeList[vnode].cfg;
int blockId = 0;
char * pMem = NULL;
pCachePool = (SCachePool *)malloc(sizeof(SCachePool));
if (pCachePool == NULL) {
dError("no memory to allocate cache pool!");
return NULL;
}
memset(pCachePool, 0, sizeof(SCachePool));
pCachePool->count = 1;
pCachePool->vnode = vnode;
pthread_mutex_init(&(pCachePool->vmutex), NULL);
size_t size = sizeof(char *) * pCfg->cacheNumOfBlocks.totalBlocks;
pCachePool->pMem = malloc(size);
if (pCachePool->pMem == NULL) {
dError("no memory to allocate cache blocks!");
pthread_mutex_destroy(&(pCachePool->vmutex));
tfree(pCachePool);
return NULL;
}
memset(pCachePool->pMem, 0, size);
pCachePool->threshold = pCfg->cacheNumOfBlocks.totalBlocks * 0.6;
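// each allocation round below grabs at most ~1GB of cache blocks, so the whole
// pool is carved out of the heap in chunks of up to 1GB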
int maxAllocBlock = (1024 * 1024 * 1024) / pCfg->cacheBlockSize;
if (maxAllocBlock < 1) {
dError("Cache block size is too large");
pthread_mutex_destroy(&(pCachePool->vmutex));
tfree(pCachePool->pMem);
tfree(pCachePool);
return NULL;
}
while (blockId < pCfg->cacheNumOfBlocks.totalBlocks) {
// TODO : Allocate real blocks
int allocBlocks = MIN(pCfg->cacheNumOfBlocks.totalBlocks - blockId, maxAllocBlock);
pMem = calloc(allocBlocks, pCfg->cacheBlockSize);
if (pMem == NULL) {
dError("failed to allocate cache memory: %d", allocBlocks*pCfg->cacheBlockSize);
goto _err_exit;
}
for (int i = 0; i < allocBlocks; i++) {
pCachePool->pMem[blockId] = pMem + i * pCfg->cacheBlockSize;
blockId++;
}
}
dPrint("vid:%d, cache pool is allocated:0x%x", vnode, pCachePool);
return pCachePool;
_err_exit:
pthread_mutex_destroy(&(pCachePool->vmutex));
// TODO : Free the cache blocks and return
blockId = 0;
while (blockId < pCfg->cacheNumOfBlocks.totalBlocks) {
tfree(pCachePool->pMem[blockId]);
blockId = blockId + (MIN(maxAllocBlock, pCfg->cacheNumOfBlocks.totalBlocks - blockId));
}
tfree(pCachePool->pMem);
tfree(pCachePool);
return NULL;
}
void vnodeCloseCachePool(int vnode) {
SVnodeObj * pVnode = vnodeList + vnode;
SCachePool *pCachePool = (SCachePool *)pVnode->pCachePool;
int blockId = 0;
taosTmrStopA(&pVnode->commitTimer);
if (pVnode->commitInProcess) pthread_cancel(pVnode->commitThread);
dPrint("vid:%d, cache pool closed, count:%d", vnode, pCachePool->count);
int maxAllocBlock = (1024 * 1024 * 1024) / pVnode->cfg.cacheBlockSize;
while (blockId < pVnode->cfg.cacheNumOfBlocks.totalBlocks) {
tfree(pCachePool->pMem[blockId]);
blockId = blockId + (MIN(maxAllocBlock, pVnode->cfg.cacheNumOfBlocks.totalBlocks - blockId));
}
tfree(pCachePool->pMem);
pthread_mutex_destroy(&(pCachePool->vmutex));
tfree(pCachePool);
pVnode->pCachePool = NULL;
}
void *vnodeAllocateCacheInfo(SMeterObj *pObj) {
SCacheInfo *pInfo;
size_t size;
SVnodeCfg * pCfg = &vnodeList[pObj->vnode].cfg;
size = sizeof(SCacheInfo);
pInfo = (SCacheInfo *)malloc(size);
if (pInfo == NULL) {
dError("id:%s, no memory for cacheInfo", pObj->meterId);
return NULL;
}
memset(pInfo, 0, size);
pInfo->maxBlocks = vnodeList[pObj->vnode].cfg.blocksPerMeter;
size = sizeof(SCacheBlock *) * pInfo->maxBlocks;
pInfo->cacheBlocks = (SCacheBlock **)malloc(size);
if (pInfo->cacheBlocks == NULL) {
dError("id:%s, no memory for cacheBlocks", pObj->meterId);
tfree(pInfo);
return NULL;
}
memset(pInfo->cacheBlocks, 0, size);
pInfo->currentSlot = -1;
pObj->pointsPerBlock =
(pCfg->cacheBlockSize - sizeof(SCacheBlock) - pObj->numOfColumns * sizeof(char *)) / pObj->bytesPerPoint;
if (pObj->pointsPerBlock > pObj->pointsPerFileBlock) pObj->pointsPerBlock = pObj->pointsPerFileBlock;
pObj->pCache = (void *)pInfo;
pObj->freePoints = pObj->pointsPerBlock * pInfo->maxBlocks;
return (void *)pInfo;
}
int vnodeFreeCacheBlock(SCacheBlock *pCacheBlock) {
SMeterObj * pObj;
SCacheInfo *pInfo;
if (pCacheBlock == NULL) return -1;
pObj = pCacheBlock->pMeterObj;
if (pObj) {
pInfo = (SCacheInfo *)pObj->pCache;
pInfo->numOfBlocks--;
if (pInfo->numOfBlocks < 0) {
dError("vid:%d sid:%d id:%s, numOfBlocks:%d shall never be negative", pObj->vnode, pObj->sid, pObj->meterId,
pInfo->numOfBlocks);
}
if (pCacheBlock->blockId == 0) {
dError("vid:%d sid:%d id:%s, double free", pObj->vnode, pObj->sid, pObj->meterId);
}
SCachePool *pPool = (SCachePool *)vnodeList[pObj->vnode].pCachePool;
if (pCacheBlock->notFree) {
pPool->notFreeSlots--;
pInfo->unCommittedBlocks--;
dTrace("vid:%d sid:%d id:%s, cache block is not free, slot:%d, index:%d notFreeSlots:%d",
pObj->vnode, pObj->sid, pObj->meterId, pCacheBlock->slot, pCacheBlock->index, pPool->notFreeSlots);
}
dTrace("vid:%d sid:%d id:%s, free a cache block, numOfBlocks:%d, slot:%d, index:%d notFreeSlots:%d",
pObj->vnode, pObj->sid, pObj->meterId, pInfo->numOfBlocks, pCacheBlock->slot, pCacheBlock->index,
pPool->notFreeSlots);
memset(pCacheBlock, 0, sizeof(SCacheBlock));
} else {
dError("BUG, pObj is null");
}
return 0;
}
void vnodeFreeCacheInfo(SMeterObj *pObj) {
SCacheInfo * pInfo;
SCacheBlock *pCacheBlock;
SCachePool * pPool;
int slot, numOfBlocks;
if (pObj == NULL || pObj->pCache == NULL) return;
pPool = (SCachePool *)vnodeList[pObj->vnode].pCachePool;
pInfo = (SCacheInfo *)pObj->pCache;
if (pPool == NULL || pInfo == NULL) return;
pthread_mutex_lock(&pPool->vmutex);
numOfBlocks = pInfo->numOfBlocks;
slot = pInfo->currentSlot;
for (int i = 0; i < numOfBlocks; ++i) {
pCacheBlock = pInfo->cacheBlocks[slot];
vnodeFreeCacheBlock(pCacheBlock);
slot = (slot - 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
}
pObj->pCache = NULL;
tfree(pInfo->cacheBlocks);
tfree(pInfo);
pthread_mutex_unlock(&pPool->vmutex);
}
uint64_t vnodeGetPoolCount(SVnodeObj *pVnode) {
SCachePool *pPool;
pPool = (SCachePool *)pVnode->pCachePool;
return pPool->count;
}
void vnodeUpdateCommitInfo(SMeterObj *pObj, int slot, int pos, uint64_t count) {
SCacheInfo * pInfo;
SCacheBlock *pBlock;
SCachePool * pPool;
pInfo = (SCacheInfo *)pObj->pCache;
pPool = (SCachePool *)vnodeList[pObj->vnode].pCachePool;
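// start from the first uncommitted block: if the previous commit ended exactly at a
// block boundary, move on to the next slot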
int tslot =
(pInfo->commitPoint == pObj->pointsPerBlock) ? (pInfo->commitSlot + 1) % pInfo->maxBlocks : pInfo->commitSlot;
int slots = 0;
while (tslot != slot || ((tslot == slot) && (pos == pObj->pointsPerBlock))) {
slots++;
pthread_mutex_lock(&pPool->vmutex);
pBlock = pInfo->cacheBlocks[tslot];
assert(pBlock->notFree);
pBlock->notFree = 0;
pInfo->unCommittedBlocks--;
pPool->notFreeSlots--;
pthread_mutex_unlock(&pPool->vmutex);
dTrace("vid:%d sid:%d id:%s, cache block is committed, slot:%d, index:%d notFreeSlots:%d, unCommittedBlocks:%d",
pObj->vnode, pObj->sid, pObj->meterId, pBlock->slot, pBlock->index, pPool->notFreeSlots,
pInfo->unCommittedBlocks);
if (tslot == slot) break;
tslot = (tslot + 1) % pInfo->maxBlocks;
}
atomic_fetch_add_32(&pObj->freePoints, pObj->pointsPerBlock * slots);
pInfo->commitSlot = slot;
pInfo->commitPoint = pos;
pObj->commitCount = count;
}
TSKEY vnodeGetFirstKey(int vnode) {
SMeterObj * pObj;
SCacheInfo * pInfo;
SCacheBlock *pCacheBlock;
SVnodeCfg *pCfg = &vnodeList[vnode].cfg;
TSKEY key = taosGetTimestamp(pCfg->precision);
for (int sid = 0; sid < pCfg->maxSessions; ++sid) {
pObj = vnodeList[vnode].meterList[sid];
if (pObj == NULL || pObj->pCache == NULL) continue;
pInfo = (SCacheInfo *)pObj->pCache;
pCacheBlock = pInfo->cacheBlocks[0];
if (pCacheBlock == NULL || pCacheBlock->numOfPoints <= 0) continue;
if (*((TSKEY *)(pCacheBlock->offset[0])) < key) key = *((TSKEY *)(pCacheBlock->offset[0]));
}
return key;
}
pthread_t vnodeCreateCommitThread(SVnodeObj *pVnode) {
// the pool mutex must be locked before this function is called
pthread_attr_t thattr;
SCachePool * pPool = (SCachePool *)pVnode->pCachePool;
if (pPool->commitInProcess) {
dTrace("vid:%d, commit is already in process", pVnode->vnode);
return pVnode->commitThread;
}
taosTmrStopA(&pVnode->commitTimer);
if (pVnode->vnodeStatus == TSDB_VN_STATUS_UNSYNCED) {
taosTmrReset(vnodeProcessCommitTimer, pVnode->cfg.commitTime * 1000, pVnode, vnodeTmrCtrl, &pVnode->commitTimer);
dTrace("vid:%d, it is in unsyc state, commit later", pVnode->vnode);
return pVnode->commitThread;
}
pthread_attr_init(&thattr);
pthread_attr_setdetachstate(&thattr, PTHREAD_CREATE_DETACHED);
if (pthread_create(&(pVnode->commitThread), &thattr, vnodeCommitToFile, pVnode) != 0) {
dError("vid:%d, failed to create thread to commit file, reason:%s", pVnode->vnode, strerror(errno));
} else {
pPool->commitInProcess = 1;
dTrace("vid:%d, commit thread: 0x%lx is created", pVnode->vnode, pVnode->commitThread);
}
pthread_attr_destroy(&thattr);
return pVnode->commitThread;
}
void vnodeProcessCommitTimer(void *param, void *tmrId) {
SVnodeObj * pVnode = (SVnodeObj *)param;
SCachePool *pPool = (SCachePool *)pVnode->pCachePool;
pthread_mutex_lock(&pPool->vmutex);
vnodeCreateCommitThread(pVnode);
pthread_mutex_unlock(&pPool->vmutex);
}
void vnodeCommitOver(SVnodeObj *pVnode) {
SCachePool *pPool = (SCachePool *)(pVnode->pCachePool);
taosTmrReset(vnodeProcessCommitTimer, pVnode->cfg.commitTime * 1000, pVnode, vnodeTmrCtrl, &pVnode->commitTimer);
pthread_mutex_lock(&pPool->vmutex);
pPool->commitInProcess = 0;
dTrace("vid:%d, commit is over, notFreeSlots:%d", pPool->vnode, pPool->notFreeSlots);
pthread_mutex_unlock(&pPool->vmutex);
}
static void vnodeWaitForCommitComplete(SVnodeObj *pVnode) {
SCachePool *pPool = (SCachePool *)(pVnode->pCachePool);
// wait up to totalCount * 10ms (about 10 seconds) for the ongoing commit to finish
const int32_t totalCount = 1000;
int32_t count = 0;
// all meters are marked as dropped, so the commit will abort very quickly
while(count++ < totalCount) {
int32_t commitInProcess = 0;
pthread_mutex_lock(&pPool->vmutex);
commitInProcess = pPool->commitInProcess;
pthread_mutex_unlock(&pPool->vmutex);
if (!commitInProcess) break;
dWarn("vid:%d still in commit, wait for it to complete", pVnode->vnode);
taosMsleep(10);
}
}
void vnodeCancelCommit(SVnodeObj *pVnode) {
SCachePool *pPool = (SCachePool *)(pVnode->pCachePool);
if (pPool == NULL) return;
vnodeWaitForCommitComplete(pVnode);
taosTmrReset(vnodeProcessCommitTimer, pVnode->cfg.commitTime * 1000, pVnode, vnodeTmrCtrl, &pVnode->commitTimer);
}
/* The vnode cache lock must be held before calling this interface */
SCacheBlock *vnodeGetFreeCacheBlock(SVnodeObj *pVnode) {
SCachePool *pPool = (SCachePool *)(pVnode->pCachePool);
SVnodeCfg *pCfg = &(pVnode->cfg);
SCacheBlock *pCacheBlock = NULL;
int skipped = 0;
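// scan the pool circularly from freeSlot: an unused slot (blockId == 0) is taken directly,
// a pinned slot (notFree) is skipped, otherwise the owning meter's oldest cache block is recycled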
while (1) {
pCacheBlock = (SCacheBlock *)(pPool->pMem[((int64_t)pPool->freeSlot)]);
if (pCacheBlock->blockId == 0) break;
if (pCacheBlock->notFree) {
pPool->freeSlot++;
pPool->freeSlot = pPool->freeSlot % pCfg->cacheNumOfBlocks.totalBlocks;
skipped++;
if (skipped > pPool->threshold) {
vnodeCreateCommitThread(pVnode);
pthread_mutex_unlock(&pPool->vmutex);
dError("vid:%d committing process is too slow, notFreeSlots:%d....", pVnode->vnode, pPool->notFreeSlots);
return NULL;
}
} else {
SMeterObj * pRelObj = pCacheBlock->pMeterObj;
SCacheInfo *pRelInfo = (SCacheInfo *)pRelObj->pCache;
int firstSlot = (pRelInfo->currentSlot - pRelInfo->numOfBlocks + 1 + pRelInfo->maxBlocks) % pRelInfo->maxBlocks;
pCacheBlock = pRelInfo->cacheBlocks[firstSlot];
if (pCacheBlock) {
pPool->freeSlot = pCacheBlock->index;
vnodeFreeCacheBlock(pCacheBlock);
break;
} else {
pPool->freeSlot = (pPool->freeSlot + 1) % pCfg->cacheNumOfBlocks.totalBlocks;
skipped++;
}
}
}
pCacheBlock = (SCacheBlock *)(pPool->pMem[pPool->freeSlot]);
pCacheBlock->index = pPool->freeSlot;
pCacheBlock->notFree = 1;
pPool->freeSlot = (pPool->freeSlot + 1) % pCfg->cacheNumOfBlocks.totalBlocks;
pPool->notFreeSlots++;
return pCacheBlock;
}
int vnodeAllocateCacheBlock(SMeterObj *pObj) {
int index;
SCachePool * pPool;
SCacheBlock *pCacheBlock;
SCacheInfo * pInfo;
SVnodeObj * pVnode;
int commit = 0;
pVnode = vnodeList + pObj->vnode;
pPool = (SCachePool *)pVnode->pCachePool;
pInfo = (SCacheInfo *)pObj->pCache;
SVnodeCfg *pCfg = &(vnodeList[pObj->vnode].cfg);
if (pPool == NULL) return -1;
pthread_mutex_lock(&pPool->vmutex);
if (pInfo == NULL || pInfo->cacheBlocks == NULL) {
pthread_mutex_unlock(&pPool->vmutex);
dError("vid:%d sid:%d id:%s, meter is not there", pObj->vnode, pObj->sid, pObj->meterId);
return -1;
}
if (pPool->count <= 1) {
if (pVnode->commitTimer == NULL)
pVnode->commitTimer = taosTmrStart(vnodeProcessCommitTimer, pCfg->commitTime * 1000, pVnode, vnodeTmrCtrl);
}
if (pInfo->unCommittedBlocks >= pInfo->maxBlocks-1) {
vnodeCreateCommitThread(pVnode);
pthread_mutex_unlock(&pPool->vmutex);
dError("vid:%d sid:%d id:%s, all blocks are not committed yet....", pObj->vnode, pObj->sid, pObj->meterId);
return -1;
}
if ((pCacheBlock = vnodeGetFreeCacheBlock(pVnode)) == NULL) return -1;
index = pCacheBlock->index;
pCacheBlock->pMeterObj = pObj;
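// cache block layout: SCacheBlock header, then an array of per-column data pointers,
// then the column data itself, stored column by column with pointsPerBlock rows per column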
pCacheBlock->offset[0] = ((char *)(pCacheBlock)) + sizeof(SCacheBlock) + pObj->numOfColumns * sizeof(char *);
for (int col = 1; col < pObj->numOfColumns; ++col)
pCacheBlock->offset[col] = pCacheBlock->offset[col - 1] + pObj->schema[col - 1].bytes * pObj->pointsPerBlock;
pInfo->numOfBlocks++;
pInfo->blocks++;
pInfo->unCommittedBlocks++;
pInfo->currentSlot = (pInfo->currentSlot + 1) % pInfo->maxBlocks;
pCacheBlock->blockId = pInfo->blocks;
pCacheBlock->slot = pInfo->currentSlot;
if (pInfo->numOfBlocks > pInfo->maxBlocks) {
pCacheBlock = pInfo->cacheBlocks[pInfo->currentSlot];
vnodeFreeCacheBlock(pCacheBlock);
}
pInfo->cacheBlocks[pInfo->currentSlot] = (SCacheBlock *)(pPool->pMem[(int64_t)index]);
dTrace("vid:%d sid:%d id:%s, allocate a cache block, numOfBlocks:%d, slot:%d, index:%d notFreeSlots:%d blocks:%d",
pObj->vnode, pObj->sid, pObj->meterId, pInfo->numOfBlocks, pInfo->currentSlot, index, pPool->notFreeSlots,
pInfo->blocks);
if (((pPool->notFreeSlots > pPool->threshold) || (pInfo->unCommittedBlocks >= pInfo->maxBlocks / 2))) {
dTrace("vid:%d sid:%d id:%s, too many unCommitted slots, unCommitted:%d notFreeSlots:%d",
pObj->vnode, pObj->sid, pObj->meterId, pInfo->unCommittedBlocks, pPool->notFreeSlots);
vnodeCreateCommitThread(pVnode);
commit = 1;
}
pthread_mutex_unlock(&pPool->vmutex);
return commit;
}
int vnodeInsertPointToCache(SMeterObj *pObj, char *pData) {
SCacheBlock *pCacheBlock;
SCacheInfo * pInfo;
SCachePool * pPool;
pInfo = (SCacheInfo *)pObj->pCache;
pPool = (SCachePool *)vnodeList[pObj->vnode].pCachePool;
if (pInfo->numOfBlocks == 0) {
if (vnodeAllocateCacheBlock(pObj) < 0) {
return -1;
}
}
if (pInfo->currentSlot < 0) return -1;
pCacheBlock = pInfo->cacheBlocks[pInfo->currentSlot];
if (pCacheBlock->numOfPoints >= pObj->pointsPerBlock) {
if (vnodeAllocateCacheBlock(pObj) < 0) return -1;
pCacheBlock = pInfo->cacheBlocks[pInfo->currentSlot];
}
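// append the row column by column: each value is copied to the end of its column array
// inside the current cache block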
for (int col = 0; col < pObj->numOfColumns; ++col) {
memcpy(pCacheBlock->offset[col] + pCacheBlock->numOfPoints * pObj->schema[col].bytes, pData,
pObj->schema[col].bytes);
pData += pObj->schema[col].bytes;
}
atomic_fetch_sub_32(&pObj->freePoints, 1);
pCacheBlock->numOfPoints++;
pPool->count++;
return 0;
}
void vnodeUpdateQuerySlotPos(SCacheInfo *pInfo, SQuery *pQuery) {
SCacheBlock *pCacheBlock;
int step = QUERY_IS_ASC_QUERY(pQuery) ? -1 : 1;
if ((QUERY_IS_ASC_QUERY(pQuery) && (pQuery->slot == pQuery->currentSlot)) ||
(!QUERY_IS_ASC_QUERY(pQuery) && (pQuery->slot == pQuery->firstSlot))) {
pQuery->over = 1;
} else {
pQuery->slot = (pQuery->slot - step + pInfo->maxBlocks) % pInfo->maxBlocks;
pCacheBlock = pInfo->cacheBlocks[pQuery->slot];
pQuery->pos = QUERY_IS_ASC_QUERY(pQuery) ? 0 : pCacheBlock->numOfPoints - 1;
}
}
static FORCE_INLINE TSKEY vnodeGetTSInCacheBlock(SCacheBlock *pCacheBlock, int32_t pos) {
return *(TSKEY *)(pCacheBlock->offset[PRIMARYKEY_TIMESTAMP_COL_INDEX] + pos * TSDB_KEYSIZE);
}
int vnodeQueryFromCache(SMeterObj *pObj, SQuery *pQuery) {
SCacheBlock *pCacheBlock;
int col, step;
char * pRead, *pData;
SCacheInfo * pInfo;
int lastPos = -1;
int startPos, numOfReads, numOfPoints;
pQuery->pointsRead = 0;
if (pQuery->over) return 0;
vnodeFreeFields(pQuery);
pInfo = (SCacheInfo *)pObj->pCache;
if ((pInfo == NULL) || (pInfo->numOfBlocks == 0)) {
pQuery->over = 1;
return 0;
}
if (pQuery->slot < 0 || pQuery->pos < 0) // it means a new query, we need to find the point first
vnodeSearchPointInCache(pObj, pQuery);
if (pQuery->slot < 0 || pQuery->pos < 0) {
pQuery->over = 1;
return 0;
}
step = QUERY_IS_ASC_QUERY(pQuery) ? -1 : 1;
pCacheBlock = pInfo->cacheBlocks[pQuery->slot];
numOfPoints = pCacheBlock->numOfPoints;
int maxReads = QUERY_IS_ASC_QUERY(pQuery) ? numOfPoints - pQuery->pos : pQuery->pos + 1;
if (maxReads <= 0) {
vnodeUpdateQuerySlotPos(pInfo, pQuery);
return 0;
}
TSKEY startkey = vnodeGetTSInCacheBlock(pCacheBlock, 0);
TSKEY endkey = vnodeGetTSInCacheBlock(pCacheBlock, numOfPoints - 1);
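// determine how many rows of this block fall inside the query range: if the block is fully
// covered, take maxReads; otherwise search the timestamp column (using the meter's configured
// search function) for the query end key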
if (QUERY_IS_ASC_QUERY(pQuery)) {
if (endkey < pQuery->ekey) {
numOfReads = maxReads;
} else {
lastPos = (*vnodeSearchKeyFunc[pObj->searchAlgorithm])(
pCacheBlock->offset[PRIMARYKEY_TIMESTAMP_COL_INDEX] + TSDB_KEYSIZE * pQuery->pos, maxReads, pQuery->ekey, 0);
numOfReads = (lastPos >= 0) ? lastPos + 1 : 0;
}
} else {
if (startkey > pQuery->ekey) {
numOfReads = maxReads;
} else {
lastPos = (*vnodeSearchKeyFunc[pObj->searchAlgorithm])(pCacheBlock->offset[PRIMARYKEY_TIMESTAMP_COL_INDEX],
maxReads, pQuery->ekey, 1);
numOfReads = (lastPos >= 0) ? pQuery->pos - lastPos + 1 : 0;
}
}
if (numOfReads > pQuery->pointsToRead - pQuery->pointsRead) {
numOfReads = pQuery->pointsToRead - pQuery->pointsRead;
} else {
if (lastPos >= 0 || numOfReads == 0) {
pQuery->keyIsMet = 1;
pQuery->over = 1;
}
}
startPos = QUERY_IS_ASC_QUERY(pQuery) ? pQuery->pos : pQuery->pos - numOfReads + 1;
int32_t numOfQualifiedPoints = 0;
int32_t numOfActualRead = numOfReads;
if (pQuery->numOfFilterCols == 0) {
for (col = 0; col < pQuery->numOfOutputCols; ++col) {
int16_t colIdx = pQuery->pSelectExpr[col].pBase.colInfo.colIdx;
int16_t bytes = GET_COLUMN_BYTES(pQuery, col);
int16_t type = GET_COLUMN_TYPE(pQuery, col);
pData = pQuery->sdata[col]->data + pQuery->pointsOffset * bytes;
/* this column is absent from current block, fill this block with null value */
if (colIdx < 0 || colIdx >= pObj->numOfColumns ||
pObj->schema[colIdx].colId != pQuery->pSelectExpr[col].pBase.colInfo.colId) { // set null
setNullN(pData, type, bytes, pCacheBlock->numOfPoints);
} else {
pRead = pCacheBlock->offset[colIdx] + startPos * bytes;
if (QUERY_IS_ASC_QUERY(pQuery)) {
memcpy(pData, pRead, numOfReads * bytes);
} else {
for(int32_t j = 0; j < numOfReads; ++j) {
memcpy(pData + bytes * j, pRead + (numOfReads - 1 - j) * bytes, bytes);
}
}
}
}
numOfQualifiedPoints = numOfReads;
} else { // check each data one by one
// set the input column data
for (int32_t k = 0; k < pQuery->numOfFilterCols; ++k) {
int16_t colIdx = pQuery->pFilterInfo[k].info.colIdx;
if (colIdx < 0) { // current data has not specified column
pQuery->pFilterInfo[k].pData = NULL;
} else {
pQuery->pFilterInfo[k].pData = pCacheBlock->offset[colIdx];
}
}
int32_t *ids = calloc(1, numOfReads * sizeof(int32_t));
numOfActualRead = 0;
if (QUERY_IS_ASC_QUERY(pQuery)) {
for (int32_t j = startPos; j < pCacheBlock->numOfPoints; ++j) {
TSKEY key = vnodeGetTSInCacheBlock(pCacheBlock, j);
if (key < startkey || key > endkey) {
dError("vid:%d sid:%d id:%s, timestamp in cache slot is disordered. slot:%d, pos:%d, ts:%" PRId64 ", block "
"range:%" PRId64 "-%" PRId64, pObj->vnode, pObj->sid, pObj->meterId, pQuery->slot, j, key, startkey, endkey);
tfree(ids);
return -TSDB_CODE_FILE_BLOCK_TS_DISORDERED;
}
if (key > pQuery->ekey) {
break;
}
if (!vnodeFilterData(pQuery, &numOfActualRead, j)) {
continue;
}
ids[numOfQualifiedPoints] = j;
if (++numOfQualifiedPoints == numOfReads) { // qualified data are enough
break;
}
}
} else {
startPos = pQuery->pos;
for (int32_t j = startPos; j >= 0; --j) {
TSKEY key = vnodeGetTSInCacheBlock(pCacheBlock, j);
if (key < startkey || key > endkey) {
dError("vid:%d sid:%d id:%s, timestamp in cache slot is disordered. slot:%d, pos:%d, ts:%" PRId64 ", block "
"range:%" PRId64 "-%" PRId64, pObj->vnode, pObj->sid, pObj->meterId, pQuery->slot, j, key, startkey, endkey);
tfree(ids);
return -TSDB_CODE_FILE_BLOCK_TS_DISORDERED;
}
if (key < pQuery->ekey) {
break;
}
if (!vnodeFilterData(pQuery, &numOfActualRead, j)) {
continue;
}
ids[numOfQualifiedPoints] = j;
if (++numOfQualifiedPoints == numOfReads) { // qualified data are enough
break;
}
}
}
// int32_t start = QUERY_IS_ASC_QUERY(pQuery) ? 0 : numOfReads - numOfQualifiedPoints;
for (int32_t j = 0; j < numOfQualifiedPoints; ++j) {
for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
int16_t colIndex = pQuery->pSelectExpr[col].pBase.colInfo.colIdx;
int32_t bytes = pObj->schema[colIndex].bytes;
pData = pQuery->sdata[col]->data + (pQuery->pointsOffset + j) * bytes;
pRead = pCacheBlock->offset[colIndex] + ids[j/* + start*/] * bytes;
memcpy(pData, pRead, bytes);
}
}
tfree(ids);
assert(numOfQualifiedPoints <= numOfReads);
}
pQuery->pointsRead += numOfQualifiedPoints;
pQuery->pos -= numOfActualRead * step;
// update the skey/lastkey
int32_t lastAccessPos = pQuery->pos + step;
pQuery->lastKey = vnodeGetTSInCacheBlock(pCacheBlock, lastAccessPos);
pQuery->skey = pQuery->lastKey - step;
int update = 0; // go to next slot after this round
if ((pQuery->pos < 0 || pQuery->pos >= pObj->pointsPerBlock || numOfReads == 0) && (pQuery->over == 0)) update = 1;
// if the block has been overwritten, its data must be thrown away; this cannot happen during commit
if (pObj != pCacheBlock->pMeterObj || pCacheBlock->blockId > pQuery->blockId) {
update = 1;
pQuery->pointsRead = 0;
dWarn("vid:%d sid:%d id:%s, cache block is overwritten, slot:%d blockId:%d qBlockId:%d",
pObj->vnode, pObj->sid, pObj->meterId, pQuery->slot, pCacheBlock->blockId, pQuery->blockId);
}
if (update) vnodeUpdateQuerySlotPos(pInfo, pQuery);
for (col = 0; col < pQuery->numOfOutputCols; ++col) {
int16_t bytes = GET_COLUMN_BYTES(pQuery, col);
pQuery->sdata[col]->len = bytes * (pQuery->pointsRead + pQuery->pointsOffset);
}
return pQuery->pointsRead;
}
void vnodeSearchPointInCache(SMeterObj *pObj, SQuery *pQuery) {
int numOfBlocks;
int firstSlot, lastSlot, midSlot;
TSKEY keyFirst, keyLast;
SCacheBlock *pBlock;
SCacheInfo * pInfo = (SCacheInfo *)pObj->pCache;
SCachePool * pPool = (SCachePool *)vnodeList[pObj->vnode].pCachePool;
pQuery->slot = -1;
pQuery->pos = -1;
// snapshot these variables first in case they are changed by a concurrent write operation
pthread_mutex_lock(&pPool->vmutex);
numOfBlocks = pInfo->numOfBlocks;
lastSlot = pInfo->currentSlot;
pthread_mutex_unlock(&pPool->vmutex);
if (numOfBlocks <= 0) return;
firstSlot = (lastSlot - numOfBlocks + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
// make sure the query range overlaps the cached key range, otherwise return right away
pBlock = pInfo->cacheBlocks[firstSlot];
keyFirst = vnodeGetTSInCacheBlock(pBlock, 0);
pBlock = pInfo->cacheBlocks[lastSlot];
keyLast = vnodeGetTSInCacheBlock(pBlock, pBlock->numOfPoints - 1);
pQuery->blockId = pBlock->blockId;
pQuery->currentSlot = lastSlot;
pQuery->numOfBlocks = numOfBlocks;
pQuery->firstSlot = firstSlot;
if (!QUERY_IS_ASC_QUERY(pQuery)) {
if (pQuery->skey < keyFirst) return;
if (pQuery->ekey > keyLast) return;
} else {
if (pQuery->skey > keyLast) return;
if (pQuery->ekey < keyFirst) return;
}
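// binary search over the circular slot array: narrow [firstSlot, lastSlot] by comparing
// pQuery->skey against the first/last timestamps of the middle block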
while (1) {
numOfBlocks = (lastSlot - firstSlot + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
if (numOfBlocks == 0) numOfBlocks = pInfo->maxBlocks;
midSlot = (firstSlot + (numOfBlocks >> 1)) % pInfo->maxBlocks;
pBlock = pInfo->cacheBlocks[midSlot];
keyFirst = vnodeGetTSInCacheBlock(pBlock, 0);
keyLast = vnodeGetTSInCacheBlock(pBlock, pBlock->numOfPoints - 1);
if (numOfBlocks == 1) break;
if (pQuery->skey > keyLast) {
if (numOfBlocks == 2) break;
if (!QUERY_IS_ASC_QUERY(pQuery)) {
int nextSlot = (midSlot + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
SCacheBlock *pNextBlock = pInfo->cacheBlocks[nextSlot];
TSKEY nextKeyFirst = vnodeGetTSInCacheBlock(pNextBlock, 0);
if (pQuery->skey < nextKeyFirst) break;
}
firstSlot = (midSlot + 1) % pInfo->maxBlocks;
} else if (pQuery->skey < keyFirst) {
if (QUERY_IS_ASC_QUERY(pQuery)) {
int prevSlot = (midSlot - 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
SCacheBlock *pPrevBlock = pInfo->cacheBlocks[prevSlot];
TSKEY prevKeyLast = vnodeGetTSInCacheBlock(pPrevBlock, pPrevBlock->numOfPoints - 1);
if (pQuery->skey > prevKeyLast) break;
}
lastSlot = (midSlot - 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
} else {
break; // got the slot
}
}
pQuery->slot = midSlot;
if (!QUERY_IS_ASC_QUERY(pQuery)) {
if (pQuery->skey < keyFirst) return;
if (pQuery->ekey > keyLast) {
pQuery->slot = (midSlot + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
return;
}
} else {
if (pQuery->skey > keyLast) {
pQuery->slot = (midSlot + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
return;
}
if (pQuery->ekey < keyFirst) return;
}
// midSlot and pBlock is the search result
pBlock = pInfo->cacheBlocks[midSlot];
pQuery->pos = (*vnodeSearchKeyFunc[pObj->searchAlgorithm])(pBlock->offset[0], pBlock->numOfPoints, pQuery->skey,
pQuery->order.order);
pQuery->key = vnodeGetTSInCacheBlock(pBlock, pQuery->pos);
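// when a limit offset is given and there are no filter columns, skip that many rows from the
// located position, moving across cache blocks if the current block has too few rows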
if (pQuery->limit.offset > 0 && pQuery->numOfFilterCols == 0) {
int maxReads = QUERY_IS_ASC_QUERY(pQuery) ? pBlock->numOfPoints - pQuery->pos : pQuery->pos + 1;
if (pQuery->limit.offset < maxReads) { // start position in current block
if (QUERY_IS_ASC_QUERY(pQuery)) {
pQuery->pos += pQuery->limit.offset;
} else {
pQuery->pos -= pQuery->limit.offset;
}
pQuery->key = vnodeGetTSInCacheBlock(pBlock, pQuery->pos);
pQuery->limit.offset = 0;
} else if (pInfo->numOfBlocks == 1) {
pQuery->pos = -1; // no qualified data
} else {
int step = QUERY_IS_ASC_QUERY(pQuery) ? 1 : -1;
pQuery->limit.offset -= maxReads;
midSlot = (midSlot + step + pInfo->maxBlocks) % pInfo->maxBlocks;
bool hasData = true;
while (pQuery->limit.offset > pInfo->cacheBlocks[midSlot]->numOfPoints) {
pQuery->limit.offset -= pInfo->cacheBlocks[midSlot]->numOfPoints;
if ((QUERY_IS_ASC_QUERY(pQuery) && midSlot == pQuery->currentSlot) ||
(!QUERY_IS_ASC_QUERY(pQuery) && midSlot == pQuery->firstSlot)) { // no qualified data in cache
hasData = false;
break;
}
midSlot = (midSlot + step + pInfo->maxBlocks) % pInfo->maxBlocks;
}
if (hasData) {
if (QUERY_IS_ASC_QUERY(pQuery)) {
pQuery->pos = pQuery->limit.offset;
} else {
pQuery->pos = pInfo->cacheBlocks[midSlot]->numOfPoints - pQuery->limit.offset - 1;
}
pQuery->limit.offset = 0;
pQuery->slot = midSlot;
pQuery->key = vnodeGetTSInCacheBlock(pInfo->cacheBlocks[midSlot], pQuery->pos);
} else {
pQuery->pos = -1; // no qualified data
pBlock = pInfo->cacheBlocks[midSlot];
if (QUERY_IS_ASC_QUERY(pQuery)) {
pQuery->lastKey = vnodeGetTSInCacheBlock(pBlock, pBlock->numOfPoints - 1);
pQuery->skey = pQuery->lastKey + 1;
} else {
pQuery->lastKey = vnodeGetTSInCacheBlock(pBlock, 0);
pQuery->skey = pQuery->lastKey - 1;
}
}
}
}
return;
}
void vnodeSetCommitQuery(SMeterObj *pObj, SQuery *pQuery) {
SCacheInfo *pInfo = (SCacheInfo *)pObj->pCache;
SCachePool *pPool = (SCachePool *)vnodeList[pObj->vnode].pCachePool;
SVnodeObj * pVnode = vnodeList + pObj->vnode;
pQuery->order.order = TSQL_SO_ASC;
pQuery->numOfCols = pObj->numOfColumns;
pQuery->numOfOutputCols = pObj->numOfColumns;
for (int16_t col = 0; col < pObj->numOfColumns; ++col) {
pQuery->colList[col].colIdxInBuf = col;
pQuery->colList[col].data.colId = pObj->schema[col].colId;
pQuery->colList[col].data.bytes = pObj->schema[col].bytes;
pQuery->colList[col].data.type = pObj->schema[col].type;
SColIndexEx *pColIndexEx = &pQuery->pSelectExpr[col].pBase.colInfo;
pColIndexEx->colId = pObj->schema[col].colId;
pColIndexEx->colIdx = col;
pColIndexEx->colIdxInBuf = col;
pColIndexEx->flag = TSDB_COL_NORMAL;
}
pQuery->slot = pInfo->commitSlot;
pQuery->pos = pInfo->commitPoint;
pQuery->over = 0;
pthread_mutex_lock(&pPool->vmutex);
pQuery->currentSlot = pInfo->currentSlot;
pQuery->numOfBlocks = pInfo->numOfBlocks;
pthread_mutex_unlock(&pPool->vmutex);
if (pQuery->numOfBlocks <= 0 || pQuery->firstSlot < 0) {
pQuery->over = 1;
return;
}
pQuery->firstSlot = (pQuery->currentSlot - pQuery->numOfBlocks + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
pQuery->blockId = pInfo->cacheBlocks[pQuery->currentSlot]->blockId;
SCacheBlock *pCacheBlock;
pCacheBlock = pInfo->cacheBlocks[pInfo->commitSlot];
if (pInfo->commitSlot == pQuery->currentSlot && pInfo->commitPoint == pCacheBlock->numOfPoints) {
dTrace("vid:%d sid:%d id:%s, no new data to commit", pObj->vnode, pObj->sid, pObj->meterId);
pQuery->over = 1;
return;
}
if (pQuery->pos == pObj->pointsPerBlock) {
pQuery->slot = (pQuery->slot + 1) % pInfo->maxBlocks;
pQuery->pos = 0;
}
pCacheBlock = pInfo->cacheBlocks[pQuery->slot];
TSKEY firstKey = *((TSKEY *)(pCacheBlock->offset[0] + pQuery->pos * pObj->schema[0].bytes));
if (firstKey < pQuery->skey) {
pQuery->over = 1;
dTrace("vid:%d sid:%d id:%s, first key is small, keyFirst:%" PRId64 " commitFirstKey:%" PRId64 "",
pObj->vnode, pObj->sid, pObj->meterId, firstKey, pQuery->skey);
pthread_mutex_lock(&(pVnode->vmutex));
if (firstKey < pVnode->firstKey) pVnode->firstKey = firstKey;
assert(pVnode->firstKey > 0);
pthread_mutex_unlock(&(pVnode->vmutex));
}
}
int vnodeSyncRetrieveVnodeStatistic(int vnode, int fd) {
SVnodeObj *pVnode = vnodeList + vnode;
if (taosWriteMsg(fd, &(pVnode->vnodeStatistic.pointsWritten), sizeof(int64_t)) < 0) return -1;
if (taosWriteMsg(fd, &(pVnode->vnodeStatistic.totalStorage), sizeof(int64_t)) < 0) return -1;
if (taosWriteMsg(fd, &(pVnode->vnodeStatistic.compStorage), sizeof(int64_t)) < 0) return -1;
return 0;
}
int vnodeSyncRestoreVnodeStatistic(int vnode, int fd) {
SVnodeObj *pVnode = vnodeList + vnode;
if (taosReadMsg(fd, &(pVnode->vnodeStatistic.pointsWritten), sizeof(int64_t)) < 0) return -1;
if (taosReadMsg(fd, &(pVnode->vnodeStatistic.totalStorage), sizeof(int64_t)) < 0) return -1;
if (taosReadMsg(fd, &(pVnode->vnodeStatistic.compStorage), sizeof(int64_t)) < 0) return -1;
return 0;
}
int vnodeSyncRetrieveCache(int vnode, int fd) {
int32_t sid, slot, points;
SVnodeObj * pVnode;
SMeterObj * pObj;
SCacheInfo * pInfo;
SCacheBlock *pBlock;
int blocksSent, pointsSent;
pVnode = vnodeList + vnode;
points = 0;
SVnodeCfg *pCfg = &vnodeList[vnode].cfg;
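// stream format per meter: sid, lastKey, lastKeyOnFile, commitPoint, then for each cache block
// its point count followed by the raw column data; a zero point count ends the meter's blocks,
// and sid == -1 ends the stream (followed by the vnode statistics)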
for (sid = 0; sid < pCfg->maxSessions; ++sid) {
pObj = pVnode->meterList[sid];
if (pObj == NULL) continue;
pInfo = (SCacheInfo *)pObj->pCache;
if (pInfo == NULL) continue;
// write sid first
if (taosWriteMsg(fd, &sid, sizeof(sid)) <= 0) return -1;
if (taosWriteMsg(fd, &(pObj->lastKey), sizeof(pObj->lastKey)) <= 0) return -1;
if (taosWriteMsg(fd, &(pObj->lastKeyOnFile), sizeof(pObj->lastKeyOnFile)) <= 0) return -1;
if (taosWriteMsg(fd, &(pInfo->commitPoint), sizeof(pInfo->commitPoint)) <= 0) return -1;
dTrace("vid:%d sid:%d id:%s, send lastKey:%" PRId64 " lastKeyOnFile:%" PRId64, vnode, sid, pObj->meterId, pObj->lastKey,
pObj->lastKeyOnFile);
slot = pInfo->commitSlot;
blocksSent = 0;
pointsSent = 0;
while (pInfo->numOfBlocks > 0) {
pBlock = pInfo->cacheBlocks[slot];
if (pBlock->numOfPoints == 0) break;
// write the number of points
points = pBlock->numOfPoints;
if (taosWriteMsg(fd, &(points), sizeof(points)) <= 0) return -1;
// write the data
for (int col = 0; col < pObj->numOfColumns; ++col)
if (taosWriteMsg(fd, pBlock->offset[col], pObj->schema[col].bytes * points) <= 0) return -1;
TSKEY lastKey = *((TSKEY *)(pBlock->offset[0] + pObj->schema[0].bytes * (points - 1)));
dTrace("vid:%d sid:%d id:%s, cache block is sent, points:%d lastKey:%" PRId64, vnode, sid, pObj->meterId, points,
lastKey);
blocksSent++;
pointsSent += pBlock->numOfPoints;
if (slot == pInfo->currentSlot) break;
slot = (slot + 1) % pInfo->maxBlocks;
}
// set number of points as zero at the end
points = 0;
if (taosWriteMsg(fd, &(points), sizeof(points)) <= 0) return -1;
}
sid = -1;
if (taosWriteMsg(fd, &sid, sizeof(sid)) < 0) return -1;
if (vnodeSyncRetrieveVnodeStatistic(vnode, fd) < 0) return -1;
return 0;
}
int vnodeSyncRestoreCache(int vnode, int fd) {
int32_t sid, points, i, slot;
SMeterObj * pObj;
SCacheInfo * pInfo;
SCacheBlock *pBlock;
int blocksReceived, pointsReceived;
int numOfBlocks;
SVnodeCfg * pCfg = &vnodeList[vnode].cfg;
SCachePool * pPool = (SCachePool *)vnodeList[vnode].pCachePool;
while (1) {
// read sid first
if (taosReadMsg(fd, &sid, sizeof(sid)) <= 0) return -1;
if (sid >= pCfg->maxSessions) {
dError("vid:%d, restore cache, sid:%d is messed up", vnode, sid);
return -1;
}
if (sid < 0) break;
pObj = vnodeList[vnode].meterList[sid];
if (pObj == NULL) {
dError("vid:%d sid:%d, meter is not there", vnode, sid);
vnodeSendMeterCfgMsg(vnode, sid);
return -1;
}
pInfo = (SCacheInfo *)pObj->pCache;
numOfBlocks = pInfo->numOfBlocks;
pthread_mutex_lock(&pPool->vmutex);
for (i = 0; i < numOfBlocks; ++i) {
slot = (pInfo->currentSlot - i + pInfo->maxBlocks) % pInfo->maxBlocks;
pBlock = pInfo->cacheBlocks[slot];
vnodeFreeCacheBlock(pBlock);
}
pthread_mutex_unlock(&pPool->vmutex);
pInfo->unCommittedBlocks = 0;
if (taosReadMsg(fd, &(pObj->lastKey), sizeof(pObj->lastKey)) <= 0) return -1;
if (taosReadMsg(fd, &(pObj->lastKeyOnFile), sizeof(pObj->lastKeyOnFile)) <= 0) return -1;
if (taosReadMsg(fd, &(pInfo->commitPoint), sizeof(pInfo->commitPoint)) <= 0) return -1;
dTrace("vid:%d sid:%d id:%s, commitPoint:%d lastKeyOnFile:%" PRId64, vnode, sid, pObj->meterId, pInfo->commitPoint,
pObj->lastKeyOnFile);
if (vnodeList[pObj->vnode].lastKey < pObj->lastKey) vnodeList[pObj->vnode].lastKey = pObj->lastKey;
if (vnodeList[pObj->vnode].lastKeyOnFile < pObj->lastKeyOnFile)
vnodeList[pObj->vnode].lastKeyOnFile = pObj->lastKeyOnFile;
pInfo->currentSlot = -1;
pInfo->commitSlot = 0;
memset(pInfo->cacheBlocks, 0, sizeof(SCacheBlock *) * pInfo->maxBlocks);
blocksReceived = 0;
pointsReceived = 0;
pObj->freePoints = pObj->pointsPerBlock * pInfo->maxBlocks;
while (1) {
// read the number of points
points = 0;
if (taosReadMsg(fd, &points, sizeof(points)) <= 0) return -1;
if (points == 0) break;
if (vnodeAllocateCacheBlock(pObj) < 0) return -1;
pBlock = pInfo->cacheBlocks[pInfo->currentSlot];
pBlock->numOfPoints = points;
// read the data
for (int col = 0; col < pObj->numOfColumns; ++col)
if (taosReadMsg(fd, pBlock->offset[col], pObj->schema[col].bytes * points) <= 0) return -1;
atomic_fetch_sub_32(&pObj->freePoints, points);
blocksReceived++;
pointsReceived += points;
pObj->lastKey = *((TSKEY *)(pBlock->offset[0] + pObj->schema[0].bytes * (points - 1)));
if (vnodeList[pObj->vnode].lastKey < pObj->lastKey) vnodeList[pObj->vnode].lastKey = pObj->lastKey;
if (vnodeList[pObj->vnode].firstKey > *(TSKEY *)(pBlock->offset[0]))
vnodeList[pObj->vnode].firstKey = *(TSKEY *)(pBlock->offset[0]);
dTrace("vid:%d sid:%d id:%s, cache block is received, points:%d lastKey:%" PRId64, vnode, sid, pObj->meterId, points,
pObj->lastKey);
}
}
if (vnodeSyncRestoreVnodeStatistic(vnode, fd) < 0) return -1;
return 0;
}
int vnodeIsCacheCommitted(SMeterObj *pObj) {
if (pObj->pCache == NULL) return 1;
SCacheInfo *pInfo = (SCacheInfo *)pObj->pCache;
if (pInfo->currentSlot < 0) return 1;
SCacheBlock *pBlock = pInfo->cacheBlocks[pInfo->currentSlot];
if (pInfo->commitSlot != pInfo->currentSlot) return 0;
if (pInfo->commitPoint != pBlock->numOfPoints) return 0;
return 1;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include "os.h"
#include "taosdef.h"
#include "vnode.h"
#include "vnodeUtil.h"
#include "vnodeStatus.h"
typedef struct {
int sversion;
int sid;
int contLen;
int action:8;
int simpleCheck:24;
} SCommitHead;
int vnodeOpenCommitLog(int vnode, uint64_t firstV) {
SVnodeObj *pVnode = vnodeList + vnode;
char * fileName = pVnode->logFn;
pVnode->logFd = open(fileName, O_RDWR | O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO);
if (pVnode->logFd < 0) {
dError("vid:%d, failed to open file:%s, reason:%s", vnode, fileName, strerror(errno));
return -1;
}
dTrace("vid:%d, logfd:%d, open file:%s success", vnode, pVnode->logFd, fileName);
int code = posix_fallocate64(pVnode->logFd, 0, pVnode->mappingSize);
if (code != 0) {
// posix_fallocate64 returns the error code instead of setting errno
dError("vid:%d, logfd:%d, failed to alloc file size:%" PRId64 ", reason:%s", vnode, pVnode->logFd, pVnode->mappingSize, strerror(code));
goto _err_log_open;
}
struct stat statbuf;
stat(fileName, &statbuf);
int64_t length = statbuf.st_size;
if (length != pVnode->mappingSize) {
dError("vid:%d, logfd:%d, alloc file size:%" PRId64 " not equal to mapping size:%" PRId64, vnode, pVnode->logFd, length,
pVnode->mappingSize);
goto _err_log_open;
}
pVnode->pMem = mmap(0, pVnode->mappingSize, PROT_WRITE | PROT_READ, MAP_SHARED, pVnode->logFd, 0);
if (pVnode->pMem == MAP_FAILED) {
dError("vid:%d, logfd:%d, failed to map file, reason:%s", vnode, pVnode->logFd, strerror(errno));
goto _err_log_open;
}
pVnode->pWrite = pVnode->pMem;
memcpy(pVnode->pWrite, &(firstV), sizeof(firstV));
pVnode->pWrite += sizeof(firstV);
return pVnode->logFd;
_err_log_open:
close(pVnode->logFd);
remove(fileName);
pVnode->logFd = -1;
return -1;
}
int vnodeRenewCommitLog(int vnode) {
SVnodeObj *pVnode = vnodeList + vnode;
char * fileName = pVnode->logFn;
char * oldName = pVnode->logOFn;
pthread_mutex_lock(&(pVnode->logMutex));
if (FD_VALID(pVnode->logFd)) {
munmap(pVnode->pMem, pVnode->mappingSize);
close(pVnode->logFd);
rename(fileName, oldName);
}
if (pVnode->cfg.commitLog) vnodeOpenCommitLog(vnode, vnodeList[vnode].version);
pthread_mutex_unlock(&(pVnode->logMutex));
return pVnode->logFd;
}
void vnodeRemoveCommitLog(int vnode) { remove(vnodeList[vnode].logOFn); }
// returns the number of bytes consumed from the log, or -1 on error (signed so callers can test for failure)
int64_t vnodeRestoreDataFromLog(int vnode, char *fileName, uint64_t *firstV) {
int fd, ret;
char * cont = NULL;
size_t totalLen = 0;
int actions = 0;
SVnodeObj *pVnode = vnodeList + vnode;
if (pVnode->meterList == NULL) {
dError("vid:%d, vnode is not initialized!!!", vnode);
return 0;
}
struct stat fstat;
if (stat(fileName, &fstat) < 0) {
dTrace("vid:%d, no log file:%s", vnode, fileName);
return 0;
}
dTrace("vid:%d, uncommitted data in file:%s, restore them ...", vnode, fileName);
fd = open(fileName, O_RDWR);
if (fd < 0) {
dError("vid:%d, failed to open:%s, reason:%s", vnode, fileName, strerror(errno));
goto _error;
}
ret = read(fd, firstV, sizeof(pVnode->version));
if (ret <= 0) {
dError("vid:%d, failed to read version", vnode);
goto _error;
}
pVnode->version = *firstV;
int32_t bufLen = TSDB_PAYLOAD_SIZE;
cont = calloc(1, bufLen);
if (cont == NULL) {
dError("vid:%d, out of memory", vnode);
goto _error;
}
TSKEY now = taosGetTimestamp(pVnode->cfg.precision);
SCommitHead head;
int simpleCheck = 0;
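// replay the log record by record: a header whose simpleCheck does not match, a missing or
// mismatched trailing checksum, or a zero-length payload marks the end of the valid log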
while (1) {
ret = read(fd, &head, sizeof(head));
if (ret < 0) goto _error;
if (ret == 0) break;
if (((head.sversion+head.sid+head.contLen+head.action) & 0xFFFFFF) != head.simpleCheck) break;
simpleCheck = head.simpleCheck;
// head.contLen validation is removed
if (head.sid >= pVnode->cfg.maxSessions || head.sid < 0 || head.action >= TSDB_ACTION_MAX) {
dError("vid, invalid commit head, sid:%d contLen:%d action:%d", head.sid, head.contLen, head.action);
} else {
if (head.contLen > 0) {
if (bufLen < head.contLen+sizeof(simpleCheck)) { // pre-allocated buffer is not enough
char *tmp = realloc(cont, head.contLen+sizeof(simpleCheck));
if (tmp == NULL) goto _error; // out of memory; keep the old buffer so tfree(cont) stays valid
cont = tmp;
bufLen = head.contLen+sizeof(simpleCheck);
}
if (read(fd, cont, head.contLen+sizeof(simpleCheck)) < 0) goto _error;
if (*(int *)(cont+head.contLen) != simpleCheck) break;
SMeterObj *pObj = pVnode->meterList[head.sid];
if (pObj == NULL) {
dError("vid:%d, sid:%d not exists, ignore data in commit log, contLen:%d action:%d",
vnode, head.sid, head.contLen, head.action);
continue;
}
if (vnodeIsMeterState(pObj, TSDB_METER_STATE_DROPPING)) {
dWarn("vid:%d sid:%d id:%s, meter is dropped, ignore data in commit log, contLen:%d action:%d",
vnode, head.sid, head.contLen, head.action);
continue;
}
int32_t numOfPoints = 0;
(*vnodeProcessAction[head.action])(pObj, cont, head.contLen, TSDB_DATA_SOURCE_LOG, NULL, head.sversion,
&numOfPoints, now);
actions++;
} else {
break;
}
}
totalLen += sizeof(head) + head.contLen + sizeof(simpleCheck);
}
tclose(fd);
tfree(cont);
dTrace("vid:%d, %d pieces of uncommitted data are restored", vnode, actions);
return totalLen;
_error:
tclose(fd);
tfree(cont);
dError("vid:%d, failed to restore %s, remove this node...", vnode, fileName);
// rename to error file for future process
char *f = NULL;
taosFileRename(fileName, "error", '/', &f);
free(f);
return -1;
}
int vnodeInitCommit(int vnode) {
int64_t size = 0;
uint64_t firstV = 0;
SVnodeObj *pVnode = vnodeList + vnode;
pthread_mutex_init(&(pVnode->logMutex), NULL);
sprintf(pVnode->logFn, "%s/vnode%d/db/submit%d.log", tsDirectory, vnode, vnode);
sprintf(pVnode->logOFn, "%s/vnode%d/db/submit%d.olog", tsDirectory, vnode, vnode);
pVnode->mappingSize = ((int64_t)pVnode->cfg.cacheBlockSize) * pVnode->cfg.cacheNumOfBlocks.totalBlocks * 1.5;
pVnode->mappingThreshold = pVnode->mappingSize * 0.7;
// restore from .olog file and commit to file
size = vnodeRestoreDataFromLog(vnode, pVnode->logOFn, &firstV);
if (size < 0) return -1;
if (size > 0) {
if (pVnode->commitInProcess == 0) vnodeCommitToFile(pVnode);
remove(pVnode->logOFn);
}
// restore from .log file to cache
size = vnodeRestoreDataFromLog(vnode, pVnode->logFn, &firstV);
if (size < 0) return -1;
if (pVnode->cfg.commitLog == 0) return 0;
if (size == 0) firstV = pVnode->version;
if (vnodeOpenCommitLog(vnode, firstV) < 0) {
dError("vid:%d, commit log init failed", vnode);
return -1;
}
pVnode->pWrite += size;
dPrint("vid:%d, commit log is initialized", vnode);
return 0;
}
void vnodeCleanUpCommit(int vnode) {
SVnodeObj *pVnode = vnodeList + vnode;
if (FD_VALID(pVnode->logFd)) close(pVnode->logFd);
if (pVnode->cfg.commitLog && (pVnode->logFd > 0 && remove(pVnode->logFn) < 0)) {
dError("vid:%d, failed to remove:%s", vnode, pVnode->logFn);
taosLogError("vid:%d, failed to remove:%s", vnode, pVnode->logFn);
}
pthread_mutex_destroy(&(pVnode->logMutex));
}
int vnodeWriteToCommitLog(SMeterObj *pObj, char action, char *cont, int contLen, int sversion) {
SVnodeObj *pVnode = vnodeList + pObj->vnode;
if (pVnode->pWrite == NULL) return 0;
SCommitHead head;
head.sid = pObj->sid;
head.action = action;
head.sversion = pObj->sversion;
head.contLen = contLen;
head.simpleCheck = (head.sversion+head.sid+head.contLen+head.action) & 0xFFFFFF;
int simpleCheck = head.simpleCheck;
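// append the record (header, payload, trailing simpleCheck) to the memory-mapped log; if the
// mapped region cannot hold it, trigger a commit and return TSDB_CODE_ACTION_IN_PROGRESS,
// presumably so the caller retries once the log has been renewed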
pthread_mutex_lock(&(pVnode->logMutex));
// keep 100 bytes of slack in the mapped region
if (pVnode->mappingSize - (pVnode->pWrite - pVnode->pMem) < contLen + sizeof(SCommitHead) + sizeof(simpleCheck) + 100) {
pthread_mutex_unlock(&(pVnode->logMutex));
dTrace("vid:%d, mem mapping space is not enough, wait for commit", pObj->vnode);
vnodeProcessCommitTimer(pVnode, NULL);
return TSDB_CODE_ACTION_IN_PROGRESS;
}
char *pWrite = pVnode->pWrite;
pVnode->pWrite += sizeof(head) + contLen + sizeof(simpleCheck);
memcpy(pWrite, (char *)&head, sizeof(head));
memcpy(pWrite + sizeof(head), cont, contLen);
memcpy(pWrite + sizeof(head) + contLen, &simpleCheck, sizeof(simpleCheck));
pthread_mutex_unlock(&(pVnode->logMutex));
if (pVnode->pWrite - pVnode->pMem > pVnode->mappingThreshold) {
dTrace("vid:%d, mem mapping is close to limit, commit", pObj->vnode);
vnodeProcessCommitTimer(pVnode, NULL);
}
dTrace("vid:%d sid:%d, data is written to commit log", pObj->vnode, pObj->sid);
return 0;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "tscompression.h"
#include "tutil.h"
#include "vnode.h"
#include "vnodeFile.h"
#include "vnodeUtil.h"
#include "vnodeStatus.h"
#define FILE_QUERY_NEW_BLOCK -5 // a special negative number
const int16_t vnodeFileVersion = 0;
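// compression/decompression routines indexed by column data type; index 0 is unused and the
// rest apparently follow the TSDB data type order (bool, tinyint, ..., timestamp, nchar)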
int (*pCompFunc[])(const char *const input, int inputSize, const int elements, char *const output, int outputSize,
char algorithm, char *const buffer, int bufferSize) = {NULL,
tsCompressBool,
tsCompressTinyint,
tsCompressSmallint,
tsCompressInt,
tsCompressBigint,
tsCompressFloat,
tsCompressDouble,
tsCompressString,
tsCompressTimestamp,
tsCompressString};
int (*pDecompFunc[])(const char *const input, int compressedSize, const int elements, char *const output,
int outputSize, char algorithm, char *const buffer, int bufferSize) = {NULL,
tsDecompressBool,
tsDecompressTinyint,
tsDecompressSmallint,
tsDecompressInt,
tsDecompressBigint,
tsDecompressFloat,
tsDecompressDouble,
tsDecompressString,
tsDecompressTimestamp,
tsDecompressString};
int vnodeUpdateFileMagic(int vnode, int fileId);
int vnodeRecoverCompHeader(int vnode, int fileId);
int vnodeRecoverHeadFile(int vnode, int fileId);
int vnodeRecoverDataFile(int vnode, int fileId);
int vnodeForwardStartPosition(SQuery *pQuery, SCompBlock *pBlock, int32_t slotIdx, SVnodeObj *pVnode, SMeterObj *pObj);
int vnodeCheckNewHeaderFile(int fd, SVnodeObj *pVnode);
char* vnodeGetDataDir(int vnode, int fileId);
char* vnodeGetDiskFromHeadFile(char *headName);
void vnodeAdustVnodeFile(SVnodeObj *pVnode);
int vnodeSyncRetrieveFile(int vnode, int fd, uint32_t peerFid, uint64_t *fmagic);
int vnodeSyncRestoreFile(int vnode, int sfd);
void vnodeAdjustFileTier(int vnode);
void vnodeGetHeadDataLname(char *headName, char *dataName, char *lastName, int vnode, int fileId) {
if (headName != NULL) sprintf(headName, "%s/vnode%d/db/v%df%d.head", tsDirectory, vnode, vnode, fileId);
if (dataName != NULL) sprintf(dataName, "%s/vnode%d/db/v%df%d.data", tsDirectory, vnode, vnode, fileId);
if (lastName != NULL) sprintf(lastName, "%s/vnode%d/db/v%df%d.last", tsDirectory, vnode, vnode, fileId);
}
void vnodeGetHeadDataDname(char *dHeadName, char *dDataName, char *dLastName, int vnode, int fileId, char *path) {
if (dHeadName != NULL) sprintf(dHeadName, "%s/data/vnode%d/v%df%d.head0", path, vnode, vnode, fileId);
if (dDataName != NULL) sprintf(dDataName, "%s/data/vnode%d/v%df%d.data", path, vnode, vnode, fileId);
if (dLastName != NULL) sprintf(dLastName, "%s/data/vnode%d/v%df%d.last0", path, vnode, vnode, fileId);
}
void vnodeGetDnameFromLname(char *lhead, char *ldata, char *llast, char *dhead, char *ddata, char *dlast) {
if (lhead != NULL) {
assert(dhead != NULL);
readlink(lhead, dhead, TSDB_FILENAME_LEN);
}
if (ldata != NULL) {
assert(ddata != NULL);
readlink(ldata, ddata, TSDB_FILENAME_LEN);
}
if (llast != NULL) {
assert(dlast != NULL);
readlink(llast, dlast, TSDB_FILENAME_LEN);
}
}
void vnodeGetHeadTname(char *nHeadName, char *nLastName, int vnode, int fileId) {
if (nHeadName != NULL) sprintf(nHeadName, "%s/vnode%d/db/v%df%d.t", tsDirectory, vnode, vnode, fileId);
if (nLastName != NULL) sprintf(nLastName, "%s/vnode%d/db/v%df%d.l", tsDirectory, vnode, vnode, fileId);
}
void vnodeCreateDataDirIfNeeded(int vnode, char *path) {
char directory[TSDB_FILENAME_LEN] = "\0";
sprintf(directory, "%s/data/vnode%d", path, vnode);
if (access(directory, F_OK) != 0) mkdir(directory, 0755);
}
int vnodeCreateHeadDataFile(int vnode, int fileId, char *headName, char *dataName, char *lastName) {
char dHeadName[TSDB_FILENAME_LEN];
char dDataName[TSDB_FILENAME_LEN];
char dLastName[TSDB_FILENAME_LEN];
char *path = vnodeGetDataDir(vnode, fileId);
if (path == NULL) {
dError("vid:%d, fileId:%d, failed to get dataDir", vnode, fileId);
return -1;
}
vnodeCreateDataDirIfNeeded(vnode, path);
vnodeGetHeadDataLname(headName, dataName, lastName, vnode, fileId);
vnodeGetHeadDataDname(dHeadName, dDataName, dLastName, vnode, fileId, path);
if (symlink(dHeadName, headName) != 0) return -1;
if (symlink(dDataName, dataName) != 0) return -1;
if (symlink(dLastName, lastName) != 0) return -1;
dPrint("vid:%d, fileId:%d, empty header file:%s file:%s lastFile:%s on disk:%s is created ",
vnode, fileId, headName, dataName, lastName, path);
return 0;
}
int vnodeCreateEmptyCompFile(int vnode, int fileId) {
char headName[TSDB_FILENAME_LEN];
char dataName[TSDB_FILENAME_LEN];
char lastName[TSDB_FILENAME_LEN];
int tfd;
char *temp;
if (vnodeCreateHeadDataFile(vnode, fileId, headName, dataName, lastName) < 0) {
dError("failed to create head data file, vnode: %d, fileId: %d", vnode, fileId);
return -1;
}
tfd = open(headName, O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO);
if (tfd < 0) {
dError("failed to create head file:%s, reason:%s", headName, strerror(errno));
return -1;
}
vnodeCreateFileHeaderFd(tfd);
int size = sizeof(SCompHeader) * vnodeList[vnode].cfg.maxSessions + sizeof(TSCKSUM);
temp = calloc(1, size);
if (temp == NULL) { // out of memory
dError("vid:%d, malloc failed", vnode); close(tfd); return -1;
}
taosCalcChecksumAppend(0, (uint8_t *)temp, size);
lseek(tfd, TSDB_FILE_HEADER_LEN, SEEK_SET);
twrite(tfd, temp, size);
free(temp);
close(tfd);
tfd = open(dataName, O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO);
if (tfd < 0) {
dError("failed to create data file:%s, reason:%s", dataName, strerror(errno));
return -1;
}
vnodeCreateFileHeaderFd(tfd);
close(tfd);
tfd = open(lastName, O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO);
if (tfd < 0) {
dError("failed to create last file:%s, reason:%s", lastName, strerror(errno));
return -1;
}
vnodeCreateFileHeaderFd(tfd);
close(tfd);
return 0;
}
int vnodeCreateNeccessaryFiles(SVnodeObj *pVnode) {
int numOfFiles = 0, fileId, filesAdded = 0;
int vnode = pVnode->vnode;
SVnodeCfg *pCfg = &(pVnode->cfg);
if (pVnode->lastKeyOnFile == 0) {
if (pCfg->daysPerFile == 0) pCfg->daysPerFile = 10;
pVnode->fileId = pVnode->firstKey / tsMsPerDay[(uint8_t)pVnode->cfg.precision] / pCfg->daysPerFile;
pVnode->lastKeyOnFile = (int64_t)(pVnode->fileId + 1) * pCfg->daysPerFile * tsMsPerDay[(uint8_t)pVnode->cfg.precision] - 1;
pVnode->numOfFiles = 1;
if (vnodeCreateEmptyCompFile(vnode, pVnode->fileId) < 0) return -1;
}
numOfFiles = (pVnode->lastKeyOnFile - pVnode->commitFirstKey) / tsMsPerDay[(uint8_t)pVnode->cfg.precision] / pCfg->daysPerFile;
if (pVnode->commitFirstKey > pVnode->lastKeyOnFile) numOfFiles = -1;
dTrace("vid:%d, commitFirstKey:%" PRId64 " lastKeyOnFile:%" PRId64 " numOfFiles:%d fileId:%d vnodeNumOfFiles:%d", pVnode->vnode,
pVnode->commitFirstKey, pVnode->lastKeyOnFile, numOfFiles, pVnode->fileId, pVnode->numOfFiles);
if (numOfFiles >= pVnode->numOfFiles) {
// create empty header files backward
filesAdded = numOfFiles - pVnode->numOfFiles + 1;
assert(filesAdded <= pVnode->maxFiles + 2);
for (int i = 0; i < filesAdded; ++i) {
fileId = pVnode->fileId - pVnode->numOfFiles - i;
if (vnodeCreateEmptyCompFile(vnode, fileId) < 0)
#ifdef CLUSTER
return vnodeRecoverFromPeer(pVnode, fileId);
#else
return -1;
#endif
}
} else if (numOfFiles < 0) {
// create empty header files forward
pVnode->fileId++;
if (vnodeCreateEmptyCompFile(vnode, pVnode->fileId) < 0)
#ifdef CLUSTER
return vnodeRecoverFromPeer(pVnode, pVnode->fileId);
#else
return -1;
#endif
pVnode->lastKeyOnFile += (int64_t)tsMsPerDay[(uint8_t)pVnode->cfg.precision] * pCfg->daysPerFile;
filesAdded = 1;
numOfFiles = 0; // reset so the newly created file becomes the commit target below
}
fileId = pVnode->fileId - numOfFiles;
pVnode->commitLastKey =
pVnode->lastKeyOnFile - (int64_t)numOfFiles * tsMsPerDay[(uint8_t)pVnode->cfg.precision] * pCfg->daysPerFile;
pVnode->commitFirstKey = pVnode->commitLastKey - (int64_t)tsMsPerDay[(uint8_t)pVnode->cfg.precision] * pCfg->daysPerFile + 1;
pVnode->commitFileId = fileId;
pVnode->numOfFiles = pVnode->numOfFiles + filesAdded;
return 0;
}
int vnodeOpenCommitFiles(SVnodeObj *pVnode, int noTempLast) {
char name[TSDB_FILENAME_LEN];
char dHeadName[TSDB_FILENAME_LEN] = "\0";
char dLastName[TSDB_FILENAME_LEN] = "\0";
int len = 0;
struct stat filestat;
int vnode = pVnode->vnode;
int fileId;
if (vnodeCreateNeccessaryFiles(pVnode) < 0) return -1;
fileId = pVnode->commitFileId;
dTrace("vid:%d, commit fileId:%d, commitLastKey:%" PRId64 ", vnodeLastKey:%" PRId64 ", lastKeyOnFile:%" PRId64 " numOfFiles:%d",
vnode, fileId, pVnode->commitLastKey, pVnode->lastKey, pVnode->lastKeyOnFile, pVnode->numOfFiles);
int minSize = sizeof(SCompHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM) + TSDB_FILE_HEADER_LEN;
vnodeGetHeadDataLname(pVnode->cfn, name, pVnode->lfn, vnode, fileId);
readlink(pVnode->cfn, dHeadName, TSDB_FILENAME_LEN);
readlink(pVnode->lfn, dLastName, TSDB_FILENAME_LEN);
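// the head/last symlinks point to real files whose names end in '0' or '1'; each commit writes
// the new version to the file with the other suffix via a temporary link, which is renamed over
// the official link in vnodeCloseCommitFiles so the old version stays intact until the commit succeeds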
len = strlen(dHeadName);
if (dHeadName[len - 1] == 'd') {
dHeadName[len] = '0';
dHeadName[len + 1] = '\0';
} else {
dHeadName[len - 1] = '0' + (dHeadName[len - 1] + 1 - '0') % 2;
}
len = strlen(dLastName);
if (dLastName[len - 1] == 't') {
dLastName[len] = '0';
dLastName[len + 1] = '\0';
} else {
dLastName[len - 1] = '0' + (dLastName[len - 1] + 1 - '0') % 2;
}
vnodeGetHeadTname(pVnode->nfn, pVnode->tfn, vnode, fileId);
symlink(dHeadName, pVnode->nfn);
if (!noTempLast) symlink(dLastName, pVnode->tfn);
// open head file
pVnode->hfd = open(pVnode->cfn, O_RDONLY);
if (pVnode->hfd < 0) {
dError("vid:%d, failed to open head file:%s, reason:%s", vnode, pVnode->cfn, strerror(errno));
taosLogError("vid:%d, failed to open head file:%s, reason:%s", vnode, pVnode->cfn, strerror(errno));
vnodeRecoverFromPeer(pVnode, fileId);
goto _error;
}
// verify head file, check size
fstat(pVnode->hfd, &filestat);
if (filestat.st_size < minSize) {
dError("vid:%d, head file:%s corrupted", vnode, pVnode->cfn);
taosLogError("vid:%d, head file:%s corrupted", vnode, pVnode->cfn);
vnodeRecoverFromPeer(pVnode, fileId);
goto _error;
}
// open a new header file
pVnode->nfd = open(pVnode->nfn, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO);
if (pVnode->nfd < 0) {
dError("vid:%d, failed to open new head file:%s, reason:%s", vnode, pVnode->nfn, strerror(errno));
taosLogError("vid:%d, failed to open new head file:%s, reason:%s", vnode, pVnode->nfn, strerror(errno));
goto _error;
}
vnodeCreateFileHeaderFd(pVnode->nfd);
// open existing data file
pVnode->dfd = open(name, O_WRONLY | O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO);
if (pVnode->dfd < 0) {
dError("vid:%d, failed to open data file:%s, reason:%s", vnode, name, strerror(errno));
taosLogError("vid:%d, failed to open data file:%s, reason:%s", vnode, name, strerror(errno));
vnodeRecoverFromPeer(pVnode, fileId);
goto _error;
}
// verify data file, check size
fstat(pVnode->dfd, &filestat);
if (filestat.st_size < TSDB_FILE_HEADER_LEN) {
dError("vid:%d, data file:%s corrupted", vnode, name);
taosLogError("vid:%d, data file:%s corrupted", vnode, name);
vnodeRecoverFromPeer(pVnode, fileId);
goto _error;
} else {
dPrint("vid:%d, data file:%s is opened to write", vnode, name);
}
// open last file
pVnode->lfd = open(pVnode->lfn, O_RDWR);
if (pVnode->lfd < 0) {
dError("vid:%d, failed to open last file:%s, reason:%s", vnode, pVnode->lfn, strerror(errno));
taosLogError("vid:%d, failed to open last file:%s, reason:%s", vnode, pVnode->lfn, strerror(errno));
vnodeRecoverFromPeer(pVnode, fileId);
goto _error;
}
// verify last file, check size
fstat(pVnode->lfd, &filestat);
if (filestat.st_size < TSDB_FILE_HEADER_LEN) {
dError("vid:%d, last file:%s corrupted", vnode, pVnode->lfn);
taosLogError("vid:%d, last file:%s corrupted", vnode, pVnode->lfn);
vnodeRecoverFromPeer(pVnode, fileId);
goto _error;
}
// open a new last file
if (noTempLast) {
pVnode->tfd = -1; // do not open temporary last file
} else {
pVnode->tfd = open(pVnode->tfn, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO);
if (pVnode->tfd < 0) {
dError("vid:%d, failed to open new last file:%s, reason:%s", vnode, pVnode->tfn, strerror(errno));
taosLogError("vid:%d, failed to open new last file:%s, reason:%s", vnode, pVnode->tfn, strerror(errno));
goto _error;
}
vnodeCreateFileHeaderFd(pVnode->tfd);
pVnode->lfSize = lseek(pVnode->tfd, 0, SEEK_END);
}
int size = sizeof(SCompHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM);
char *temp = malloc(size);
if (NULL == temp) {
dError("vid:%d, malloc failed", vnode);
taosLogError("vid:%d, malloc failed", vnode);
//vnodeRecoverFromPeer(pVnode, fileId);
goto _error;
}
memset(temp, 0, size);
taosCalcChecksumAppend(0, (uint8_t *)temp, size);
twrite(pVnode->nfd, temp, size);
free(temp);
pVnode->dfSize = lseek(pVnode->dfd, 0, SEEK_END);
return 0;
_error:
if (pVnode->dfd > 0) close(pVnode->dfd);
pVnode->dfd = 0;
if (pVnode->hfd > 0) close(pVnode->hfd);
pVnode->hfd = 0;
if (pVnode->nfd > 0) close(pVnode->nfd);
pVnode->nfd = 0;
if (pVnode->lfd > 0) close(pVnode->lfd);
pVnode->lfd = 0;
if (pVnode->tfd > 0) close(pVnode->tfd);
pVnode->tfd = 0;
return -1;
}
void vnodeRemoveFile(int vnode, int fileId) {
char headName[TSDB_FILENAME_LEN] = "\0";
char dataName[TSDB_FILENAME_LEN] = "\0";
char lastName[TSDB_FILENAME_LEN] = "\0";
char dHeadName[TSDB_FILENAME_LEN] = "\0";
char dDataName[TSDB_FILENAME_LEN] = "\0";
char dLastName[TSDB_FILENAME_LEN] = "\0";
SVnodeObj * pVnode = NULL;
SVnodeHeadInfo headInfo;
pVnode = vnodeList + vnode;
vnodeGetHeadDataLname(headName, dataName, lastName, vnode, fileId);
char *path = vnodeGetDiskFromHeadFile(headName);
if (path == NULL) {
return ;
}
vnodeGetDnameFromLname(headName, dataName, lastName, dHeadName, dDataName, dLastName);
int fd = open(headName, O_RDWR | O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO);
if (fd > 0) {
vnodeGetHeadFileHeaderInfo(fd, &headInfo);
atomic_fetch_add_64(&(pVnode->vnodeStatistic.totalStorage), -headInfo.totalStorage);
close(fd);
}
remove(headName);
remove(dataName);
remove(lastName);
remove(dHeadName);
remove(dDataName);
remove(dLastName);
dPrint("vid:%d fileId:%d on disk: %s is removed, numOfFiles:%d maxFiles:%d", vnode, fileId, path,
pVnode->numOfFiles, pVnode->maxFiles);
}
void vnodeCloseCommitFiles(SVnodeObj *pVnode) {
char dpath[TSDB_FILENAME_LEN] = "\0";
int ret;
// check whether the new header file is correct
if (tsCheckHeaderFile != 0) {
assert(vnodeCheckNewHeaderFile(pVnode->nfd, pVnode) == 0);
}
close(pVnode->nfd);
pVnode->nfd = 0;
close(pVnode->hfd);
pVnode->hfd = 0;
close(pVnode->dfd);
pVnode->dfd = 0;
close(pVnode->lfd);
pVnode->lfd = 0;
if (pVnode->tfd > 0) close(pVnode->tfd);
pthread_mutex_lock(&(pVnode->vmutex));
readlink(pVnode->cfn, dpath, TSDB_FILENAME_LEN);
ret = rename(pVnode->nfn, pVnode->cfn);
if (ret < 0) {
dError("vid:%d, failed to rename:%s, reason:%s", pVnode->vnode, pVnode->nfn, strerror(errno));
}
remove(dpath);
if (pVnode->tfd > 0) {
memset(dpath, 0, TSDB_FILENAME_LEN);
readlink(pVnode->lfn, dpath, TSDB_FILENAME_LEN);
ret = rename(pVnode->tfn, pVnode->lfn);
if (ret < 0) {
dError("vid:%d, failed to rename:%s, reason:%s", pVnode->vnode, pVnode->tfn, strerror(errno));
}
remove(dpath);
}
pthread_mutex_unlock(&(pVnode->vmutex));
pVnode->tfd = 0;
dTrace("vid:%d, %s and %s is saved", pVnode->vnode, pVnode->cfn, pVnode->lfn);
vnodeAdustVnodeFile(pVnode);
vnodeSaveAllMeterObjToFile(pVnode->vnode);
return;
}
void vnodeBroadcastStatusToUnsyncedPeer(SVnodeObj *pVnode);
void *vnodeCommitMultiToFile(SVnodeObj *pVnode, int ssid, int esid) {
int vnode = pVnode->vnode;
SData * data[TSDB_MAX_COLUMNS], *cdata[TSDB_MAX_COLUMNS]; // first 4 bytes are length
char * buffer = NULL, *dmem = NULL, *cmem = NULL, *hmem = NULL, *tmem = NULL;
SMeterObj * pObj = NULL;
SCompInfo compInfo = {0};
SCompHeader * pHeader;
SMeterInfo * meterInfo = NULL, *pTable = NULL;
SQuery query;
SColumnInfoEx colList[TSDB_MAX_COLUMNS] = {0};
SSqlFunctionExpr pExprs[TSDB_MAX_COLUMNS] = {0};
int commitAgain;
int headLen, sid, col;
int64_t pointsRead;
int64_t pointsReadLast;
SCompBlock * pCompBlock = NULL;
SVnodeCfg * pCfg = &pVnode->cfg;
TSCKSUM chksum;
SVnodeHeadInfo headInfo;
uint8_t * pOldCompBlocks;
dPrint("vid:%d, committing to file, firstKey:%" PRId64 " lastKey:%" PRId64 " ssid:%d esid:%d", vnode, pVnode->firstKey,
pVnode->lastKey, ssid, esid);
if (pVnode->lastKey == 0) goto _over;
vnodeCloseAllSyncFds(vnode);
vnodeRenewCommitLog(vnode);
// find the max/min bytes per point among the meters to size the commit buffers
int32_t maxBytesPerPoint = 0;
int32_t minBytesPerPoint = INT32_MAX;
for (sid = ssid; sid <= esid; ++sid) {
pObj = (SMeterObj *)(pVnode->meterList[sid]);
if ((pObj == NULL) || (pObj->pCache == NULL)) continue;
if (maxBytesPerPoint < pObj->bytesPerPoint) {
maxBytesPerPoint = pObj->bytesPerPoint;
}
if (minBytesPerPoint > pObj->bytesPerPoint) {
minBytesPerPoint = pObj->bytesPerPoint;
}
}
// buffer to hold the temp head
int tcachblocks = pCfg->cacheBlockSize / (minBytesPerPoint * pCfg->rowsInFileBlock);
int hmsize =
(pCfg->cacheNumOfBlocks.totalBlocks * (MAX(tcachblocks, 1) + 1) + pCfg->maxSessions) * sizeof(SCompBlock);
// buffer to hold the uncompressed data
int dmsize =
maxBytesPerPoint * pCfg->rowsInFileBlock + (sizeof(SData) + EXTRA_BYTES + sizeof(TSCKSUM)) * TSDB_MAX_COLUMNS;
// buffer to hold the compressed data
int cmsize =
maxBytesPerPoint * pCfg->rowsInFileBlock + (sizeof(SData) + EXTRA_BYTES + sizeof(TSCKSUM)) * TSDB_MAX_COLUMNS;
// buffer to hold compHeader
int tmsize = sizeof(SCompHeader) * pCfg->maxSessions + sizeof(TSCKSUM);
// buffer to hold meterInfo
int misize = pVnode->cfg.maxSessions * sizeof(SMeterInfo);
int totalSize = hmsize + dmsize + cmsize + misize + tmsize;
buffer = malloc(totalSize);
if (buffer == NULL) {
dError("no enough memory for committing buffer");
return NULL;
}
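// carve the single allocation into the regions sized above: temp head (hmem), uncompressed data
// (dmem), compressed data (cmem), comp header offsets (tmem) and per-meter bookkeeping (meterInfo)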
hmem = buffer;
dmem = hmem + hmsize;
cmem = dmem + dmsize;
tmem = cmem + cmsize;
meterInfo = (SMeterInfo *)(tmem + tmsize);
pthread_mutex_lock(&(pVnode->vmutex));
pVnode->commitFirstKey = pVnode->firstKey;
pVnode->firstKey = pVnode->lastKey + 1;
pthread_mutex_unlock(&(pVnode->vmutex));
_again:
pVnode->commitInProcess = 1;
commitAgain = 0;
memset(hmem, 0, totalSize);
memset(&query, 0, sizeof(query));
if (vnodeOpenCommitFiles(pVnode, ssid) < 0) goto _over;
dTrace("vid:%d, start to commit, commitFirstKey:%" PRId64 " commitLastKey:%" PRId64, vnode, pVnode->commitFirstKey,
pVnode->commitLastKey);
headLen = 0;
vnodeGetHeadFileHeaderInfo(pVnode->hfd, &headInfo);
int maxOldBlocks = 1;
// read head info
if (pVnode->hfd) {
lseek(pVnode->hfd, TSDB_FILE_HEADER_LEN, SEEK_SET);
if (read(pVnode->hfd, tmem, tmsize) <= 0) {
dError("vid:%d, failed to read old header file:%s", vnode, pVnode->cfn);
taosLogError("vid:%d, failed to read old header file:%s", vnode, pVnode->cfn);
vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
goto _over;
} else {
if (!taosCheckChecksumWhole((uint8_t *)tmem, tmsize)) {
dError("vid:%d, failed to read old header file:%s since comp header offset is broken", vnode, pVnode->cfn);
taosLogError("vid:%d, failed to read old header file:%s since comp header offset is broken",
vnode, pVnode->cfn);
vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
goto _over;
}
}
}
// read compInfo
for (sid = 0; sid < pCfg->maxSessions; ++sid) {
if (pVnode->meterList == NULL) { // vnode is being freed, abort
goto _over;
}
pObj = (SMeterObj *)(pVnode->meterList[sid]);
if (pObj == NULL) {
continue;
}
// meter is going to be deleted, abort
if (vnodeIsMeterState(pObj, TSDB_METER_STATE_DROPPING)) {
dWarn("vid:%d sid:%d is dropped, ignore this meter", vnode, sid);
continue;
}
pTable = meterInfo + sid;
pHeader = ((SCompHeader *)tmem) + sid;
if (pVnode->hfd > 0) {
if (pHeader->compInfoOffset > 0) {
lseek(pVnode->hfd, pHeader->compInfoOffset, SEEK_SET);
if (read(pVnode->hfd, &compInfo, sizeof(compInfo)) == sizeof(compInfo)) {
if (!taosCheckChecksumWhole((uint8_t *)(&compInfo), sizeof(SCompInfo))) {
dError("vid:%d sid:%d id:%s, failed to read compinfo in file:%s since checksum mismatch",
vnode, sid, pObj->meterId, pVnode->cfn);
taosLogError("vid:%d sid:%d id:%s, failed to read compinfo in file:%s since checksum mismatch",
vnode, sid, pObj->meterId, pVnode->cfn);
vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
goto _over;
} else {
if (pObj->uid == compInfo.uid) {
pTable->oldNumOfBlocks = compInfo.numOfBlocks;
pTable->oldCompBlockOffset = pHeader->compInfoOffset + sizeof(SCompInfo);
pTable->last = compInfo.last;
if (compInfo.numOfBlocks > maxOldBlocks) maxOldBlocks = compInfo.numOfBlocks;
if (pTable->last) {
lseek(pVnode->hfd, sizeof(SCompBlock) * (compInfo.numOfBlocks - 1), SEEK_CUR);
read(pVnode->hfd, &pTable->lastBlock, sizeof(SCompBlock));
}
} else {
dTrace("vid:%d sid:%d id:%s, uid:%" PRIu64 " is not matched with old:%" PRIu64 ", old data will be thrown away",
vnode, sid, pObj->meterId, pObj->uid, compInfo.uid);
pTable->oldNumOfBlocks = 0;
}
}
} else {
dError("vid:%d sid:%d id:%s, failed to read compinfo in file:%s", vnode, sid, pObj->meterId, pVnode->cfn);
vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
goto _over;
}
}
}
}
// Loop To write data to fileId
for (sid = ssid; sid <= esid; ++sid) {
pObj = (SMeterObj *)(pVnode->meterList[sid]);
if ((pObj == NULL) || (pObj->pCache == NULL)) continue;
data[0] = (SData *)dmem;
cdata[0] = (SData *)cmem;
for (col = 1; col < pObj->numOfColumns; ++col) {
data[col] = (SData *)(((char *)data[col - 1]) + sizeof(SData) +
pObj->pointsPerFileBlock * pObj->schema[col - 1].bytes + EXTRA_BYTES + sizeof(TSCKSUM));
cdata[col] = (SData *)(((char *)cdata[col - 1]) + sizeof(SData) +
pObj->pointsPerFileBlock * pObj->schema[col - 1].bytes + EXTRA_BYTES + sizeof(TSCKSUM));
}
pTable = meterInfo + sid;
pTable->tempHeadOffset = headLen;
memset(&query, 0, sizeof(query));
query.colList = colList;
query.pSelectExpr = pExprs;
query.ekey = pVnode->commitLastKey;
query.skey = pVnode->commitFirstKey;
query.lastKey = query.skey;
query.sdata = data;
vnodeSetCommitQuery(pObj, &query);
dTrace("vid:%d sid:%d id:%s, start to commit, startKey:%" PRId64 " slot:%d pos:%d", pObj->vnode, pObj->sid, pObj->meterId,
pObj->lastKeyOnFile, query.slot, query.pos);
pointsRead = 0;
pointsReadLast = 0;
// last block is at last file
if (pTable->last) {
if ((pTable->lastBlock.sversion != pObj->sversion) || (query.over)) {
// TODO : Check the correctness of this code. write the last block to
// .data file
pCompBlock = (SCompBlock *)(hmem + headLen);
assert(dmem - (char *)pCompBlock >= sizeof(SCompBlock));
*pCompBlock = pTable->lastBlock;
if (pTable->lastBlock.sversion != pObj->sversion) {
pCompBlock->last = 0;
pCompBlock->offset = lseek(pVnode->dfd, 0, SEEK_END);
pTable->last = 0;
lseek(pVnode->lfd, pTable->lastBlock.offset, SEEK_SET);
tsendfile(pVnode->dfd, pVnode->lfd, NULL, pTable->lastBlock.len);
pVnode->dfSize = pCompBlock->offset + pTable->lastBlock.len;
} else {
if (ssid == 0) {
assert(pCompBlock->last && pVnode->tfd != -1);
pCompBlock->offset = lseek(pVnode->tfd, 0, SEEK_END);
lseek(pVnode->lfd, pTable->lastBlock.offset, SEEK_SET);
tsendfile(pVnode->tfd, pVnode->lfd, NULL, pTable->lastBlock.len);
pVnode->lfSize = pCompBlock->offset + pTable->lastBlock.len;
} else {
assert(pVnode->tfd == -1);
}
}
headLen += sizeof(SCompBlock);
pTable->newNumOfBlocks++;
} else {
// read last block into memory
if (vnodeReadLastBlockToMem(pObj, &pTable->lastBlock, data) < 0) goto _over;
pTable->last = 0;
pointsReadLast = pTable->lastBlock.numOfPoints;
query.over = 0;
headInfo.totalStorage -= (pointsReadLast * pObj->bytesPerPoint);
dTrace("vid:%d sid:%d id:%s, points:%d in last block will be merged to new block",
pObj->vnode, pObj->sid, pObj->meterId, pointsReadLast);
}
pTable->changed = 1;
pTable->oldNumOfBlocks--;
}
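// Drain points from the cache into full file blocks until the commit range [commitFirstKey, commitLastKey]
// is exhausted; every pass writes one block and appends its SCompBlock entry to the temp header buffer.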
while (query.over == 0) {
pCompBlock = (SCompBlock *)(hmem + headLen);
assert(dmem - (char *)pCompBlock >= sizeof(SCompBlock));
pointsRead += pointsReadLast;
while (pointsRead < pObj->pointsPerFileBlock) {
query.pointsToRead = pObj->pointsPerFileBlock - pointsRead;
query.pointsOffset = pointsRead;
pointsRead += vnodeQueryFromCache(pObj, &query);
if (query.over) break;
}
if (pointsRead == 0) break;
headInfo.totalStorage += ((pointsRead - pointsReadLast) * pObj->bytesPerPoint);
pCompBlock->last = 1;
if (vnodeWriteBlockToFile(pObj, pCompBlock, data, cdata, pointsRead) < 0) goto _over;
if (pCompBlock->keyLast > pObj->lastKeyOnFile) pObj->lastKeyOnFile = pCompBlock->keyLast;
pTable->last = pCompBlock->last;
// write block info into header buffer
headLen += sizeof(SCompBlock);
pTable->newNumOfBlocks++;
pTable->committedPoints += (pointsRead - pointsReadLast);
dTrace("vid:%d sid:%d id:%s, pointsRead:%d, pointsReadLast:%d lastKey:%" PRId64 ", "
"slot:%d pos:%d newNumOfBlocks:%d headLen:%d",
pObj->vnode, pObj->sid, pObj->meterId, pointsRead, pointsReadLast, pObj->lastKeyOnFile, query.slot, query.pos,
pTable->newNumOfBlocks, headLen);
if (pointsRead < pObj->pointsPerFileBlock || query.keyIsMet) break;
pointsRead = 0;
pointsReadLast = 0;
}
dTrace("vid:%d sid:%d id:%s, %d points are committed, lastKey:%" PRId64 " slot:%d pos:%d newNumOfBlocks:%d",
pObj->vnode, pObj->sid, pObj->meterId, pTable->committedPoints, pObj->lastKeyOnFile, query.slot, query.pos,
pTable->newNumOfBlocks);
if (pTable->committedPoints > 0) {
pTable->commitSlot = query.slot;
pTable->commitPos = query.pos;
}
TSKEY nextKey = 0;
if (pObj->lastKey > pVnode->commitLastKey)
nextKey = pVnode->commitLastKey + 1;
else if (pObj->lastKey > pObj->lastKeyOnFile)
nextKey = pObj->lastKeyOnFile + 1;
pthread_mutex_lock(&(pVnode->vmutex));
if (nextKey < pVnode->firstKey && nextKey > 1) pVnode->firstKey = nextKey;
pthread_mutex_unlock(&(pVnode->vmutex));
}
if (pVnode->lastKey > pVnode->commitLastKey) commitAgain = 1;
dTrace("vid:%d, finish appending the data file", vnode);
// calculate the new compInfoOffset
int compInfoOffset = TSDB_FILE_HEADER_LEN + tmsize;
for (sid = 0; sid < pCfg->maxSessions; ++sid) {
pObj = (SMeterObj *)(pVnode->meterList[sid]);
pHeader = ((SCompHeader *)tmem) + sid;
if (pObj == NULL) {
pHeader->compInfoOffset = 0;
continue;
}
pTable = meterInfo + sid;
pTable->compInfoOffset = compInfoOffset;
pTable->finalNumOfBlocks = pTable->oldNumOfBlocks + pTable->newNumOfBlocks;
if (pTable->finalNumOfBlocks > 0) {
pHeader->compInfoOffset = pTable->compInfoOffset;
compInfoOffset += sizeof(SCompInfo) + pTable->finalNumOfBlocks * sizeof(SCompBlock) + sizeof(TSCKSUM);
} else {
pHeader->compInfoOffset = 0;
}
dTrace("vid:%d sid:%d id:%s, oldBlocks:%d numOfBlocks:%d compInfoOffset:%d", pObj->vnode, pObj->sid, pObj->meterId,
pTable->oldNumOfBlocks, pTable->finalNumOfBlocks, compInfoOffset);
}
// write the comp header into new file
vnodeUpdateHeadFileHeader(pVnode->nfd, &headInfo);
lseek(pVnode->nfd, TSDB_FILE_HEADER_LEN, SEEK_SET);
taosCalcChecksumAppend(0, (uint8_t *)tmem, tmsize);
if (twrite(pVnode->nfd, tmem, tmsize) <= 0) {
dError("vid:%d sid:%d id:%s, failed to write:%s, error:%s", vnode, sid, pObj->meterId, pVnode->nfn,
strerror(errno));
vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
goto _over;
}
pOldCompBlocks = (uint8_t *)malloc(sizeof(SCompBlock) * maxOldBlocks);
// write the comp block list in new file
for (sid = 0; sid < pCfg->maxSessions; ++sid) {
pObj = (SMeterObj *)(pVnode->meterList[sid]);
if (pObj == NULL) continue;
pTable = meterInfo + sid;
if (pTable->finalNumOfBlocks <= 0) continue;
compInfo.last = pTable->last;
compInfo.uid = pObj->uid;
compInfo.numOfBlocks = pTable->finalNumOfBlocks;
/* compInfo.compBlockLen = pTable->finalCompBlockLen; */
compInfo.delimiter = TSDB_VNODE_DELIMITER;
taosCalcChecksumAppend(0, (uint8_t *)(&compInfo), sizeof(SCompInfo));
lseek(pVnode->nfd, pTable->compInfoOffset, SEEK_SET);
if (twrite(pVnode->nfd, &compInfo, sizeof(compInfo)) <= 0) {
dError("vid:%d sid:%d id:%s, failed to write:%s, reason:%s", vnode, sid, pObj->meterId, pVnode->nfn,
strerror(errno));
vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
goto _over;
}
// write the old comp blocks
chksum = 0;
if (pVnode->hfd && pTable->oldNumOfBlocks) {
lseek(pVnode->hfd, pTable->oldCompBlockOffset, SEEK_SET);
if (pTable->changed) {
int compBlockLen = pTable->oldNumOfBlocks * sizeof(SCompBlock);
read(pVnode->hfd, pOldCompBlocks, compBlockLen);
twrite(pVnode->nfd, pOldCompBlocks, compBlockLen);
chksum = taosCalcChecksum(0, pOldCompBlocks, compBlockLen);
} else {
tsendfile(pVnode->nfd, pVnode->hfd, NULL, pTable->oldNumOfBlocks * sizeof(SCompBlock));
read(pVnode->hfd, &chksum, sizeof(TSCKSUM));
}
}
if (pTable->newNumOfBlocks) {
chksum = taosCalcChecksum(chksum, (uint8_t *)(hmem + pTable->tempHeadOffset),
pTable->newNumOfBlocks * sizeof(SCompBlock));
if (twrite(pVnode->nfd, hmem + pTable->tempHeadOffset, pTable->newNumOfBlocks * sizeof(SCompBlock)) <= 0) {
dError("vid:%d sid:%d id:%s, failed to write:%s, reason:%s", vnode, sid, pObj->meterId, pVnode->nfn,
strerror(errno));
vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
goto _over;
}
}
twrite(pVnode->nfd, &chksum, sizeof(TSCKSUM));
}
tfree(pOldCompBlocks);
dTrace("vid:%d, finish writing the new header file:%s", vnode, pVnode->nfn);
vnodeCloseCommitFiles(pVnode);
for (sid = ssid; sid <= esid; ++sid) {
pObj = (SMeterObj *)(pVnode->meterList[sid]);
if (pObj == NULL) continue;
pTable = meterInfo + sid;
if (pTable->finalNumOfBlocks <= 0) continue;
if (pTable->committedPoints > 0) {
vnodeUpdateCommitInfo(pObj, pTable->commitSlot, pTable->commitPos, pTable->commitCount);
}
}
if (commitAgain) {
pVnode->commitFirstKey = pVnode->commitLastKey + 1;
goto _again;
}
vnodeRemoveCommitLog(vnode);
_over:
pVnode->commitInProcess = 0;
vnodeCommitOver(pVnode);
memset(&(vnodeList[vnode].commitThread), 0, sizeof(vnodeList[vnode].commitThread));
tfree(buffer);
tfree(pOldCompBlocks);
vnodeBroadcastStatusToUnsyncedPeer(pVnode);
dPrint("vid:%d, committing is over", vnode);
return pVnode;
}
void *vnodeCommitToFile(void *param) {
SVnodeObj *pVnode = (SVnodeObj *)param;
return vnodeCommitMultiToFile(pVnode, 0, pVnode->cfg.maxSessions - 1);
}
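// Load the SCompBlock list of one meter from the .head file of pQuery->fileId, verify its checksums, and
// open the matching .data and .last files; returns the number of blocks, 0 if the meter has no data in
// this file, and triggers peer recovery on corruption.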
int vnodeGetCompBlockInfo(SMeterObj *pObj, SQuery *pQuery) {
char prefix[TSDB_FILENAME_LEN];
char fileName[TSDB_FILENAME_LEN];
SCompHeader compHeader;
SCompInfo compInfo;
struct stat fstat;
SVnodeObj * pVnode = &vnodeList[pObj->vnode];
char * buffer = NULL;
TSCKSUM chksum;
vnodeFreeFields(pQuery);
tfree(pQuery->pBlock);
pQuery->numOfBlocks = 0;
SVnodeCfg *pCfg = &vnodeList[pObj->vnode].cfg;
if (pQuery->hfd > 0) close(pQuery->hfd);
sprintf(prefix, "%s/vnode%d/db/v%df%d", tsDirectory, pObj->vnode, pObj->vnode, pQuery->fileId);
sprintf(fileName, "%s.head", prefix);
pthread_mutex_lock(&(pVnode->vmutex));
pQuery->hfd = open(fileName, O_RDONLY);
pthread_mutex_unlock(&(pVnode->vmutex));
if (pQuery->hfd < 0) {
dError("vid:%d sid:%d id:%s, failed to open head file:%s, reason:%s", pObj->vnode, pObj->sid, pObj->meterId,
fileName, strerror(errno));
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
int tmsize = sizeof(SCompHeader) * pCfg->maxSessions + sizeof(TSCKSUM);
buffer = (char *)calloc(1, tmsize);
if (buffer == NULL) {
dError("vid:%d sid:%d id:%s, failed to allocate memory to buffer", pObj->vnode, pObj->sid, pObj->meterId);
return -TSDB_CODE_APP_ERROR;
}
lseek(pQuery->hfd, TSDB_FILE_HEADER_LEN, SEEK_SET);
if (read(pQuery->hfd, buffer, tmsize) != tmsize) {
dError("vid:%d sid:%d id:%s, file:%s failed to read comp header, reason:%s", pObj->vnode, pObj->sid, pObj->meterId,
fileName, strerror(errno));
taosLogError("vid:%d sid:%d id:%s, file:%s failed to read comp header", pObj->vnode, pObj->sid, pObj->meterId,
fileName);
tfree(buffer);
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
if (!taosCheckChecksumWhole((uint8_t *)buffer, tmsize)) {
dError("vid:%d sid:%d id:%s, file:%s comp header offset is broken", pObj->vnode, pObj->sid, pObj->meterId,
fileName);
taosLogError("vid:%d sid:%d id:%s, file:%s comp header offset is broken", pObj->vnode, pObj->sid, pObj->meterId,
fileName);
tfree(buffer);
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
compHeader = ((SCompHeader *)buffer)[pObj->sid];
tfree(buffer);
if (compHeader.compInfoOffset == 0) return 0;
lseek(pQuery->hfd, compHeader.compInfoOffset, SEEK_SET);
read(pQuery->hfd, &compInfo, sizeof(SCompInfo));
if (!taosCheckChecksumWhole((uint8_t *)(&compInfo), sizeof(SCompInfo))) {
dError("vid:%d sid:%d id:%s, file:%s compInfo checksum mismatch", pObj->vnode, pObj->sid, pObj->meterId, fileName);
taosLogError("vid:%d sid:%d id:%s, file:%s compInfo checksum mismatch", pObj->vnode, pObj->sid, pObj->meterId,
fileName);
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
if (compInfo.numOfBlocks <= 0) return 0;
if (compInfo.uid != pObj->uid) return 0;
pQuery->numOfBlocks = compInfo.numOfBlocks;
pQuery->pBlock = (SCompBlock *)calloc(1, (sizeof(SCompBlock) + sizeof(SField *)) * compInfo.numOfBlocks);
pQuery->pFields = (SField **)((char *)pQuery->pBlock + sizeof(SCompBlock) * compInfo.numOfBlocks);
/* char *pBlock = (char *)pQuery->pBlockFields +
* sizeof(SCompBlockFields)*compInfo.numOfBlocks; */
read(pQuery->hfd, pQuery->pBlock, compInfo.numOfBlocks * sizeof(SCompBlock));
read(pQuery->hfd, &chksum, sizeof(TSCKSUM));
if (chksum != taosCalcChecksum(0, (uint8_t *)(pQuery->pBlock), compInfo.numOfBlocks * sizeof(SCompBlock))) {
dError("vid:%d sid:%d id:%s, head file comp block broken, fileId: %d", pObj->vnode, pObj->sid, pObj->meterId,
pQuery->fileId);
taosLogError("vid:%d sid:%d id:%s, head file comp block broken, fileId: %d", pObj->vnode, pObj->sid, pObj->meterId,
pQuery->fileId);
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
close(pQuery->hfd);
pQuery->hfd = -1;
sprintf(fileName, "%s.data", prefix);
if (stat(fileName, &fstat) < 0) {
dError("vid:%d sid:%d id:%s, data file:%s not there!", pObj->vnode, pObj->sid, pObj->meterId, fileName);
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
if (pQuery->dfd > 0) close(pQuery->dfd);
pQuery->dfd = open(fileName, O_RDONLY);
if (pQuery->dfd < 0) {
dError("vid:%d sid:%d id:%s, failed to open data file:%s, reason:%s", pObj->vnode, pObj->sid, pObj->meterId,
fileName, strerror(errno));
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
sprintf(fileName, "%s.last", prefix);
if (stat(fileName, &fstat) < 0) {
dError("vid:%d sid:%d id:%s, last file:%s not there!", pObj->vnode, pObj->sid, pObj->meterId, fileName);
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
if (pQuery->lfd > 0) close(pQuery->lfd);
pQuery->lfd = open(fileName, O_RDONLY);
if (pQuery->lfd < 0) {
dError("vid:%d sid:%d id:%s, failed to open last file:%s, reason:%s", pObj->vnode, pObj->sid, pObj->meterId,
fileName, strerror(errno));
return vnodeRecoverFromPeer(pVnode, pQuery->fileId);
}
return pQuery->numOfBlocks;
}
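// Read one column of a block from fd into 'data': load (and checksum-verify) the SField array on first
// use, then read the raw or compressed column bytes, decompressing via 'temp'/'buffer' when needed.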
int vnodeReadColumnToMem(int fd, SCompBlock *pBlock, SField **fields, int col, char *data, int dataSize,
char *temp, char *buffer, int bufferSize) {
int len = 0, size = 0;
SField *tfields = NULL;
TSCKSUM chksum = 0;
if (*fields == NULL) {
size = sizeof(SField) * (pBlock->numOfCols) + sizeof(TSCKSUM);
*fields = (SField *)calloc(1, size);
lseek(fd, pBlock->offset, SEEK_SET);
read(fd, *fields, size);
if (!taosCheckChecksumWhole((uint8_t *)(*fields), size)) {
dError("SField checksum error, col: %d", col);
taosLogError("SField checksum error, col: %d", col);
return -1;
}
}
tfields = *fields;
/* If data is NULL, only the SField content is needed, so skip reading the column data. */
if (data == NULL) return 0;
lseek(fd, pBlock->offset + tfields[col].offset, SEEK_SET);
if (pBlock->algorithm) {
len = read(fd, temp, tfields[col].len);
read(fd, &chksum, sizeof(TSCKSUM));
if (chksum != taosCalcChecksum(0, (uint8_t *)temp, tfields[col].len)) {
dError("data column checksum error, col: %d", col);
taosLogError("data column checksum error, col: %d", col);
return -1;
}
(*pDecompFunc[tfields[col].type])(temp, tfields[col].len, pBlock->numOfPoints, data, dataSize,
pBlock->algorithm, buffer, bufferSize);
} else {
len = read(fd, data, tfields[col].len);
read(fd, &chksum, sizeof(TSCKSUM));
if (chksum != taosCalcChecksum(0, (uint8_t *)data, tfields[col].len)) {
dError("data column checksum error, col: %d", col);
taosLogError("data column checksum error, col: %d", col);
return -1;
}
}
if (len <= 0) {
dError("failed to read col:%d, offset:%d, reason:%s", col, (int32_t)(tfields[col].offset), strerror(errno));
return -1;
}
return 0;
}
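// Read all columns requested by the query from the current block into sdata[]; columns missing from the
// block are filled with NULL values, and the timestamp column is always loaded.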
int vnodeReadCompBlockToMem(SMeterObj *pObj, SQuery *pQuery, SData *sdata[]) {
char * temp = NULL;
int i = 0, col = 0, code = 0;
SCompBlock *pBlock = NULL;
SField ** pFields = NULL;
char * buffer = NULL;
int bufferSize = 0;
int dfd = pQuery->dfd;
tfree(pQuery->pFields[pQuery->slot]);
pBlock = pQuery->pBlock + pQuery->slot;
pFields = pQuery->pFields + pQuery->slot;
temp = malloc(pObj->bytesPerPoint * (pBlock->numOfPoints + 1));
if (pBlock->last) dfd = pQuery->lfd;
if (pBlock->algorithm == TWO_STAGE_COMP) {
bufferSize = pObj->maxBytes * pBlock->numOfPoints + EXTRA_BYTES;
buffer = (char *)calloc(1, bufferSize);
}
if (pQuery->colList[0].colIdx != PRIMARYKEY_TIMESTAMP_COL_INDEX) {
// the timestamp column is always loaded first
code = vnodeReadColumnToMem(dfd, pBlock, pFields, PRIMARYKEY_TIMESTAMP_COL_INDEX,
pQuery->tsData->data + pQuery->pointsOffset * TSDB_KEYSIZE,
TSDB_KEYSIZE*pBlock->numOfPoints, temp, buffer, bufferSize);
col = 1;
} else {
// The timestamp column is already the first requested column and will be read in the loop below; here only the SField metadata of this block is loaded.
code = vnodeReadColumnToMem(dfd, pBlock, pFields, 0, NULL, 0, NULL, buffer, bufferSize);
}
if (code < 0) goto _over;
while (col < pBlock->numOfCols && i < pQuery->numOfCols) {
SColumnInfo *pColumnInfo = &pQuery->colList[i].data;
if ((*pFields)[col].colId < pColumnInfo->colId) {
++col;
} else if ((*pFields)[col].colId == pColumnInfo->colId) {
code = vnodeReadColumnToMem(dfd, pBlock, pFields, col, sdata[i]->data, pColumnInfo->bytes*pBlock->numOfPoints, temp, buffer, bufferSize);
if (code < 0) goto _over;
++i;
++col;
} else {
/*
* pQuery->colList[i].colIdx < (*pFields)[col].colId, this column is not existed in current block,
* fill space with NULL value
*/
char * output = sdata[i]->data;
int32_t bytes = pQuery->colList[i].data.bytes;
int32_t type = pQuery->colList[i].data.type;
setNullN(output, type, bytes, pBlock->numOfPoints);
++i;
}
}
if (col >= pBlock->numOfCols && i < pQuery->numOfCols) {
// remain columns need to set null value
while (i < pQuery->numOfCols) {
char * output = sdata[i]->data;
int32_t bytes = pQuery->colList[i].data.bytes;
int32_t type = pQuery->colList[i].data.type;
setNullN(output, type, bytes, pBlock->numOfPoints);
++i;
}
}
_over:
tfree(buffer);
tfree(temp);
if (code < 0) code = vnodeRecoverFromPeer(vnodeList + pObj->vnode, pQuery->fileId);
return code;
}
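// Load every column of a meter's last block from the .last file into sdata[], so the block can be merged
// with newly committed points.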
int vnodeReadLastBlockToMem(SMeterObj *pObj, SCompBlock *pBlock, SData *sdata[]) {
char * temp = NULL;
int col = 0, code = 0;
SField *pFields = NULL;
char * buffer = NULL;
int bufferSize = 0;
SVnodeObj *pVnode = vnodeList + pObj->vnode;
temp = malloc(pObj->bytesPerPoint * (pBlock->numOfPoints + 1));
if (pBlock->algorithm == TWO_STAGE_COMP) {
bufferSize = pObj->maxBytes*pBlock->numOfPoints+EXTRA_BYTES;
buffer = (char *)calloc(1, pObj->maxBytes * pBlock->numOfPoints + EXTRA_BYTES);
}
for (col = 0; col < pBlock->numOfCols; ++col) {
code = vnodeReadColumnToMem(pVnode->lfd, pBlock, &pFields, col, sdata[col]->data,
pObj->pointsPerFileBlock*pObj->schema[col].bytes+EXTRA_BYTES, temp, buffer, bufferSize);
if (code < 0) break;
sdata[col]->len = pObj->schema[col].bytes * pBlock->numOfPoints;
}
tfree(buffer);
tfree(temp);
tfree(pFields);
if (code < 0) code = vnodeRecoverFromPeer(pVnode, pVnode->fileId);
return code;
}
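// Write one block to disk: the SField array followed by each column's data, every part carrying a trailing
// checksum. Small trailing blocks go to the last (or temp-last) file; the SCompBlock descriptor is filled in.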
int vnodeWriteBlockToFile(SMeterObj *pObj, SCompBlock *pCompBlock, SData *data[], SData *cdata[], int points) {
SVnodeObj *pVnode = &vnodeList[pObj->vnode];
SVnodeCfg *pCfg = &pVnode->cfg;
int wlen = 0;
SField * fields = NULL;
int size = sizeof(SField) * pObj->numOfColumns + sizeof(TSCKSUM);
int32_t offset = size;
char * buffer = NULL;
int bufferSize = 0;
int dfd = pVnode->dfd;
if (pCompBlock->last && (points < pObj->pointsPerFileBlock * tsFileBlockMinPercent)) {
dTrace("vid:%d sid:%d id:%s, points:%d are written to last block, block stime: %" PRId64 ", block etime: %" PRId64,
pObj->vnode, pObj->sid, pObj->meterId, points, *((TSKEY *)(data[0]->data)),
*((TSKEY * )(data[0]->data + (points - 1) * pObj->schema[0].bytes)));
pCompBlock->last = 1;
dfd = pVnode->tfd > 0 ? pVnode->tfd : pVnode->lfd;
} else {
pCompBlock->last = 0;
}
pCompBlock->offset = lseek(dfd, 0, SEEK_END);
pCompBlock->len = 0;
fields = (SField *)calloc(1, size);
if (fields == NULL) return -1;
if (pCfg->compression == TWO_STAGE_COMP){
bufferSize = pObj->maxBytes * points + EXTRA_BYTES;
buffer = (char *)malloc(bufferSize);
}
for (int i = 0; i < pObj->numOfColumns; ++i) {
fields[i].colId = pObj->schema[i].colId;
fields[i].type = pObj->schema[i].type;
fields[i].bytes = pObj->schema[i].bytes;
fields[i].offset = offset;
// assert(data[i]->len == points*pObj->schema[i].bytes);
if (pCfg->compression) {
cdata[i]->len = (*pCompFunc[(uint8_t)pObj->schema[i].type])(data[i]->data, points * pObj->schema[i].bytes, points,
cdata[i]->data, pObj->schema[i].bytes*pObj->pointsPerFileBlock+EXTRA_BYTES,
pCfg->compression, buffer, bufferSize);
fields[i].len = cdata[i]->len;
taosCalcChecksumAppend(0, (uint8_t *)(cdata[i]->data), cdata[i]->len + sizeof(TSCKSUM));
offset += (cdata[i]->len + sizeof(TSCKSUM));
} else {
data[i]->len = pObj->schema[i].bytes * points;
fields[i].len = data[i]->len;
taosCalcChecksumAppend(0, (uint8_t *)(data[i]->data), data[i]->len + sizeof(TSCKSUM));
offset += (data[i]->len + sizeof(TSCKSUM));
}
getStatistics(data[0]->data, data[i]->data, pObj->schema[i].bytes, points, pObj->schema[i].type, &fields[i].min,
&fields[i].max, &fields[i].sum, &fields[i].minIndex, &fields[i].maxIndex, &fields[i].numOfNullPoints);
}
tfree(buffer);
// Write SField part
taosCalcChecksumAppend(0, (uint8_t *)fields, size);
wlen = twrite(dfd, fields, size);
if (wlen <= 0) {
tfree(fields);
dError("vid:%d sid:%d id:%s, failed to write block, wlen:%d reason:%s", pObj->vnode, pObj->sid, pObj->meterId, wlen,
strerror(errno));
#ifdef CLUSTER
return vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
#else
return -1;
#endif
}
pVnode->vnodeStatistic.compStorage += wlen;
pVnode->dfSize += wlen;
pCompBlock->len += wlen;
tfree(fields);
// Write data part
for (int i = 0; i < pObj->numOfColumns; ++i) {
if (pCfg->compression) {
wlen = twrite(dfd, cdata[i]->data, cdata[i]->len + sizeof(TSCKSUM));
} else {
wlen = twrite(dfd, data[i]->data, data[i]->len + sizeof(TSCKSUM));
}
if (wlen <= 0) {
dError("vid:%d sid:%d id:%s, failed to write block, wlen:%d points:%d reason:%s",
pObj->vnode, pObj->sid, pObj->meterId, wlen, points, strerror(errno));
return vnodeRecoverFromPeer(pVnode, pVnode->commitFileId);
}
pVnode->vnodeStatistic.compStorage += wlen;
pVnode->dfSize += wlen;
pCompBlock->len += wlen;
}
dTrace("vid:%d, vnode compStorage size is: %" PRId64, pObj->vnode, pVnode->vnodeStatistic.compStorage);
pCompBlock->algorithm = pCfg->compression;
pCompBlock->numOfPoints = points;
pCompBlock->numOfCols = pObj->numOfColumns;
pCompBlock->keyFirst = *((TSKEY *)(data[0]->data)); // hack way to get the key
pCompBlock->keyLast = *((TSKEY *)(data[0]->data + (points - 1) * pObj->schema[0].bytes));
pCompBlock->sversion = pObj->sversion;
assert(pCompBlock->keyFirst <= pCompBlock->keyLast);
return 0;
}
static int forwardInFile(SQuery *pQuery, int32_t midSlot, int32_t step, SVnodeObj *pVnode, SMeterObj *pObj);
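// Locate the file block and in-block position of pQuery->skey: scan files in query order, binary-search the
// block list of each candidate file, then search the timestamp column inside the chosen block.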
int vnodeSearchPointInFile(SMeterObj *pObj, SQuery *pQuery) {
TSKEY latest, oldest;
int ret = 0;
int64_t delta = 0;
int firstSlot, lastSlot, midSlot;
int numOfBlocks;
char * temp = NULL, *data = NULL;
SCompBlock *pBlock = NULL;
SVnodeObj * pVnode = &vnodeList[pObj->vnode];
int step;
char * buffer = NULL;
int bufferSize = 0;
int dfd;
// if file is broken, pQuery->slot = -2; if not found, pQuery->slot = -1;
pQuery->slot = -1;
pQuery->pos = -1;
if (pVnode->numOfFiles <= 0) return 0;
SVnodeCfg *pCfg = &pVnode->cfg;
delta = (int64_t)pCfg->daysPerFile * tsMsPerDay[(uint8_t)pVnode->cfg.precision];
latest = pObj->lastKeyOnFile;
oldest = (pVnode->fileId - pVnode->numOfFiles + 1) * delta;
if (latest < oldest) return 0;
if (!QUERY_IS_ASC_QUERY(pQuery)) {
if (pQuery->skey < oldest) return 0;
if (pQuery->ekey > latest) return 0;
if (pQuery->skey > latest) pQuery->skey = latest;
} else {
if (pQuery->skey > latest) return 0;
if (pQuery->ekey < oldest) return 0;
if (pQuery->skey < oldest) pQuery->skey = oldest;
}
dTrace("vid:%d sid:%d id:%s, skey:%" PRId64 " ekey:%" PRId64 " oldest:%" PRId64 " latest:%" PRId64 " fileId:%d numOfFiles:%d",
pObj->vnode, pObj->sid, pObj->meterId, pQuery->skey, pQuery->ekey, oldest, latest, pVnode->fileId,
pVnode->numOfFiles);
step = QUERY_IS_ASC_QUERY(pQuery) ? 1 : -1;
pQuery->fileId = pQuery->skey / delta; // starting fileId
pQuery->fileId -= step; // step back once so the while loop below starts at the computed file
bufferSize = pCfg->rowsInFileBlock*sizeof(TSKEY)+EXTRA_BYTES;
buffer = (char *)calloc(1, bufferSize);
while (1) {
pQuery->fileId += step;
if ((pQuery->fileId > pVnode->fileId) || (pQuery->fileId < pVnode->fileId - pVnode->numOfFiles + 1)) {
tfree(buffer);
return 0;
}
ret = vnodeGetCompBlockInfo(pObj, pQuery);
if (ret == 0) continue;
if (ret < 0) break; // file broken
pBlock = pQuery->pBlock;
firstSlot = 0;
lastSlot = pQuery->numOfBlocks - 1;
//numOfBlocks = pQuery->numOfBlocks;
if (QUERY_IS_ASC_QUERY(pQuery) && pBlock[lastSlot].keyLast < pQuery->skey) continue;
if (!QUERY_IS_ASC_QUERY(pQuery) && pBlock[firstSlot].keyFirst > pQuery->skey) continue;
while (1) {
numOfBlocks = lastSlot - firstSlot + 1;
midSlot = (firstSlot + (numOfBlocks >> 1));
if (numOfBlocks == 1) break;
if (pQuery->skey > pBlock[midSlot].keyLast) {
if (numOfBlocks == 2) break;
if (!QUERY_IS_ASC_QUERY(pQuery) && (pQuery->skey < pBlock[midSlot + 1].keyFirst)) break;
firstSlot = midSlot + 1;
} else if (pQuery->skey < pBlock[midSlot].keyFirst) {
if (QUERY_IS_ASC_QUERY(pQuery) && (pQuery->skey > pBlock[midSlot - 1].keyLast)) break;
lastSlot = midSlot - 1;
} else {
break; // got the slot
}
}
pQuery->slot = midSlot;
if (!QUERY_IS_ASC_QUERY(pQuery)) {
if (pQuery->skey < pBlock[midSlot].keyFirst) break;
if (pQuery->ekey > pBlock[midSlot].keyLast) {
pQuery->slot = midSlot + 1;
break;
}
} else {
if (pQuery->skey > pBlock[midSlot].keyLast) {
pQuery->slot = midSlot + 1;
break;
}
if (pQuery->ekey < pBlock[midSlot].keyFirst) break;
}
temp = malloc(pObj->pointsPerFileBlock * TSDB_KEYSIZE + EXTRA_BYTES); // only first column
data = malloc(pObj->pointsPerFileBlock * TSDB_KEYSIZE + EXTRA_BYTES); // only first column
dfd = pBlock[midSlot].last ? pQuery->lfd : pQuery->dfd;
ret = vnodeReadColumnToMem(dfd, pBlock + midSlot, pQuery->pFields + midSlot, 0, data,
pObj->pointsPerFileBlock*TSDB_KEYSIZE+EXTRA_BYTES,
temp, buffer, bufferSize);
if (ret < 0) {
ret = vnodeRecoverFromPeer(pVnode, pQuery->fileId);
break;
} // file broken
pQuery->pos = (*vnodeSearchKeyFunc[pObj->searchAlgorithm])(data, pBlock[midSlot].numOfPoints, pQuery->skey,
pQuery->order.order);
pQuery->key = *((TSKEY *)(data + pObj->schema[0].bytes * pQuery->pos));
ret = vnodeForwardStartPosition(pQuery, pBlock, midSlot, pVnode, pObj);
break;
}
tfree(buffer);
tfree(temp);
tfree(data);
return ret;
}
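// Consume pQuery->limit.offset points starting at the current position when the query has no filter
// columns; may advance across blocks and files through forwardInFile().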
int vnodeForwardStartPosition(SQuery *pQuery, SCompBlock *pBlock, int32_t slotIdx, SVnodeObj *pVnode, SMeterObj *pObj) {
int step = QUERY_IS_ASC_QUERY(pQuery) ? 1 : -1;
if (pQuery->limit.offset > 0 && pQuery->numOfFilterCols == 0) {
int maxReads = QUERY_IS_ASC_QUERY(pQuery) ? pBlock->numOfPoints - pQuery->pos : pQuery->pos + 1;
if (pQuery->limit.offset < maxReads) { // start position in current block
if (QUERY_IS_ASC_QUERY(pQuery)) {
pQuery->pos += pQuery->limit.offset;
} else {
pQuery->pos -= pQuery->limit.offset;
}
pQuery->limit.offset = 0;
} else {
pQuery->limit.offset -= maxReads;
slotIdx += step;
return forwardInFile(pQuery, slotIdx, step, pVnode, pObj);
}
}
return pQuery->numOfBlocks;
}
int forwardInFile(SQuery *pQuery, int32_t slotIdx, int32_t step, SVnodeObj *pVnode, SMeterObj *pObj) {
SCompBlock *pBlock = pQuery->pBlock;
while (slotIdx < pQuery->numOfBlocks && slotIdx >= 0 && pQuery->limit.offset >= pBlock[slotIdx].numOfPoints) {
pQuery->limit.offset -= pBlock[slotIdx].numOfPoints;
slotIdx += step;
}
if (slotIdx < pQuery->numOfBlocks && slotIdx >= 0) {
if (QUERY_IS_ASC_QUERY(pQuery)) {
pQuery->pos = pQuery->limit.offset;
} else {
pQuery->pos = pBlock[slotIdx].numOfPoints - pQuery->limit.offset - 1;
}
pQuery->slot = slotIdx;
pQuery->limit.offset = 0;
return pQuery->numOfBlocks;
} else { // continue in next file, forward pQuery->limit.offset points
int ret = 0;
pQuery->slot = -1;
pQuery->pos = -1;
while (1) {
pQuery->fileId += step;
if ((pQuery->fileId > pVnode->fileId) || (pQuery->fileId < pVnode->fileId - pVnode->numOfFiles + 1)) {
pQuery->lastKey = pObj->lastKeyOnFile;
pQuery->skey = pQuery->lastKey + 1;
return 0;
}
ret = vnodeGetCompBlockInfo(pObj, pQuery);
if (ret == 0) continue;
if (ret > 0) break; // qualified file
}
if (ret > 0) {
int startSlot = QUERY_IS_ASC_QUERY(pQuery) ? 0 : pQuery->numOfBlocks - 1;
return forwardInFile(pQuery, startSlot, step, pVnode, pObj);
} else {
return ret;
}
}
}
static FORCE_INLINE TSKEY vnodeGetTSInDataBlock(SQuery *pQuery, int32_t pos, int32_t factor) {
return *(TSKEY *)(pQuery->tsData->data + (pQuery->pointsOffset * factor + pos) * TSDB_KEYSIZE);
}
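// Read the qualified points of one file block into the query result buffers, applying column filters when
// present, and move the query cursor (slot/pos/fileId) forward for the next call.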
int vnodeQueryFromFile(SMeterObj *pObj, SQuery *pQuery) {
int numOfReads = 0;
int lastPos = -1, startPos;
int col, step, code = 0;
char * pRead, *pData;
SData * sdata[TSDB_MAX_COLUMNS];
SCompBlock *pBlock = NULL;
SVnodeObj * pVnode = &vnodeList[pObj->vnode];
pQuery->pointsRead = 0;
int keyLen = TSDB_KEYSIZE;
if (pQuery->over) return 0;
if (pQuery->slot < 0) // it means a new query, we need to find the point first
code = vnodeSearchPointInFile(pObj, pQuery);
if (code < 0 || pQuery->slot < 0 || pQuery->pos == -1) {
pQuery->over = 1;
return code;
}
step = QUERY_IS_ASC_QUERY(pQuery) ? -1 : 1;
pBlock = pQuery->pBlock + pQuery->slot;
if (pQuery->pos == FILE_QUERY_NEW_BLOCK) {
if (!QUERY_IS_ASC_QUERY(pQuery)) {
if (pQuery->ekey > pBlock->keyLast) pQuery->over = 1;
if (pQuery->skey < pBlock->keyFirst) pQuery->over = 1;
} else {
if (pQuery->ekey < pBlock->keyFirst) pQuery->over = 1;
if (pQuery->skey > pBlock->keyLast) pQuery->over = 1;
}
pQuery->pos = QUERY_IS_ASC_QUERY(pQuery) ? 0 : pBlock->numOfPoints - 1;
}
if (pQuery->over) return 0;
// Make sure the start position of each buffer is 4-byte aligned on 32-bit ARM systems.
for(col = 0; col < pQuery->numOfCols; ++col) {
sdata[col] = calloc(1, sizeof(SData) + pBlock->numOfPoints * pQuery->colList[col].data.bytes + EXTRA_BYTES);
}
/*
* The timestamp column is always fetched. If the query itself does not request the primary key column,
* the tsData buffer is allocated with twice the size of an ordinary pQuery->sdata buffer; otherwise the
* query could overwrite the buffer area before the retrieve function has packed the previous results into
* the message sent to the client.
*
* The startPositionFactor denotes which half of the buffer stores the result and which half is available
* as working space during the query.
*
* Note: startPositionFactor must be used in conjunction with pQuery->pointsOffset.
*/
int32_t startPositionFactor = 1;
if (pQuery->colList[0].colIdx == PRIMARYKEY_TIMESTAMP_COL_INDEX) {
pQuery->tsData = sdata[0];
startPositionFactor = 0;
}
code = vnodeReadCompBlockToMem(pObj, pQuery, sdata);
if (code < 0) {
dError("vid:%d sid:%d id:%s, failed to read block:%d numOfPoints:%d", pObj->vnode, pObj->sid, pObj->meterId,
pQuery->slot, pBlock->numOfPoints);
goto _next;
}
int maxReads = QUERY_IS_ASC_QUERY(pQuery) ? pBlock->numOfPoints - pQuery->pos : pQuery->pos + 1;
TSKEY startKey = vnodeGetTSInDataBlock(pQuery, 0, startPositionFactor);
TSKEY endKey = vnodeGetTSInDataBlock(pQuery, pBlock->numOfPoints - 1, startPositionFactor);
if (QUERY_IS_ASC_QUERY(pQuery)) {
if (endKey < pQuery->ekey) {
numOfReads = maxReads;
} else {
lastPos = (*vnodeSearchKeyFunc[pObj->searchAlgorithm])(
pQuery->tsData->data + keyLen * (pQuery->pos + pQuery->pointsOffset * startPositionFactor), maxReads,
pQuery->ekey, TSQL_SO_DESC);
numOfReads = (lastPos >= 0) ? lastPos + 1 : 0;
}
} else {
if (startKey > pQuery->ekey) {
numOfReads = maxReads;
} else {
lastPos = (*vnodeSearchKeyFunc[pObj->searchAlgorithm])(
pQuery->tsData->data + keyLen * pQuery->pointsOffset * startPositionFactor, maxReads, pQuery->ekey,
TSQL_SO_ASC);
numOfReads = (lastPos >= 0) ? pQuery->pos - lastPos + 1 : 0;
}
}
if (numOfReads > pQuery->pointsToRead - pQuery->pointsRead) {
numOfReads = pQuery->pointsToRead - pQuery->pointsRead;
} else {
if (lastPos >= 0 || numOfReads == 0) {
pQuery->keyIsMet = 1;
pQuery->over = 1;
}
}
startPos = QUERY_IS_ASC_QUERY(pQuery) ? pQuery->pos : pQuery->pos - numOfReads + 1;
int32_t numOfQualifiedPoints = 0;
int32_t numOfActualRead = numOfReads;
// copy data to result buffer
if (pQuery->numOfFilterCols == 0) {
// no filter condition on ordinary columns
for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
int16_t colBufferIndex = pQuery->pSelectExpr[i].pBase.colInfo.colIdxInBuf;
int32_t bytes = GET_COLUMN_BYTES(pQuery, i);
pData = pQuery->sdata[i]->data + pQuery->pointsOffset * bytes;
pRead = sdata[colBufferIndex]->data + startPos * bytes;
if (QUERY_IS_ASC_QUERY(pQuery)) {
memcpy(pData, pRead, numOfReads * bytes);
} else { //reversed copy to output buffer
for(int32_t j = 0; j < numOfReads; ++j) {
memcpy(pData + bytes * j, pRead + (numOfReads - 1 - j) * bytes, bytes);
}
}
}
numOfQualifiedPoints = numOfReads;
} else {
// check each data one by one set the input column data
for (int32_t k = 0; k < pQuery->numOfFilterCols; ++k) {
struct SSingleColumnFilterInfo *pFilterInfo = &pQuery->pFilterInfo[k];
pFilterInfo->pData = sdata[pFilterInfo->info.colIdxInBuf]->data;
}
int32_t *ids = calloc(1, numOfReads * sizeof(int32_t));
numOfActualRead = 0;
if (QUERY_IS_ASC_QUERY(pQuery)) {
for (int32_t j = startPos; j < pBlock->numOfPoints; j -= step) {
TSKEY key = vnodeGetTSInDataBlock(pQuery, j, startPositionFactor);
if (key < startKey || key > endKey) {
dError("vid:%d sid:%d id:%s, timestamp in file block disordered. slot:%d, pos:%d, ts:%" PRId64 ", block "
"range:%" PRId64 "-%" PRId64, pObj->vnode, pObj->sid, pObj->meterId, pQuery->slot, j, key, startKey, endKey);
tfree(ids);
return -TSDB_CODE_FILE_BLOCK_TS_DISORDERED;
}
// out of query range, quit
if (key > pQuery->ekey) {
break;
}
if (!vnodeFilterData(pQuery, &numOfActualRead, j)) {
continue;
}
ids[numOfQualifiedPoints] = j;
if (++numOfQualifiedPoints == numOfReads) { // qualified data are enough
break;
}
}
} else {
for (int32_t j = pQuery->pos; j >= 0; --j) {
TSKEY key = vnodeGetTSInDataBlock(pQuery, j, startPositionFactor);
if (key < startKey || key > endKey) {
dError("vid:%d sid:%d id:%s, timestamp in file block disordered. slot:%d, pos:%d, ts:%" PRId64 ", block "
"range:%" PRId64 "-%" PRId64, pObj->vnode, pObj->sid, pObj->meterId, pQuery->slot, j, key, startKey, endKey);
tfree(ids);
return -TSDB_CODE_FILE_BLOCK_TS_DISORDERED;
}
// out of query range, quit
if (key < pQuery->ekey) {
break;
}
if (!vnodeFilterData(pQuery, &numOfActualRead, j)) {
continue;
}
ids[numOfQualifiedPoints] = j;
if (++numOfQualifiedPoints == numOfReads) { // qualified data are enough
break;
}
}
}
// int32_t start = QUERY_IS_ASC_QUERY(pQuery) ? 0 : numOfReads - numOfQualifiedPoints;
for (int32_t j = 0; j < numOfQualifiedPoints; ++j) {
for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
int16_t colIndexInBuffer = pQuery->pSelectExpr[col].pBase.colInfo.colIdxInBuf;
int32_t bytes = GET_COLUMN_BYTES(pQuery, col);
pData = pQuery->sdata[col]->data + (pQuery->pointsOffset + j) * bytes;
pRead = sdata[colIndexInBuffer]->data + ids[j/* + start*/] * bytes;
memcpy(pData, pRead, bytes);
}
}
tfree(ids);
assert(numOfQualifiedPoints <= numOfReads);
}
// Note: numOfQualifiedPoints may be 0, since no data in this block are qualified
assert(pQuery->pointsRead == 0);
pQuery->pointsRead += numOfQualifiedPoints;
for (col = 0; col < pQuery->numOfOutputCols; ++col) {
int16_t bytes = GET_COLUMN_BYTES(pQuery, col);
pQuery->sdata[col]->len = bytes * (pQuery->pointsOffset + pQuery->pointsRead);
}
pQuery->pos -= numOfActualRead * step;
// update the lastkey/skey
int32_t lastAccessPos = pQuery->pos + step;
pQuery->lastKey = vnodeGetTSInDataBlock(pQuery, lastAccessPos, startPositionFactor);
pQuery->skey = pQuery->lastKey - step;
_next:
if ((pQuery->pos < 0 || pQuery->pos >= pBlock->numOfPoints || numOfReads == 0) && (pQuery->over == 0)) {
pQuery->slot = pQuery->slot - step;
pQuery->pos = FILE_QUERY_NEW_BLOCK;
}
if ((pQuery->slot < 0 || pQuery->slot >= pQuery->numOfBlocks) && (pQuery->over == 0)) {
int ret;
while (1) {
ret = -1;
pQuery->fileId -= step; // jump to next file
if (QUERY_IS_ASC_QUERY(pQuery)) {
if (pQuery->fileId > pVnode->fileId) {
// to do:
// check if file is updated, if updated, open again and check if this Meter is updated
// if meter is updated, read in new block info, and
break;
}
} else {
if ((pVnode->fileId - pQuery->fileId + 1) > pVnode->numOfFiles) break;
}
ret = vnodeGetCompBlockInfo(pObj, pQuery);
if (ret > 0) break;
if (ret < 0) code = ret;
}
if (ret <= 0) pQuery->over = 1;
pQuery->slot = QUERY_IS_ASC_QUERY(pQuery) ? 0 : pQuery->numOfBlocks - 1;
}
for(int32_t i = 0; i < pQuery->numOfCols; ++i) {
tfree(sdata[i]);
}
return code;
}
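// Recompute the magic value of a file group (the sizes of its .data and .last files) after basic sanity
// checks on the .head file, and record it in pVnode->fmagic[].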
int vnodeUpdateFileMagic(int vnode, int fileId) {
struct stat fstat;
char fileName[256];
SVnodeObj *pVnode = vnodeList + vnode;
uint64_t magic = 0;
vnodeGetHeadDataLname(fileName, NULL, NULL, vnode, fileId);
if (stat(fileName, &fstat) != 0) {
dError("vid:%d, head file:%s is not there", vnode, fileName);
return -1;
}
int size = sizeof(SCompHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM) + TSDB_FILE_HEADER_LEN;
if (fstat.st_size < size) {
dError("vid:%d, head file:%s is corrupted", vnode, fileName);
return -1;
}
#ifdef CLUSTER
//if (fstat.st_size == size) return 0;
#else
if (fstat.st_size == size) return 0;
#endif
vnodeGetHeadDataLname(NULL, fileName, NULL, vnode, fileId);
if (stat(fileName, &fstat) == 0) {
magic = fstat.st_size;
} else {
dError("vid:%d, data file:%s is not there", vnode, fileName);
return -1;
}
vnodeGetHeadDataLname(NULL, NULL, fileName, vnode, fileId);
if (stat(fileName, &fstat) == 0) {
magic += fstat.st_size;
}
int slot = fileId % pVnode->maxFiles;
pVnode->fmagic[slot] = magic;
return 0;
}
int vnodeInitFile(int vnode) {
int code = TSDB_CODE_SUCCESS;
SVnodeObj *pVnode = vnodeList + vnode;
pVnode->maxFiles = pVnode->cfg.daysToKeep / pVnode->cfg.daysPerFile + 1;
pVnode->maxFile1 = pVnode->cfg.daysToKeep1 / pVnode->cfg.daysPerFile;
pVnode->maxFile2 = pVnode->cfg.daysToKeep2 / pVnode->cfg.daysPerFile;
pVnode->fmagic = (uint64_t *)calloc(pVnode->maxFiles + 1, sizeof(uint64_t));
int fileId = pVnode->fileId;
/*
* The files actually on disk may exceed the number of files that need to be kept
*/
if (pVnode->numOfFiles > pVnode->maxFiles) {
dError("vid:%d numOfFiles:%d should not larger than maxFiles:%d", vnode, pVnode->numOfFiles, pVnode->maxFiles);
}
int numOfFiles = MIN(pVnode->numOfFiles, pVnode->maxFiles);
for (int i = 0; i < numOfFiles; ++i) {
if (vnodeUpdateFileMagic(vnode, fileId) < 0) {
if (pVnode->cfg.replications > 1) {
pVnode->badFileId = fileId;
}
dError("vid:%d fileId:%d is corrupted", vnode, fileId);
} else {
dTrace("vid:%d fileId:%d is checked", vnode, fileId);
}
fileId--;
}
return code;
}
int vnodeRecoverCompHeader(int vnode, int fileId) {
// TODO: try to recover SCompHeader part
dTrace("starting to recover vnode head file comp header part, vnode: %d fileId: %d", vnode, fileId);
assert(0);
return 0;
}
int vnodeRecoverHeadFile(int vnode, int fileId) {
// TODO: try to recover SCompHeader part
dTrace("starting to recover vnode head file, vnode: %d, fileId: %d", vnode, fileId);
assert(0);
return 0;
}
int vnodeRecoverDataFile(int vnode, int fileId) {
// TODO: try to recover SCompHeader part
dTrace("starting to recover vnode data file, vnode: %d, fileId: %d", vnode, fileId);
assert(0);
return 0;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "vnode.h"
#include "vnodeFile.h"
char* vnodeGetDiskFromHeadFile(char *headName) { return tsDirectory; }
char* vnodeGetDataDir(int vnode, int fileId) { return dataDir; }
void vnodeAdustVnodeFile(SVnodeObj *pVnode) {
// Retention policy here
int fileId = pVnode->fileId - pVnode->numOfFiles + 1;
int cfile = taosGetTimestamp(pVnode->cfg.precision)/pVnode->cfg.daysPerFile/tsMsPerDay[(uint8_t)pVnode->cfg.precision];
while (fileId <= cfile - pVnode->maxFiles) {
vnodeRemoveFile(pVnode->vnode, fileId);
pVnode->numOfFiles--;
fileId++;
}
}
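// Sanity-check a freshly written header file: verify the comp header, every meter's SCompInfo and block
// list checksums, and that a "last" block only appears as the final block of a meter.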
int vnodeCheckNewHeaderFile(int fd, SVnodeObj *pVnode) {
SCompHeader *pHeader = NULL;
SCompBlock *pBlocks = NULL;
int blockSize = 0;
SCompInfo compInfo;
int tmsize = 0;
tmsize = sizeof(SCompHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM);
pHeader = (SCompHeader *)malloc(tmsize);
if (pHeader == NULL) return 0;
lseek(fd, TSDB_FILE_HEADER_LEN, SEEK_SET);
if (read(fd, (void *)pHeader, tmsize) != tmsize) {
goto _broken_exit;
}
if (!taosCheckChecksumWhole((uint8_t *)pHeader, tmsize)) {
goto _broken_exit;
}
for (int sid = 0; sid < pVnode->cfg.maxSessions; sid++) {
if (pVnode->meterList == NULL) goto _correct_exit;
if (pVnode->meterList[sid] == NULL || pHeader[sid].compInfoOffset == 0) continue;
lseek(fd, pHeader[sid].compInfoOffset, SEEK_SET);
if (read(fd, (void *)(&compInfo), sizeof(SCompInfo)) != sizeof(SCompInfo)) {
goto _broken_exit;
}
if (!taosCheckChecksumWhole((uint8_t *)(&compInfo), sizeof(SCompInfo))) {
goto _broken_exit;
}
if (compInfo.uid != ((SMeterObj *)pVnode->meterList[sid])->uid) continue;
int expectedSize = sizeof(SCompBlock) * compInfo.numOfBlocks + sizeof(TSCKSUM);
if (blockSize < expectedSize) {
pBlocks = (SCompBlock *)realloc(pBlocks, expectedSize);
if (pBlocks == NULL) {
tfree(pHeader);
return 0;
}
blockSize = expectedSize;
}
if (read(fd, (void *)pBlocks, expectedSize) != expectedSize) {
dError("failed to read block part");
goto _broken_exit;
}
if (!taosCheckChecksumWhole((uint8_t *)pBlocks, expectedSize)) {
dError("block part is broken");
goto _broken_exit;
}
for (int i = 0; i < compInfo.numOfBlocks; i++) {
if (pBlocks[i].last && i != compInfo.numOfBlocks-1) {
dError("last block in middle, block:%d", i);
goto _broken_exit;
}
}
}
_correct_exit:
dPrint("vid: %d new header file %s is correct", pVnode->vnode, pVnode->nfn);
tfree(pBlocks);
tfree(pHeader);
return 0;
_broken_exit:
dError("vid: %d new header file %s is broken", pVnode->vnode, pVnode->nfn);
tfree(pBlocks);
tfree(pHeader);
return -1;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "vnode.h"
int vnodeCheckHeaderFile(int fd, int dfd, SVnodeCfg cfg, int mode) {
SCompHeader *pHeaders = NULL;
SVnodeCfg *pCfg = &cfg;
SCompInfo compInfo;
SCompBlock *pBlocks = NULL;
int blockSize = 0;
SField *pFields = NULL;
char *pBuf = NULL;
int size = 0;
int ret = 0;
if (fd < 0 || dfd < 0) return -1;
lseek(fd, TSDB_FILE_HEADER_LEN, SEEK_SET);
size = pCfg->maxSessions*sizeof(SCompHeader)+sizeof(TSCKSUM);
pHeaders = calloc(1, size);
if (pHeaders == NULL) {
return -1;
}
read(fd, pHeaders, size);
if (!taosCheckChecksumWhole((uint8_t *)pHeaders, size)) {
tfree(pHeaders);
return -1;
}
for (int i = 0; i < pCfg->maxSessions; i++) {
if (pHeaders[i].compInfoOffset == 0) continue;
if (pHeaders[i].compInfoOffset < 0) {
// TODO : report error here
ret = -1;
continue;
}
lseek(fd, pHeaders[i].compInfoOffset, SEEK_SET);
read(fd, &compInfo, sizeof(SCompInfo));
if (!taosCheckChecksumWhole((uint8_t *)&compInfo, sizeof(SCompInfo))) {
// TODO : report error
ret = -1;
continue;
}
int tsize = sizeof(SCompBlock) * compInfo.numOfBlocks + sizeof(TSCKSUM);
if (tsize > blockSize) {
if (pBlocks == NULL) {
pBlocks = calloc(1, tsize);
} else {
pBlocks = realloc(pBlocks, tsize);
}
blockSize = tsize;
}
read(fd, pBlocks, tsize);
if (!taosCheckChecksumWhole((uint8_t *)pBlocks, tsize)) {
// TODO: Report error
ret = -1;
continue;
}
TSKEY keyLast = 0;
for (int j = 0; j < compInfo.numOfBlocks; j++) {
SCompBlock *pBlock = pBlocks + j;
if (pBlock->last != 0 && j < compInfo.numOfBlocks-1) {
// TODO: report error
ret = -1;
break;
}
if (pBlock->offset < TSDB_FILE_HEADER_LEN) {
// TODO : report error
ret = -1;
break;
}
if (pBlock->keyLast < pBlock->keyFirst) {
// TODO : report error
ret = -1;
break;
}
if (pBlock->keyFirst <= keyLast) {
// TODO : report error
ret = -1;
break;
}
keyLast = pBlock->keyLast;
// Check block in data
lseek(dfd, pBlock->offset, SEEK_SET);
tsize = sizeof(SField) * pBlock->numOfCols + sizeof(TSCKSUM);
pFields = realloc(pFields, tsize);
read(dfd, pFields, tsize);
if (!taosCheckChecksumWhole((uint8_t*)pFields, tsize)) {
// TODO : report error
ret = -1;
continue;
}
for (int k = 0; k < pBlock->numOfCols; k++) {
// TODO: Check pFields[k] content
// Read the column data (payload plus its trailing checksum) from the data file, then verify it.
int dsize = pFields[k].len + sizeof(TSCKSUM);
pBuf = realloc(pBuf, dsize);
lseek(dfd, pBlock->offset + pFields[k].offset, SEEK_SET);
read(dfd, pBuf, dsize);
if (!taosCheckChecksumWhole((uint8_t *)pBuf, dsize)) {
// TODO : report error;
ret = -1;
continue;
}
}
}
}
tfree(pBuf);
tfree(pFields);
tfree(pBlocks);
tfree(pHeaders);
return ret;
}
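// Unfinished utility: repack a file group by merging small blocks of each meter into blocks of up to
// pointsPerFileBlock points; the actual merge and rewrite logic is still TODO.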
int vnodePackDataFile(int vnode, int fileId) {
// TODO: check if it is able to pack current file
// TODO: assign value to headerFile and dataFile
char *headerFile = NULL;
char *dataFile = NULL;
char *lastFile = NULL;
SVnodeObj *pVnode = vnodeList+vnode;
SCompHeader *pHeaders = NULL;
SCompBlock *pBlocks = NULL;
int blockSize = 0;
char *pBuff = 0;
int buffSize = 0;
SCompInfo compInfo;
int size = 0;
int hfd = open(headerFile, O_RDONLY);
if (hfd < 0) {
dError("vid: %d, failed to open header file:%s", vnode, headerFile);
return -1;
}
int dfd = open(dataFile, O_RDONLY);
if (dfd < 0) {
dError("vid: %d, failed to open data file:%s", vnode, dataFile);
close(hfd);
return -1;
}
int lfd = open(lastFile, O_RDONLY);
if (lfd < 0) {
dError("vid: %d, failed to open last file:%s", vnode, lastFile);
close(hfd);
close(dfd);
return -1;
}
lseek(hfd, TSDB_FILE_HEADER_LEN, SEEK_SET);
size = sizeof(SCompHeader)*pVnode->cfg.maxSessions+sizeof(TSCKSUM);
pHeaders = malloc(size);
if (pHeaders == NULL) goto _exit_failure;
read(hfd, pHeaders, size);
if (!taosCheckChecksumWhole((uint8_t *)pHeaders, size)) {
dError("vid: %d, header file %s is broken", vnode, headerFile);
goto _exit_failure;
}
for (int i = 0; i < pVnode->cfg.maxSessions; i++) {
if (pHeaders[i].compInfoOffset <= 0) continue;
SMeterObj *pObj = (SMeterObj *)pVnode->meterList[i];
// read compInfo part
lseek(hfd, pHeaders[i].compInfoOffset, SEEK_SET);
read(hfd, &compInfo, sizeof(SCompInfo));
if (!taosCheckChecksumWhole((uint8_t *)&compInfo, sizeof(SCompInfo))) {
dError("vid: %d sid:%d fileId:%d compInfo is broken", vnode, i, fileId);
goto _exit_failure;
}
// read compBlock part
int tsize = compInfo.numOfBlocks * sizeof(SCompBlock) + sizeof(TSCKSUM);
if (tsize > blockSize) {
if (blockSize == 0) {
pBlocks = malloc(tsize);
} else {
pBlocks = realloc(pBlocks, tsize);
}
blockSize = tsize;
}
read(hfd, pBlocks, tsize);
if (!taosCheckChecksumWhole((uint8_t *)pBlocks, tsize)) {
dError("vid:%d sid:%d fileId:%d block part is broken", vnode, i, fileId);
goto _exit_failure;
}
assert(compInfo.numOfBlocks > 0);
// Loop to scan the blocks and merge blocks when necessary.
tsize = sizeof(SCompInfo) + compInfo.numOfBlocks * sizeof(SCompBlock) + sizeof(TSCKSUM);
pBuff = realloc(pBuff, tsize);
memset(pBuff, 0, tsize); // new blocks start empty; the loop below accumulates points into them
SCompInfo *pInfo = (SCompInfo *)pBuff;
SCompBlock *pNBlocks = (SCompBlock *)(pBuff + sizeof(SCompInfo));
int nCounter = 0;
for (int j = 0; j < compInfo.numOfBlocks; j++) {
// TODO : Check if it is the last block
// if (j == compInfo.numOfBlocks - 1) {}
if (pBlocks[j].numOfPoints + pNBlocks[nCounter].numOfPoints <= pObj->pointsPerFileBlock) {
// Merge current block to current new block
} else {
// Write new block to new data file
// pNBlocks[nCounter].
nCounter++;
}
}
}
tfree(pHeaders);
tfree(pBlocks);
tfree(pBuff);
close(hfd);
close(dfd);
close(lfd);
return 0;
_exit_failure:
tfree(pHeaders);
tfree(pBlocks);
tfree(pBuff);
if (hfd > 0) close(hfd);
if (dfd > 0) close(dfd);
if (lfd > 0) close(lfd);
return -1;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "taosmsg.h"
#include "tsqlfunction.h"
#include "vnode.h"
#include "vnodeDataFilterFunc.h"
bool less_i8(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)minval < pFilter->filterInfo.upperBndi);
}
bool less_i16(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)minval < pFilter->filterInfo.upperBndi);
}
bool less_i32(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)minval < pFilter->filterInfo.upperBndi);
}
bool less_i64(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)minval < pFilter->filterInfo.upperBndi);
}
bool less_ds(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)minval < pFilter->filterInfo.upperBndd);
}
bool less_dd(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)minval < pFilter->filterInfo.upperBndd);
}
//////////////////////////////////////////////////////////////////
bool large_i8(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool large_i16(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool large_i32(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool large_i64(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool large_ds(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)maxval > pFilter->filterInfo.lowerBndd);
}
bool large_dd(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)maxval > pFilter->filterInfo.lowerBndd);
}
/////////////////////////////////////////////////////////////////////
bool lessEqual_i8(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)minval <= pFilter->filterInfo.upperBndi);
}
bool lessEqual_i16(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)minval <= pFilter->filterInfo.upperBndi);
}
bool lessEqual_i32(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)minval <= pFilter->filterInfo.upperBndi);
}
bool lessEqual_i64(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)minval <= pFilter->filterInfo.upperBndi);
}
bool lessEqual_ds(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)minval <= pFilter->filterInfo.upperBndd);
}
bool lessEqual_dd(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)minval <= pFilter->filterInfo.upperBndd);
}
//////////////////////////////////////////////////////////////////////////
bool largeEqual_i8(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool largeEqual_i16(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool largeEqual_i32(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool largeEqual_i64(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool largeEqual_ds(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)maxval >= pFilter->filterInfo.lowerBndd);
}
bool largeEqual_dd(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)maxval >= pFilter->filterInfo.lowerBndd);
}
////////////////////////////////////////////////////////////////////////
bool equal_i8(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int8_t *)minval == *(int8_t *)maxval) {
return (*(int8_t *)minval == pFilter->filterInfo.lowerBndi);
} else { /* range filter */
assert(*(int8_t *)minval < *(int8_t *)maxval);
return *(int8_t *)minval <= pFilter->filterInfo.lowerBndi && *(int8_t *)maxval >= pFilter->filterInfo.lowerBndi;
}
}
bool equal_i16(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int16_t *)minval == *(int16_t *)maxval) {
return (*(int16_t *)minval == pFilter->filterInfo.lowerBndi);
} else { /* range filter */
assert(*(int16_t *)minval < *(int16_t *)maxval);
return *(int16_t *)minval <= pFilter->filterInfo.lowerBndi && *(int16_t *)maxval >= pFilter->filterInfo.lowerBndi;
}
}
bool equal_i32(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int32_t *)minval == *(int32_t *)maxval) {
return (*(int32_t *)minval == pFilter->filterInfo.lowerBndi);
} else { /* range filter */
assert(*(int32_t *)minval < *(int32_t *)maxval);
return *(int32_t *)minval <= pFilter->filterInfo.lowerBndi && *(int32_t *)maxval >= pFilter->filterInfo.lowerBndi;
}
}
bool equal_i64(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int64_t *)minval == *(int64_t *)maxval) {
return (*(int64_t *)minval == pFilter->filterInfo.lowerBndi);
} else { /* range filter */
assert(*(int64_t *)minval < *(int64_t *)maxval);
return *(int64_t *)minval <= pFilter->filterInfo.lowerBndi && *(int64_t *)maxval >= pFilter->filterInfo.lowerBndi;
}
}
bool equal_ds(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(float *)minval == *(float *)maxval) {
return (fabs(*(float *)minval - pFilter->filterInfo.lowerBndd) <= FLT_EPSILON);
} else { /* range filter */
assert(*(float *)minval < *(float *)maxval);
return *(float *)minval <= pFilter->filterInfo.lowerBndd && *(float *)maxval >= pFilter->filterInfo.lowerBndd;
}
}
bool equal_dd(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(double *)minval == *(double *)maxval) {
return (*(double *)minval == pFilter->filterInfo.lowerBndd);
} else { /* range filter */
assert(*(double *)minval < *(double *)maxval);
return *(double *)minval <= pFilter->filterInfo.lowerBndd && *(double *)maxval >= pFilter->filterInfo.lowerBndd;
}
}
bool equal_str(SColumnFilterElem *pFilter, char *minval, char *maxval) {
// the filter string is longer than the column's maximum length, so no data can qualify
if (pFilter->filterInfo.len > pFilter->bytes) {
return false;
}
return strncmp((char *)pFilter->filterInfo.pz, minval, pFilter->bytes) == 0;
}
bool equal_nchar(SColumnFilterElem *pFilter, char *minval, char *maxval) {
// the filter string is longer than the column's maximum length, so no data can qualify
if (pFilter->filterInfo.len > pFilter->bytes) {
return false;
}
return wcsncmp((wchar_t *)pFilter->filterInfo.pz, (wchar_t*) minval, pFilter->bytes/TSDB_NCHAR_SIZE) == 0;
}
////////////////////////////////////////////////////////////////
bool like_str(SColumnFilterElem *pFilter, char *minval, char *maxval) {
SPatternCompareInfo info = PATTERN_COMPARE_INFO_INITIALIZER;
return patternMatch((char *)pFilter->filterInfo.pz, minval, pFilter->bytes, &info) == TSDB_PATTERN_MATCH;
}
bool like_nchar(SColumnFilterElem* pFilter, char* minval, char *maxval) {
SPatternCompareInfo info = PATTERN_COMPARE_INFO_INITIALIZER;
return WCSPatternMatch((wchar_t*) pFilter->filterInfo.pz, (wchar_t*) minval, pFilter->bytes/TSDB_NCHAR_SIZE, &info) == TSDB_PATTERN_MATCH;
}
////////////////////////////////////////////////////////////////
/**
* If minval equals maxval, the function serves as a single-element filter, or all elements of the block
* are identical during the pre-filter stage. Otherwise it acts as a pre-filter over a list of elements.
*
* During the pre-filter stage, the function returns true if some value inside [minval, maxval] could
* satisfy the condition.
*/
bool nequal_i8(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int8_t *)minval == *(int8_t *)maxval) {
return (*(int8_t *)minval != pFilter->filterInfo.lowerBndi);
}
return true;
}
bool nequal_i16(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int16_t *)minval == *(int16_t *)maxval) {
return (*(int16_t *)minval != pFilter->filterInfo.lowerBndi);
}
return true;
}
bool nequal_i32(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int32_t *)minval == *(int32_t *)maxval) {
return (*(int32_t *)minval != pFilter->filterInfo.lowerBndi);
}
return true;
}
bool nequal_i64(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(int64_t *)minval == *(int64_t *)maxval) {
return (*(int64_t *)minval != pFilter->filterInfo.lowerBndi);
}
return true;
}
bool nequal_ds(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(float *)minval == *(float *)maxval) {
return (*(float *)minval != pFilter->filterInfo.lowerBndd);
}
return true;
}
bool nequal_dd(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (*(double *)minval == *(double *)maxval) {
return (*(double *)minval != pFilter->filterInfo.lowerBndd);
}
return true;
}
bool nequal_str(SColumnFilterElem *pFilter, char *minval, char *maxval) {
if (pFilter->filterInfo.len > pFilter->bytes) {
return true;
}
return strncmp((char *)pFilter->filterInfo.pz, minval, pFilter->bytes) != 0;
}
bool nequal_nchar(SColumnFilterElem *pFilter, char* minval, char *maxval) {
if (pFilter->filterInfo.len > pFilter->bytes) {
return true;
}
return wcsncmp((wchar_t *)pFilter->filterInfo.pz, (wchar_t*)minval, pFilter->bytes/TSDB_NCHAR_SIZE) != 0;
}
////////////////////////////////////////////////////////////////
bool rangeFilter_i32_ii(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)minval <= pFilter->filterInfo.upperBndi && *(int32_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i32_ee(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)minval < pFilter->filterInfo.upperBndi && *(int32_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i32_ie(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)minval < pFilter->filterInfo.upperBndi && *(int32_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i32_ei(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int32_t *)minval <= pFilter->filterInfo.upperBndi && *(int32_t *)maxval > pFilter->filterInfo.lowerBndi);
}
///////////////////////////////////////////////////////////////////////////////
bool rangeFilter_i8_ii(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)minval <= pFilter->filterInfo.upperBndi && *(int8_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i8_ee(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)minval < pFilter->filterInfo.upperBndi && *(int8_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i8_ie(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)minval < pFilter->filterInfo.upperBndi && *(int8_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i8_ei(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int8_t *)minval <= pFilter->filterInfo.upperBndi && *(int8_t *)maxval > pFilter->filterInfo.lowerBndi);
}
/////////////////////////////////////////////////////////////////////////////////////
bool rangeFilter_i16_ii(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)minval <= pFilter->filterInfo.upperBndi && *(int16_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i16_ee(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)minval < pFilter->filterInfo.upperBndi && *(int16_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i16_ie(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)minval < pFilter->filterInfo.upperBndi && *(int16_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i16_ei(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int16_t *)minval <= pFilter->filterInfo.upperBndi && *(int16_t *)maxval > pFilter->filterInfo.lowerBndi);
}
////////////////////////////////////////////////////////////////////////
bool rangeFilter_i64_ii(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)minval <= pFilter->filterInfo.upperBndi && *(int64_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i64_ee(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)minval < pFilter->filterInfo.upperBndi && *(int64_t *)maxval > pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i64_ie(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)minval < pFilter->filterInfo.upperBndi && *(int64_t *)maxval >= pFilter->filterInfo.lowerBndi);
}
bool rangeFilter_i64_ei(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(int64_t *)minval <= pFilter->filterInfo.upperBndi && *(int64_t *)maxval > pFilter->filterInfo.lowerBndi);
}
////////////////////////////////////////////////////////////////////////
bool rangeFilter_ds_ii(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)minval <= pFilter->filterInfo.upperBndd && *(float *)maxval >= pFilter->filterInfo.lowerBndd);
}
bool rangeFilter_ds_ee(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)minval < pFilter->filterInfo.upperBndd && *(float *)maxval > pFilter->filterInfo.lowerBndd);
}
bool rangeFilter_ds_ie(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)minval < pFilter->filterInfo.upperBndd && *(float *)maxval >= pFilter->filterInfo.lowerBndd);
}
bool rangeFilter_ds_ei(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(float *)minval <= pFilter->filterInfo.upperBndd && *(float *)maxval > pFilter->filterInfo.lowerBndd);
}
//////////////////////////////////////////////////////////////////////////
bool rangeFilter_dd_ii(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)minval <= pFilter->filterInfo.upperBndd && *(double *)maxval >= pFilter->filterInfo.lowerBndd);
}
bool rangeFilter_dd_ee(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)minval < pFilter->filterInfo.upperBndd && *(double *)maxval > pFilter->filterInfo.lowerBndd);
}
bool rangeFilter_dd_ie(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)minval < pFilter->filterInfo.upperBndd && *(double *)maxval >= pFilter->filterInfo.lowerBndd);
}
bool rangeFilter_dd_ei(SColumnFilterElem *pFilter, char *minval, char *maxval) {
return (*(double *)minval <= pFilter->filterInfo.upperBndd && *(double *)maxval > pFilter->filterInfo.lowerBndd);
}
////////////////////////////////////////////////////////////////////////////
bool (*filterFunc_i8[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
less_i8,
large_i8,
equal_i8,
lessEqual_i8,
largeEqual_i8,
nequal_i8,
NULL,
};
bool (*filterFunc_i16[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
less_i16,
large_i16,
equal_i16,
lessEqual_i16,
largeEqual_i16,
nequal_i16,
NULL,
};
bool (*filterFunc_i32[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
less_i32,
large_i32,
equal_i32,
lessEqual_i32,
largeEqual_i32,
nequal_i32,
NULL,
};
bool (*filterFunc_i64[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
less_i64,
large_i64,
equal_i64,
lessEqual_i64,
largeEqual_i64,
nequal_i64,
NULL,
};
bool (*filterFunc_ds[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
less_ds,
large_ds,
equal_ds,
lessEqual_ds,
largeEqual_ds,
nequal_ds,
NULL,
};
bool (*filterFunc_dd[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
less_dd,
large_dd,
equal_dd,
lessEqual_dd,
largeEqual_dd,
nequal_dd,
NULL,
};
bool (*filterFunc_str[])(SColumnFilterElem* pFilter, char* minval, char *maxval) = {
NULL,
NULL,
NULL,
equal_str,
NULL,
NULL,
nequal_str,
like_str,
};
bool (*filterFunc_nchar[])(SColumnFilterElem* pFilter, char* minval, char* maxval) = {
NULL,
NULL,
NULL,
equal_nchar,
NULL,
NULL,
nequal_nchar,
like_nchar,
};
bool (*rangeFilterFunc_i8[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
rangeFilter_i8_ee,
rangeFilter_i8_ie,
rangeFilter_i8_ei,
rangeFilter_i8_ii,
};
bool (*rangeFilterFunc_i16[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
rangeFilter_i16_ee,
rangeFilter_i16_ie,
rangeFilter_i16_ei,
rangeFilter_i16_ii,
};
bool (*rangeFilterFunc_i32[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
rangeFilter_i32_ee,
rangeFilter_i32_ie,
rangeFilter_i32_ei,
rangeFilter_i32_ii,
};
bool (*rangeFilterFunc_i64[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
rangeFilter_i64_ee,
rangeFilter_i64_ie,
rangeFilter_i64_ei,
rangeFilter_i64_ii,
};
bool (*rangeFilterFunc_ds[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
rangeFilter_ds_ee,
rangeFilter_ds_ie,
rangeFilter_ds_ei,
rangeFilter_ds_ii,
};
bool (*rangeFilterFunc_dd[])(SColumnFilterElem *pFilter, char *minval, char *maxval) = {
NULL,
rangeFilter_dd_ee,
rangeFilter_dd_ie,
rangeFilter_dd_ei,
rangeFilter_dd_ii,
};
__filter_func_t* vnodeGetRangeFilterFuncArray(int32_t type) {
switch(type) {
case TSDB_DATA_TYPE_BOOL: return rangeFilterFunc_i8;
case TSDB_DATA_TYPE_TINYINT: return rangeFilterFunc_i8;
case TSDB_DATA_TYPE_SMALLINT: return rangeFilterFunc_i16;
case TSDB_DATA_TYPE_INT: return rangeFilterFunc_i32;
case TSDB_DATA_TYPE_TIMESTAMP: //timestamp uses bigint filter
case TSDB_DATA_TYPE_BIGINT: return rangeFilterFunc_i64;
case TSDB_DATA_TYPE_FLOAT: return rangeFilterFunc_ds;
case TSDB_DATA_TYPE_DOUBLE: return rangeFilterFunc_dd;
default:return NULL;
}
}
__filter_func_t* vnodeGetValueFilterFuncArray(int32_t type) {
switch(type) {
case TSDB_DATA_TYPE_BOOL: return filterFunc_i8;
case TSDB_DATA_TYPE_TINYINT: return filterFunc_i8;
case TSDB_DATA_TYPE_SMALLINT: return filterFunc_i16;
case TSDB_DATA_TYPE_INT: return filterFunc_i32;
case TSDB_DATA_TYPE_TIMESTAMP: //timestamp uses bigint filter
case TSDB_DATA_TYPE_BIGINT: return filterFunc_i64;
case TSDB_DATA_TYPE_FLOAT: return filterFunc_ds;
case TSDB_DATA_TYPE_DOUBLE: return filterFunc_dd;
case TSDB_DATA_TYPE_BINARY: return filterFunc_str;
case TSDB_DATA_TYPE_NCHAR: return filterFunc_nchar;
default: return NULL;
}
}
bool vnodeSupportPrefilter(int32_t type) { return type != TSDB_DATA_TYPE_BINARY && type != TSDB_DATA_TYPE_NCHAR; }
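/*
 * Usage sketch (illustrative, kept out of the build): how a caller might pick
 * the "equal" filter for an INT column and apply it to the [min, max]
 * statistics of a candidate block. It assumes only what is visible above:
 * filterFunc_i32[] keeps equal_i32 at slot 3, and __filter_func_t is the
 * function-pointer type returned by vnodeGetValueFilterFuncArray().
 */
#if 0
static bool exampleApplyIntEqualFilter(SColumnFilterElem *pFilter, int32_t blockMin, int32_t blockMax) {
  __filter_func_t *pFuncs = vnodeGetValueFilterFuncArray(TSDB_DATA_TYPE_INT);
  if (pFuncs == NULL || pFuncs[3] == NULL) return false;  // slot 3 holds equal_i32 in filterFunc_i32[]
  // minval/maxval carry the block statistics; a true result means the block may contain qualified rows
  return pFuncs[3](pFilter, (char *)&blockMin, (char *)&blockMax);
}
#endif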
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "vnode.h"
#include "vnodeUtil.h"
#include "vnodeStatus.h"
extern void vnodeGetHeadTname(char *nHeadName, char *nLastName, int vnode, int fileId);
extern int vnodeReadColumnToMem(int fd, SCompBlock *pBlock, SField **fields, int col, char *data, int dataSize,
char *temp, char *buffer, int bufferSize);
extern int vnodeSendShellSubmitRspMsg(SShellObj *pObj, int code, int numOfPoints);
extern void vnodeGetHeadDataLname(char *headName, char *dataName, char *lastName, int vnode, int fileId);
extern int vnodeCreateEmptyCompFile(int vnode, int fileId);
extern int vnodeUpdateFreeSlot(SVnodeObj *pVnode);
extern SCacheBlock *vnodeGetFreeCacheBlock(SVnodeObj *pVnode);
extern int vnodeCreateNeccessaryFiles(SVnodeObj *pVnode);
#define KEY_AT_INDEX(payload, step, idx) (*(TSKEY *)((char *)(payload) + (step) * (idx)))
typedef struct {
void * signature;
SShellObj *pShell;
SMeterObj *pObj;
int retry;
TSKEY firstKey;
TSKEY lastKey;
int importedRows;
int commit; // start to commit if it is set to 1
int slot; // slot/block to start writing the import data
int pos; // pos to start writing the import data in the slot/block
TSKEY key;
// only for file
int numOfPoints;
int64_t offset; // offset in data file
char * payload;
char * opayload; // allocated space for payload from client
int rows;
} SImportInfo;
typedef struct {
// in .head file
SCompHeader *pHeader;
size_t pHeaderSize;
SCompInfo compInfo;
SCompBlock *pBlocks;
// in .data file
int blockId;
uint8_t blockLoadState;
SField *pField;
size_t pFieldSize;
SData *data[TSDB_MAX_COLUMNS];
char * buffer;
char *temp;
char * tempBuffer;
size_t tempBufferSize;
// Variables for sendfile
int64_t compInfoOffset;
int64_t nextNo0Offset; // next sid whose compInfoOffset > 0
int64_t hfSize;
int64_t driftOffset;
int oldNumOfBlocks;
int newNumOfBlocks;
int last;
} SImportHandle;
typedef struct {
int slot;
int pos;
int oslot; // old slot
TSKEY nextKey;
} SBlockIter;
typedef struct {
int64_t spos;
int64_t epos;
int64_t totalRows;
char * offset[];
} SMergeBuffer;
int vnodeImportData(SMeterObj *pObj, SImportInfo *pImport);
int vnodeFindKeyInCache(SImportInfo *pImport, int order) {
SMeterObj * pObj = pImport->pObj;
int code = 0;
SQuery query;
SCacheInfo *pInfo = (SCacheInfo *)pObj->pCache;
TSKEY key = order ? pImport->firstKey : pImport->lastKey;
memset(&query, 0, sizeof(query));
query.order.order = order;
query.skey = key;
query.ekey = order ? pImport->lastKey : pImport->firstKey;
vnodeSearchPointInCache(pObj, &query);
if (query.slot < 0) {
pImport->slot = pInfo->commitSlot;
if (pInfo->commitPoint >= pObj->pointsPerBlock) pImport->slot = (pImport->slot + 1) % pInfo->maxBlocks;
pImport->pos = 0;
pImport->key = 0;
dTrace("vid:%d sid:%d id:%s, key:%" PRId64 ", import to head of cache", pObj->vnode, pObj->sid, pObj->meterId, key);
code = 0;
} else {
pImport->slot = query.slot;
pImport->pos = query.pos;
pImport->key = query.key;
if (key != query.key) {
if (order == 0) {
// since pos is the position which has smaller key, data shall be imported after it
pImport->pos++;
if (pImport->pos >= pObj->pointsPerBlock) {
pImport->slot = (pImport->slot + 1) % pInfo->maxBlocks;
pImport->pos = 0;
}
} else {
if (pImport->pos < 0) pImport->pos = 0;
}
}
code = 0;
}
return code;
}
void vnodeGetValidDataRange(int vnode, TSKEY now, TSKEY *minKey, TSKEY *maxKey) {
SVnodeObj *pVnode = vnodeList + vnode;
int64_t delta = pVnode->cfg.daysPerFile * tsMsPerDay[(uint8_t)pVnode->cfg.precision];
int fid = now / delta;
*minKey = (fid - pVnode->maxFiles + 1) * delta;
*maxKey = (fid + 2) * delta - 1;
return;
}
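/*
 * Worked example (illustrative only, assuming millisecond precision so that
 * tsMsPerDay[precision] = 86400000): with cfg.daysPerFile = 10,
 * delta = 10 * 86400000 = 864000000 ms; for now = 1700000000000 this gives
 * fid = 1967, so the vnode accepts keys in
 * [(1967 - maxFiles + 1) * delta, (1967 + 2) * delta - 1], i.e. the span of
 * the retained files plus up to two file spans ahead of the current time.
 */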
int vnodeImportPoints(SMeterObj *pObj, char *cont, int contLen, char source, void *param, int sversion,
int *pNumOfPoints, TSKEY now) {
SSubmitMsg *pSubmit = (SSubmitMsg *)cont;
SVnodeObj * pVnode = vnodeList + pObj->vnode;
int rows = 0;
char * payload = NULL;
int code = TSDB_CODE_SUCCESS;
SCachePool *pPool = (SCachePool *)(pVnode->pCachePool);
SShellObj * pShell = (SShellObj *)param;
TSKEY firstKey, lastKey;
payload = pSubmit->payLoad;
rows = htons(pSubmit->numOfRows);
assert(rows > 0);
int expectedLen = rows * pObj->bytesPerPoint + sizeof(pSubmit->numOfRows);
if (expectedLen != contLen) {
dError("vid:%d sid:%d id:%s, invalid import, expected:%d, contLen:%d", pObj->vnode, pObj->sid, pObj->meterId,
expectedLen, contLen);
return TSDB_CODE_WRONG_MSG_SIZE;
}
// Check timestamp context.
TSKEY minKey = 0, maxKey = 0;
firstKey = KEY_AT_INDEX(payload, pObj->bytesPerPoint, 0);
lastKey = KEY_AT_INDEX(payload, pObj->bytesPerPoint, rows - 1);
assert(firstKey <= lastKey);
vnodeGetValidDataRange(pObj->vnode, now, &minKey, &maxKey);
if (firstKey < minKey || firstKey > maxKey || lastKey < minKey || lastKey > maxKey) {
dError(
"vid:%d sid:%d id:%s, invalid timestamp to import, rows:%d firstKey: %" PRId64 " lastKey: %" PRId64 " minAllowedKey:%" PRId64 " "
"maxAllowedKey:%" PRId64,
pObj->vnode, pObj->sid, pObj->meterId, rows, firstKey, lastKey, minKey, maxKey);
return TSDB_CODE_TIMESTAMP_OUT_OF_RANGE;
}
// forward to peers
if (pShell && pVnode->cfg.replications > 1) {
code = vnodeForwardToPeer(pObj, cont, contLen, TSDB_ACTION_IMPORT, sversion);
if (code != 0) return code;
}
if (pVnode->cfg.commitLog && source != TSDB_DATA_SOURCE_LOG) {
if (pVnode->logFd < 0) return TSDB_CODE_INVALID_COMMIT_LOG;
code = vnodeWriteToCommitLog(pObj, TSDB_ACTION_IMPORT, cont, contLen, sversion);
if (code != 0) return code;
}
/*
* The timestamps of all records in a submit payload are always in ascending order, guaranteed by the client, so only
* the first key needs to be checked here.
*/
if (firstKey > pObj->lastKey) { // Just call insert
code = vnodeInsertPoints(pObj, cont, contLen, TSDB_DATA_SOURCE_LOG, NULL, sversion, pNumOfPoints, now);
} else { // trigger import
if (sversion != pObj->sversion) {
dError("vid:%d sid:%d id:%s, invalid sversion, expected:%d received:%d", pObj->vnode, pObj->sid, pObj->meterId,
pObj->sversion, sversion);
return TSDB_CODE_OTHERS;
}
// check the table status for perform import historical data
if ((code = vnodeSetMeterInsertImportStateEx(pObj, TSDB_METER_STATE_IMPORTING)) != TSDB_CODE_SUCCESS) {
return code;
}
SImportInfo import = {0};
dTrace("vid:%d sid:%d id:%s, try to import %d rows data, firstKey:%" PRId64 ", lastKey:%" PRId64 ", object lastKey:%" PRId64,
pObj->vnode, pObj->sid, pObj->meterId, rows, firstKey, lastKey, pObj->lastKey);
import.firstKey = firstKey;
import.lastKey = lastKey;
import.pObj = pObj;
import.pShell = pShell;
import.payload = payload;
import.rows = rows;
// FIXME: mutex here seems meaningless and num here still can be changed
int32_t num = 0;
pthread_mutex_lock(&pVnode->vmutex);
num = pObj->numOfQueries;
pthread_mutex_unlock(&pVnode->vmutex);
int32_t commitInProcess = 0;
pthread_mutex_lock(&pPool->vmutex);
if (((commitInProcess = pPool->commitInProcess) == 1) || num > 0) {
// mutual exclusion with read (need to change here)
pthread_mutex_unlock(&pPool->vmutex);
vnodeClearMeterState(pObj, TSDB_METER_STATE_IMPORTING);
return TSDB_CODE_ACTION_IN_PROGRESS;
} else {
pPool->commitInProcess = 1;
pthread_mutex_unlock(&pPool->vmutex);
code = vnodeImportData(pObj, &import);
*pNumOfPoints = import.importedRows;
}
pVnode->version++;
vnodeClearMeterState(pObj, TSDB_METER_STATE_IMPORTING);
}
return code;
}
/* Function to search keys in a range
*
* Assumption: keys in payload are in ascending order
*
* @payload: data records, key in ascending order
* @step: bytes each record takes
* @rows: number of data records
* @skey: range start (included)
* @ekey: range end (included)
* @srow: output, start index of the records in range
* @nrows: output, number of records in range
*
* @return: 0 means data is found in the range
* -1 means no data is found in the range
*/
static int vnodeSearchKeyInRange(char *payload, int step, int rows, TSKEY skey, TSKEY ekey, int *srow, int *nrows) {
if (rows <= 0 || KEY_AT_INDEX(payload, step, 0) > ekey || KEY_AT_INDEX(payload, step, rows - 1) < skey || skey > ekey)
return -1;
int left = 0;
int right = rows - 1;
int mid;
// Binary search the first key in payload >= skey
do {
mid = (left + right) / 2;
if (skey < KEY_AT_INDEX(payload, step, mid)) {
right = mid;
} else if (skey > KEY_AT_INDEX(payload, step, mid)) {
left = mid + 1;
} else {
break;
}
} while (left < right);
if (skey <= KEY_AT_INDEX(payload, step, mid)) {
*srow = mid;
} else {
if (mid + 1 >= rows) {
return -1;
} else {
*srow = mid + 1;
}
}
assert(skey <= KEY_AT_INDEX(payload, step, *srow));
*nrows = 0;
for (int i = *srow; i < rows; i++) {
if (KEY_AT_INDEX(payload, step, i) <= ekey) {
(*nrows)++;
} else {
break;
}
}
if (*nrows == 0) return -1;
return 0;
}
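/*
 * Usage sketch (illustrative, kept out of the build): for a payload whose keys
 * are {5, 10, 20, 30} and whose records are `step` bytes wide, searching the
 * range [8, 25] yields srow = 1 and nrows = 2 (the records with keys 10 and
 * 20); a return value of -1 means no record of the payload falls in the range.
 */
#if 0
static void exampleSearchKeyInRange(char *payload, int step) {
  int srow = 0, nrows = 0;
  // keys {5, 10, 20, 30} in payload, range [8, 25] -> srow == 1, nrows == 2
  if (vnodeSearchKeyInRange(payload, step, 4, 8, 25, &srow, &nrows) == 0) {
    // process the nrows records starting at srow
  }
}
#endif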
int vnodeOpenMinFilesForImport(int vnode, int fid) {
char dname[TSDB_FILENAME_LEN] = "\0";
SVnodeObj * pVnode = vnodeList + vnode;
struct stat filestat;
int minFileSize;
minFileSize = TSDB_FILE_HEADER_LEN + sizeof(SCompHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM);
vnodeGetHeadDataLname(pVnode->cfn, dname, pVnode->lfn, vnode, fid);
// Open .head file
pVnode->hfd = open(pVnode->cfn, O_RDONLY);
if (pVnode->hfd < 0) {
dError("vid:%d, failed to open head file:%s, reason:%s", vnode, pVnode->cfn, strerror(errno));
taosLogError("vid:%d, failed to open head file:%s, reason:%s", vnode, pVnode->cfn, strerror(errno));
goto _error_open;
}
fstat(pVnode->hfd, &filestat);
if (filestat.st_size < minFileSize) {
dError("vid:%d, head file:%s is corrupted", vnode, pVnode->cfn);
taosLogError("vid:%d, head file:%s corrupted", vnode, pVnode->cfn);
goto _error_open;
}
// Open .data file
pVnode->dfd = open(dname, O_RDWR);
if (pVnode->dfd < 0) {
dError("vid:%d, failed to open data file:%s, reason:%s", vnode, dname, strerror(errno));
taosLogError("vid:%d, failed to open data file:%s, reason:%s", vnode, dname, strerror(errno));
goto _error_open;
}
fstat(pVnode->dfd, &filestat);
if (filestat.st_size < TSDB_FILE_HEADER_LEN) {
dError("vid:%d, data file:%s corrupted", vnode, dname);
taosLogError("vid:%d, data file:%s corrupted", vnode, dname);
goto _error_open;
}
// Open .last file
pVnode->lfd = open(pVnode->lfn, O_RDWR);
if (pVnode->lfd < 0) {
dError("vid:%d, failed to open last file:%s, reason:%s", vnode, pVnode->lfn, strerror(errno));
taosLogError("vid:%d, failed to open last file:%s, reason:%s", vnode, pVnode->lfn, strerror(errno));
goto _error_open;
}
fstat(pVnode->lfd, &filestat);
if (filestat.st_size < TSDB_FILE_HEADER_LEN) {
dError("vid:%d, last file:%s corrupted", vnode, pVnode->lfn);
taosLogError("vid:%d, last file:%s corrupted", vnode, pVnode->lfn);
goto _error_open;
}
return 0;
_error_open:
if (pVnode->hfd > 0) close(pVnode->hfd);
pVnode->hfd = 0;
if (pVnode->dfd > 0) close(pVnode->dfd);
pVnode->dfd = 0;
if (pVnode->lfd > 0) close(pVnode->lfd);
pVnode->lfd = 0;
return -1;
}
/* Function to open .t file and sendfile the first part
*/
int vnodeOpenTempFilesForImport(SImportHandle *pHandle, SMeterObj *pObj, int fid) {
char dHeadName[TSDB_FILENAME_LEN] = "\0";
SVnodeObj * pVnode = vnodeList + pObj->vnode;
struct stat filestat;
int sid;
// cfn: .head
if (readlink(pVnode->cfn, dHeadName, TSDB_FILENAME_LEN) < 0) return -1;
size_t len = strlen(dHeadName);
// switch head name
switch (dHeadName[len - 1]) {
case '0':
dHeadName[len - 1] = '1';
break;
case '1':
dHeadName[len - 1] = '0';
break;
default:
dError("vid: %d, fid: %d, head target filename not end with 0 or 1", pVnode->vnode, fid);
return -1;
}
vnodeGetHeadTname(pVnode->nfn, NULL, pVnode->vnode, fid);
if (symlink(dHeadName, pVnode->nfn) < 0) return -1;
pVnode->nfd = open(pVnode->nfn, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU | S_IRWXG | S_IRWXO);
if (pVnode->nfd < 0) {
dError("vid:%d, failed to open new head file:%s, reason:%s", pVnode->vnode, pVnode->nfn, strerror(errno));
taosLogError("vid:%d, failed to open new head file:%s, reason:%s", pVnode->vnode, pVnode->nfn, strerror(errno));
return -1;
}
fstat(pVnode->hfd, &filestat);
pHandle->hfSize = filestat.st_size;
// Find the next sid whose compInfoOffset > 0
for (sid = pObj->sid + 1; sid < pVnode->cfg.maxSessions; sid++) {
if (pHandle->pHeader[sid].compInfoOffset > 0) break;
}
pHandle->nextNo0Offset = (sid == pVnode->cfg.maxSessions) ? pHandle->hfSize : pHandle->pHeader[sid].compInfoOffset;
// FIXME: sendfile the original part
// TODO: Here we need to take the deleted-table case into consideration; this function
// just assumes that case is handled before it is called
if (pHandle->pHeader[pObj->sid].compInfoOffset > 0) {
pHandle->compInfoOffset = pHandle->pHeader[pObj->sid].compInfoOffset;
} else {
pHandle->compInfoOffset = pHandle->nextNo0Offset;
}
assert(pHandle->compInfoOffset <= pHandle->hfSize);
lseek(pVnode->hfd, 0, SEEK_SET);
lseek(pVnode->nfd, 0, SEEK_SET);
if (tsendfile(pVnode->nfd, pVnode->hfd, NULL, pHandle->compInfoOffset) < 0) {
return -1;
}
// Leave a SCompInfo space here
lseek(pVnode->nfd, sizeof(SCompInfo), SEEK_CUR);
return 0;
}
typedef enum { DATA_LOAD_TIMESTAMP = 0x1, DATA_LOAD_OTHER_DATA = 0x2 } DataLoadMod;
/* Function to load the data of a block as required by loadMod
*/
static int vnodeLoadNeededBlockData(SMeterObj *pObj, SImportHandle *pHandle, int blockId, uint8_t loadMod, int *code) {
size_t size;
SCompBlock *pBlock = pHandle->pBlocks + blockId;
*code = TSDB_CODE_SUCCESS;
SVnodeObj *pVnode = vnodeList + pObj->vnode;
int dfd = pBlock->last ? pVnode->lfd : pVnode->dfd;
if (pHandle->blockId != blockId) {
pHandle->blockId = blockId;
pHandle->blockLoadState = 0;
}
if (pHandle->blockLoadState == 0){ // Reload pField
size = sizeof(SField) * pBlock->numOfCols + sizeof(TSCKSUM);
if (pHandle->pFieldSize < size) {
pHandle->pField = (SField *)realloc((void *)(pHandle->pField), size);
if (pHandle->pField == NULL) {
dError("vid: %d, sid: %d, meterId: %s, failed to allocate memory, size: %ul", pObj->vnode, pObj->sid,
pObj->meterId, size);
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
return -1;
}
pHandle->pFieldSize = size;
}
lseek(dfd, pBlock->offset, SEEK_SET);
if (read(dfd, (void *)(pHandle->pField), pHandle->pFieldSize) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to read data file, size:%zu reason:%s", pVnode->vnode, pObj->sid,
pObj->meterId, pHandle->pFieldSize, strerror(errno));
*code = TSDB_CODE_FILE_CORRUPTED;
return -1;
}
if (!taosCheckChecksumWhole((uint8_t *)(pHandle->pField), pHandle->pFieldSize)) {
dError("vid:%d sid:%d meterId:%s, data file %s is broken since checksum mismatch", pVnode->vnode, pObj->sid,
pObj->meterId, pVnode->lfn);
*code = TSDB_CODE_FILE_CORRUPTED;
return -1;
}
}
{ // Allocate necessary buffer
size = pObj->bytesPerPoint * pObj->pointsPerFileBlock +
(sizeof(SData) + EXTRA_BYTES + sizeof(TSCKSUM)) * pObj->numOfColumns;
if (pHandle->buffer == NULL) {
pHandle->buffer = malloc(size);
if (pHandle->buffer == NULL) {
dError("vid: %d, sid: %d, meterId: %s, failed to allocate memory, size: %ul", pObj->vnode, pObj->sid,
pObj->meterId, size);
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
return -1;
}
// TODO: Init data
pHandle->data[0] = (SData *)(pHandle->buffer);
for (int col = 1; col < pObj->numOfColumns; col++) {
pHandle->data[col] = (SData *)((char *)(pHandle->data[col - 1]) + sizeof(SData) + EXTRA_BYTES +
sizeof(TSCKSUM) + pObj->pointsPerFileBlock * pObj->schema[col - 1].bytes);
}
}
if (pHandle->temp == NULL) {
pHandle->temp = malloc(size);
if (pHandle->temp == NULL) {
dError("vid: %d, sid: %d, meterId: %s, failed to allocate memory, size: %ul", pObj->vnode, pObj->sid,
pObj->meterId, size);
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
return -1;
}
}
if (pHandle->tempBuffer == NULL) {
pHandle->tempBufferSize = pObj->maxBytes * pObj->pointsPerFileBlock + EXTRA_BYTES + sizeof(TSCKSUM);
pHandle->tempBuffer = malloc(pHandle->tempBufferSize);
if (pHandle->tempBuffer == NULL) {
dError("vid: %d, sid: %d, meterId: %s, failed to allocate memory, size: %ul", pObj->vnode, pObj->sid,
pObj->meterId, pHandle->tempBufferSize);
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
return -1;
}
}
}
if ((loadMod & DATA_LOAD_TIMESTAMP) &&
!(pHandle->blockLoadState & DATA_LOAD_TIMESTAMP)) { // load only the timestamp column if it is not loaded yet
if (vnodeReadColumnToMem(dfd, pBlock, &(pHandle->pField), PRIMARYKEY_TIMESTAMP_COL_INDEX,
pHandle->data[PRIMARYKEY_TIMESTAMP_COL_INDEX]->data, sizeof(TSKEY) * pBlock->numOfPoints,
pHandle->temp, pHandle->tempBuffer, pHandle->tempBufferSize) < 0) {
*code = TSDB_CODE_FILE_CORRUPTED;
return -1;
}
pHandle->blockLoadState |= DATA_LOAD_TIMESTAMP;
}
if ((loadMod & DATA_LOAD_OTHER_DATA) && !(pHandle->blockLoadState & DATA_LOAD_OTHER_DATA)) { // load the other columns if they are not loaded yet
for (int col = 1; col < pBlock->numOfCols; col++) {
if (vnodeReadColumnToMem(dfd, pBlock, &(pHandle->pField), col, pHandle->data[col]->data,
pBlock->numOfPoints * pObj->schema[col].bytes, pHandle->temp, pHandle->tempBuffer,
pHandle->tempBufferSize) < 0) {
*code = TSDB_CODE_FILE_CORRUPTED;
return -1;
}
}
pHandle->blockLoadState |= DATA_LOAD_OTHER_DATA;
}
return 0;
}
static int vnodeCloseImportFiles(SMeterObj *pObj, SImportHandle *pHandle) {
SVnodeObj *pVnode = vnodeList + pObj->vnode;
char dpath[TSDB_FILENAME_LEN] = "\0";
SCompInfo compInfo;
#ifdef _ALPINE
off_t offset = 0;
#else
__off_t offset = 0;
#endif
if (pVnode->nfd > 0) {
offset = lseek(pVnode->nfd, 0, SEEK_CUR);
assert(offset == pHandle->nextNo0Offset + pHandle->driftOffset);
{ // Write the SCompInfo part
compInfo.uid = pObj->uid;
compInfo.last = pHandle->last;
compInfo.numOfBlocks = pHandle->newNumOfBlocks + pHandle->oldNumOfBlocks;
compInfo.delimiter = TSDB_VNODE_DELIMITER;
taosCalcChecksumAppend(0, (uint8_t *)(&compInfo), sizeof(SCompInfo));
lseek(pVnode->nfd, pHandle->compInfoOffset, SEEK_SET);
if (twrite(pVnode->nfd, (void *)(&compInfo), sizeof(SCompInfo)) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to wirte SCompInfo, reason:%s", pObj->vnode, pObj->sid, pObj->meterId,
strerror(errno));
return -1;
}
}
// Write the rest of the SCompBlock part
if (pHandle->hfSize > pHandle->nextNo0Offset) {
lseek(pVnode->nfd, 0, SEEK_END);
lseek(pVnode->hfd, pHandle->nextNo0Offset, SEEK_SET);
if (tsendfile(pVnode->nfd, pVnode->hfd, NULL, pHandle->hfSize - pHandle->nextNo0Offset) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to sendfile, size:%" PRId64 ", reason:%s", pObj->vnode, pObj->sid,
pObj->meterId, pHandle->hfSize - pHandle->nextNo0Offset, strerror(errno));
return -1;
}
}
// Write SCompHeader part
pHandle->pHeader[pObj->sid].compInfoOffset = pHandle->compInfoOffset;
for (int sid = pObj->sid + 1; sid < pVnode->cfg.maxSessions; ++sid) {
if (pHandle->pHeader[sid].compInfoOffset > 0) {
pHandle->pHeader[sid].compInfoOffset += pHandle->driftOffset;
}
}
taosCalcChecksumAppend(0, (uint8_t *)(pHandle->pHeader), pHandle->pHeaderSize);
lseek(pVnode->nfd, TSDB_FILE_HEADER_LEN, SEEK_SET);
if (twrite(pVnode->nfd, (void *)(pHandle->pHeader), pHandle->pHeaderSize) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to wirte SCompHeader part, size:%zu, reason:%s", pObj->vnode, pObj->sid,
pObj->meterId, pHandle->pHeaderSize, strerror(errno));
return -1;
}
}
// Close opened files
close(pVnode->dfd);
pVnode->dfd = 0;
close(pVnode->hfd);
pVnode->hfd = 0;
close(pVnode->lfd);
pVnode->lfd = 0;
if (pVnode->nfd > 0) {
close(pVnode->nfd);
pVnode->nfd = 0;
readlink(pVnode->cfn, dpath, TSDB_FILENAME_LEN);
rename(pVnode->nfn, pVnode->cfn);
remove(dpath);
}
return 0;
}
static void vnodeConvertRowsToCols(SMeterObj *pObj, const char *payload, int rows, SData *data[], int rowOffset) {
int sdataRow;
int offset;
for (int row = 0; row < rows; ++row) {
sdataRow = row + rowOffset;
offset = 0;
for (int col = 0; col < pObj->numOfColumns; ++col) {
memcpy(data[col]->data + sdataRow * pObj->schema[col].bytes, payload + pObj->bytesPerPoint * row + offset,
pObj->schema[col].bytes);
offset += pObj->schema[col].bytes;
}
}
}
static int vnodeMergeDataIntoFile(SImportInfo *pImport, const char *payload, int rows, int fid) {
SMeterObj * pObj = (SMeterObj *)(pImport->pObj);
SVnodeObj * pVnode = vnodeList + pObj->vnode;
SImportHandle importHandle;
size_t size = 0;
SData * data[TSDB_MAX_COLUMNS];
char * buffer = NULL;
SData * cdata[TSDB_MAX_COLUMNS];
char * cbuffer = NULL;
SCompBlock compBlock;
TSCKSUM checksum = 0;
int pointsImported = 0;
int code = TSDB_CODE_SUCCESS;
SCachePool * pPool = (SCachePool *)pVnode->pCachePool;
SCacheInfo * pInfo = (SCacheInfo *)(pObj->pCache);
TSKEY lastKeyImported = 0;
TSKEY delta = pVnode->cfg.daysPerFile * tsMsPerDay[(uint8_t)pVnode->cfg.precision];
TSKEY minFileKey = fid * delta;
TSKEY maxFileKey = minFileKey + delta - 1;
TSKEY firstKey = KEY_AT_INDEX(payload, pObj->bytesPerPoint, 0);
TSKEY lastKey = KEY_AT_INDEX(payload, pObj->bytesPerPoint, rows - 1);
assert(firstKey >= minFileKey && firstKey <= maxFileKey && lastKey >= minFileKey && lastKey <= maxFileKey);
// create necessary files
pVnode->commitFirstKey = firstKey;
if (vnodeCreateNeccessaryFiles(pVnode) < 0) return TSDB_CODE_OTHERS;
assert(pVnode->commitFileId == fid);
// Open the files needed for import: .head(hfd), .data(dfd), .last(lfd)
if (vnodeOpenMinFilesForImport(pObj->vnode, fid) < 0) return TSDB_CODE_FILE_CORRUPTED;
memset(&importHandle, 0, sizeof(SImportHandle));
{ // Load SCompHeader part from .head file
importHandle.pHeaderSize = sizeof(SCompHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM);
importHandle.pHeader = (SCompHeader *)malloc(importHandle.pHeaderSize);
if (importHandle.pHeader == NULL) {
dError("vid: %d, sid: %d, meterId: %s, failed to allocate memory, size: %ul", pObj->vnode, pObj->sid,
pObj->meterId, importHandle.pHeaderSize);
code = TSDB_CODE_SERV_OUT_OF_MEMORY;
goto _error_merge;
}
lseek(pVnode->hfd, TSDB_FILE_HEADER_LEN, SEEK_SET);
if (read(pVnode->hfd, (void *)(importHandle.pHeader), importHandle.pHeaderSize) < importHandle.pHeaderSize) {
dError("vid: %d, sid: %d, meterId: %s, fid: %d failed to read SCompHeader part, reason:%s", pObj->vnode,
pObj->sid, pObj->meterId, fid, strerror(errno));
code = TSDB_CODE_FILE_CORRUPTED;
goto _error_merge;
}
if (!taosCheckChecksumWhole((uint8_t *)(importHandle.pHeader), importHandle.pHeaderSize)) {
dError("vid: %d, sid: %d, meterId: %s, fid: %d SCompHeader part is broken", pObj->vnode, pObj->sid, pObj->meterId,
fid);
code = TSDB_CODE_FILE_CORRUPTED;
goto _error_merge;
}
}
{ // Initialize data[] and cdata[], which is used to hold data to write to data file
size = pObj->bytesPerPoint * pVnode->cfg.rowsInFileBlock + (sizeof(SData) + EXTRA_BYTES + sizeof(TSCKSUM)) * pObj->numOfColumns;
buffer = (char *)malloc(size);
if (buffer == NULL) {
dError("vid: %d, sid: %d, meterId: %s, failed to allocate memory, size: %ul", pObj->vnode, pObj->sid,
pObj->meterId, size);
code = TSDB_CODE_SERV_OUT_OF_MEMORY;
goto _error_merge;
}
cbuffer = (char *)malloc(size);
if (cbuffer == NULL) {
dError("vid: %d, sid: %d, meterId: %s, failed to allocate memory, size: %ul", pObj->vnode, pObj->sid,
pObj->meterId, size);
code = TSDB_CODE_SERV_OUT_OF_MEMORY;
goto _error_merge;
}
data[0] = (SData *)buffer;
cdata[0] = (SData *)cbuffer;
for (int col = 1; col < pObj->numOfColumns; col++) {
data[col] = (SData *)((char *)data[col - 1] + sizeof(SData) + EXTRA_BYTES + sizeof(TSCKSUM) +
pObj->pointsPerFileBlock * pObj->schema[col - 1].bytes);
cdata[col] = (SData *)((char *)cdata[col - 1] + sizeof(SData) + EXTRA_BYTES + sizeof(TSCKSUM) +
pObj->pointsPerFileBlock * pObj->schema[col - 1].bytes);
}
}
if (importHandle.pHeader[pObj->sid].compInfoOffset == 0) { // No data in this file, just write it
_write_empty_point:
if (vnodeOpenTempFilesForImport(&importHandle, pObj, fid) < 0) {
code = TSDB_CODE_OTHERS;
goto _error_merge;
}
importHandle.oldNumOfBlocks = 0;
importHandle.driftOffset += sizeof(SCompInfo);
lastKeyImported = lastKey;
for (int rowsWritten = 0; rowsWritten < rows;) {
int rowsToWrite = MIN(pVnode->cfg.rowsInFileBlock, (rows - rowsWritten) /* the rows left */);
vnodeConvertRowsToCols(pObj, payload + rowsWritten * pObj->bytesPerPoint, rowsToWrite, data, 0);
pointsImported += rowsToWrite;
compBlock.last = 1;
if (vnodeWriteBlockToFile(pObj, &compBlock, data, cdata, rowsToWrite) < 0) {
// TODO: deal with ERROR here
}
importHandle.last = compBlock.last;
checksum = taosCalcChecksum(checksum, (uint8_t *)(&compBlock), sizeof(SCompBlock));
twrite(pVnode->nfd, &compBlock, sizeof(SCompBlock));
importHandle.newNumOfBlocks++;
importHandle.driftOffset += sizeof(SCompBlock);
rowsWritten += rowsToWrite;
}
twrite(pVnode->nfd, &checksum, sizeof(TSCKSUM));
importHandle.driftOffset += sizeof(TSCKSUM);
} else { // There is existing data for this table in the file.
{ // load SCompInfo and SCompBlock part
lseek(pVnode->hfd, importHandle.pHeader[pObj->sid].compInfoOffset, SEEK_SET);
if (read(pVnode->hfd, (void *)(&(importHandle.compInfo)), sizeof(SCompInfo)) < sizeof(SCompInfo)) {
dError("vid:%d sid:%d meterId:%s, failed to read .head file, reason:%s", pVnode->vnode, pObj->sid,
pObj->meterId, strerror(errno));
code = TSDB_CODE_FILE_CORRUPTED;
goto _error_merge;
}
if ((importHandle.compInfo.delimiter != TSDB_VNODE_DELIMITER) ||
(!taosCheckChecksumWhole((uint8_t *)(&(importHandle.compInfo)), sizeof(SCompInfo)))) {
dError("vid:%d sid:%d meterId:%s, .head file %s is broken, delemeter:%x", pVnode->vnode, pObj->sid,
pObj->meterId, pVnode->cfn, importHandle.compInfo.delimiter);
code = TSDB_CODE_FILE_CORRUPTED;
goto _error_merge;
}
// Check the context of SCompInfo part
if (importHandle.compInfo.uid != pObj->uid) { // The data belongs to the other meter
goto _write_empty_point;
}
importHandle.oldNumOfBlocks = importHandle.compInfo.numOfBlocks;
importHandle.last = importHandle.compInfo.last;
size = sizeof(SCompBlock) * importHandle.compInfo.numOfBlocks + sizeof(TSCKSUM);
importHandle.pBlocks = (SCompBlock *)malloc(size);
if (importHandle.pBlocks == NULL) {
dError("vid:%d sid:%d meterId:%s, failed to allocate importHandle.pBlock, size:%ul", pVnode->vnode, pObj->sid,
pObj->meterId, size);
code = TSDB_CODE_SERV_OUT_OF_MEMORY;
goto _error_merge;
}
if (read(pVnode->hfd, (void *)(importHandle.pBlocks), size) < size) {
dError("vid:%d sid:%d meterId:%s, failed to read importHandle.pBlock, reason:%s", pVnode->vnode, pObj->sid,
pObj->meterId, strerror(errno));
code = TSDB_CODE_FILE_CORRUPTED;
goto _error_merge;
}
if (!taosCheckChecksumWhole((uint8_t *)(importHandle.pBlocks), size)) {
dError("vid:%d sid:%d meterId:%s, pBlock part is broken in %s", pVnode->vnode, pObj->sid, pObj->meterId,
pVnode->cfn);
code = TSDB_CODE_FILE_CORRUPTED;
goto _error_merge;
}
}
/* Now we have _payload_ and _importHandle.pBlocks_; merge the payload into importHandle.pBlocks.
*
* Input: payload, pObj->bytesPerPoint, rows, importHandle.pBlocks
*/
{
int payloadIter = 0;
SBlockIter blockIter = {0, 0, 0, 0};
while (1) {
if (payloadIter >= rows) { // payload end, break
// write the remaining blocks to the file
if (pVnode->nfd > 0) {
int blocksLeft = importHandle.compInfo.numOfBlocks - blockIter.oslot;
if (blocksLeft > 0) {
checksum = taosCalcChecksum(checksum, (uint8_t *)(importHandle.pBlocks + blockIter.oslot),
sizeof(SCompBlock) * blocksLeft);
if (twrite(pVnode->nfd, (void *)(importHandle.pBlocks + blockIter.oslot),
sizeof(SCompBlock) * blocksLeft) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to write %s file, size:%ul, reason:%s", pVnode->vnode,
pObj->sid, pObj->meterId, pVnode->nfn, sizeof(SCompBlock) * blocksLeft, strerror(errno));
code = TSDB_CODE_OTHERS;
goto _error_merge;
}
}
if (twrite(pVnode->nfd, (void *)(&checksum), sizeof(TSCKSUM)) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to write %s file, size:%ul, reason:%s", pVnode->vnode, pObj->sid,
pObj->meterId, pVnode->nfn, sizeof(TSCKSUM), strerror(errno));
code = TSDB_CODE_OTHERS;
goto _error_merge;
}
}
break;
}
if (blockIter.slot >= importHandle.compInfo.numOfBlocks) { // blocks end, break
// Should never come here
assert(false);
}
TSKEY key = KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter);
{ // Binary search for the (slot, pos) whose key is >= key, and determine nextKey
int left = blockIter.slot;
int right = importHandle.compInfo.numOfBlocks - 1;
TSKEY minKey = importHandle.pBlocks[left].keyFirst;
TSKEY maxKey = importHandle.pBlocks[right].keyLast;
assert(minKey <= maxKey);
if (key < minKey) { // Case 1. write just ahead the blockIter.slot
blockIter.slot = left;
blockIter.pos = 0;
blockIter.nextKey = minKey;
} else if (key > maxKey) { // Case 2. write to the end
if (importHandle.pBlocks[right].last) { // Case 2.1 last block in .last file, need to merge
assert(importHandle.last != 0);
importHandle.last = 0;
blockIter.slot = right;
blockIter.pos = importHandle.pBlocks[right].numOfPoints;
} else { // Case 2.2 just write after the last block
blockIter.slot = right + 1;
blockIter.pos = 0;
}
blockIter.nextKey = maxFileKey + 1;
} else { // Case 3. need to search the block for slot and pos
if (key == minKey || key == maxKey) {
if (tsAffectedRowsMod) pointsImported++;
payloadIter++;
continue;
}
// Here: minKey < key < maxKey
int mid;
TSKEY blockMinKey;
TSKEY blockMaxKey;
// Binary search the slot
do {
mid = (left + right) / 2;
blockMinKey = importHandle.pBlocks[mid].keyFirst;
blockMaxKey = importHandle.pBlocks[mid].keyLast;
assert(blockMinKey <= blockMaxKey);
if (key < blockMinKey) {
right = mid;
} else if (key > blockMaxKey) {
left = mid + 1;
} else { /* blockMinKey <= key <= blockMaxKey */
break;
}
} while (left < right);
if (key == blockMinKey || key == blockMaxKey) { // duplicate key
if (tsAffectedRowsMod) pointsImported++;
payloadIter++;
continue;
}
// Get the slot
if (key > blockMaxKey) { /* pos = 0 or pos = ? */
blockIter.slot = mid + 1;
} else { /* key < blockMinKey (pos = 0) || (key > blockMinKey && key < blockMaxKey) (pos=?) */
blockIter.slot = mid;
}
// Get the pos
assert(blockIter.slot < importHandle.compInfo.numOfBlocks);
if (key == importHandle.pBlocks[blockIter.slot].keyFirst ||
key == importHandle.pBlocks[blockIter.slot].keyLast) {
if (tsAffectedRowsMod) pointsImported++;
payloadIter++;
continue;
}
assert(key < importHandle.pBlocks[blockIter.slot].keyLast);
if (key < importHandle.pBlocks[blockIter.slot].keyFirst) {
blockIter.pos = 0;
blockIter.nextKey = importHandle.pBlocks[blockIter.slot].keyFirst;
} else {
SCompBlock *pBlock = importHandle.pBlocks + blockIter.slot;
if (pBlock->sversion != pObj->sversion) { /*TODO*/
}
if (vnodeLoadNeededBlockData(pObj, &importHandle, blockIter.slot, DATA_LOAD_TIMESTAMP, &code) < 0) {
goto _error_merge;
}
int pos = (*vnodeSearchKeyFunc[pObj->searchAlgorithm])(
importHandle.data[PRIMARYKEY_TIMESTAMP_COL_INDEX]->data, pBlock->numOfPoints, key, TSQL_SO_ASC);
assert(pos != 0);
if (KEY_AT_INDEX(importHandle.data[PRIMARYKEY_TIMESTAMP_COL_INDEX]->data, sizeof(TSKEY), pos) == key) {
if (tsAffectedRowsMod) pointsImported++;
payloadIter++;
continue;
}
blockIter.pos = pos;
blockIter.nextKey = (blockIter.slot + 1 < importHandle.compInfo.numOfBlocks)
? importHandle.pBlocks[blockIter.slot + 1].keyFirst
: maxFileKey + 1;
// Need to merge with this block
if (importHandle.pBlocks[blockIter.slot].last) { // this is to merge with the last block
assert((blockIter.slot == (importHandle.compInfo.numOfBlocks - 1)));
importHandle.last = 0;
}
}
}
}
int aslot = MIN(blockIter.slot, importHandle.compInfo.numOfBlocks - 1);
int64_t sversion = importHandle.pBlocks[aslot].sversion;
if (sversion != pObj->sversion) {
code = TSDB_CODE_OTHERS;
goto _error_merge;
}
// Open the new .t file if not opened yet.
if (pVnode->nfd <= 0) {
if (vnodeOpenTempFilesForImport(&importHandle, pObj, fid) < 0) {
code = TSDB_CODE_OTHERS;
goto _error_merge;
}
}
if (blockIter.slot > blockIter.oslot) { // write blocks in range [blockIter.oslot, blockIter.slot) to .t file
checksum = taosCalcChecksum(checksum, (uint8_t *)(importHandle.pBlocks + blockIter.oslot),
sizeof(SCompBlock) * (blockIter.slot - blockIter.oslot));
if (twrite(pVnode->nfd, (void *)(importHandle.pBlocks + blockIter.oslot),
sizeof(SCompBlock) * (blockIter.slot - blockIter.oslot)) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to write %s file, size:%ul, reason:%s", pVnode->vnode, pObj->sid,
pObj->meterId, pVnode->nfn, sizeof(SCompBlock) * (blockIter.slot - blockIter.oslot),
strerror(errno));
code = TSDB_CODE_OTHERS;
goto _error_merge;
}
blockIter.oslot = blockIter.slot;
}
if (blockIter.pos == 0) { // No need to merge
// copy payload part to data
int rowOffset = 0;
for (; payloadIter < rows; rowOffset++) {
if (KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter) >= blockIter.nextKey) break;
vnodeConvertRowsToCols(pObj, payload + pObj->bytesPerPoint * payloadIter, 1, data, rowOffset);
pointsImported++;
lastKeyImported = KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter);
payloadIter++;
}
// write directly to .data file
compBlock.last = 0;
if (vnodeWriteBlockToFile(pObj, &compBlock, data, cdata, rowOffset) < 0) {
// TODO: Deal with the ERROR here
}
checksum = taosCalcChecksum(checksum, (uint8_t *)(&compBlock), sizeof(SCompBlock));
if (twrite(pVnode->nfd, &compBlock, sizeof(SCompBlock)) < 0) {
// TODO : deal with the ERROR here
}
importHandle.newNumOfBlocks++;
importHandle.driftOffset += sizeof(SCompBlock);
} else { // Merge block and payload from payloadIter
if (vnodeLoadNeededBlockData(pObj, &importHandle, blockIter.slot,
DATA_LOAD_TIMESTAMP | DATA_LOAD_OTHER_DATA, &code) < 0) { // Load necessary blocks
goto _error_merge;
}
importHandle.oldNumOfBlocks--;
importHandle.driftOffset -= sizeof(SCompBlock);
int rowOffset = blockIter.pos; // counter for data
// Copy the front part
for (int col = 0; col < pObj->numOfColumns; col++) {
memcpy((void *)(data[col]->data), (void *)(importHandle.data[col]->data),
pObj->schema[col].bytes * blockIter.pos);
}
// Merge part
while (1) {
if (rowOffset >= pVnode->cfg.rowsInFileBlock) { // data full in a block to commit
compBlock.last = 0;
if (vnodeWriteBlockToFile(pObj, &compBlock, data, cdata, rowOffset) < 0) {
// TODO : deal with the ERROR here
}
checksum = taosCalcChecksum(checksum, (uint8_t *)(&compBlock), sizeof(SCompBlock));
if (twrite(pVnode->nfd, (void *)(&compBlock), sizeof(SCompBlock)) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to write %s file, size:%ul, reason:%s", pVnode->vnode,
pObj->sid, pObj->meterId, pVnode->nfn, sizeof(SCompBlock), strerror(errno));
goto _error_merge;
}
importHandle.newNumOfBlocks++;
importHandle.driftOffset += sizeof(SCompBlock);
rowOffset = 0;
}
if ((payloadIter >= rows || KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter) >= blockIter.nextKey) &&
blockIter.pos >= importHandle.pBlocks[blockIter.slot].numOfPoints)
break;
if (payloadIter >= rows ||
KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter) >= blockIter.nextKey) { // payload end
for (int col = 0; col < pObj->numOfColumns; col++) {
memcpy(data[col]->data + rowOffset * pObj->schema[col].bytes,
importHandle.data[col]->data + pObj->schema[col].bytes * blockIter.pos, pObj->schema[col].bytes);
}
blockIter.pos++;
rowOffset++;
} else if (blockIter.pos >= importHandle.pBlocks[blockIter.slot].numOfPoints) { // block end
vnodeConvertRowsToCols(pObj, payload + pObj->bytesPerPoint * payloadIter, 1, data, rowOffset);
pointsImported++;
lastKeyImported = KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter);
payloadIter++;
rowOffset++;
} else {
if (KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter) ==
KEY_AT_INDEX(importHandle.data[PRIMARYKEY_TIMESTAMP_COL_INDEX]->data, sizeof(TSKEY),
blockIter.pos)) { // duplicate key
if (tsAffectedRowsMod) pointsImported++;
payloadIter++;
continue;
} else if (KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter) <
KEY_AT_INDEX(importHandle.data[PRIMARYKEY_TIMESTAMP_COL_INDEX]->data, sizeof(TSKEY),
blockIter.pos)) {
vnodeConvertRowsToCols(pObj, payload + pObj->bytesPerPoint * payloadIter, 1, data, rowOffset);
pointsImported++;
lastKeyImported = KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter);
payloadIter++;
rowOffset++;
} else {
for (int col = 0; col < pObj->numOfColumns; col++) {
memcpy(data[col]->data + rowOffset * pObj->schema[col].bytes,
importHandle.data[col]->data + pObj->schema[col].bytes * blockIter.pos,
pObj->schema[col].bytes);
}
blockIter.pos++;
rowOffset++;
}
}
}
if (rowOffset > 0) { // commit the remaining data in the block
compBlock.last = 0;
if (vnodeWriteBlockToFile(pObj, &compBlock, data, cdata, rowOffset) < 0) {
// TODO : deal with the ERROR here
}
checksum = taosCalcChecksum(checksum, (uint8_t *)(&compBlock), sizeof(SCompBlock));
if (twrite(pVnode->nfd, (void *)(&compBlock), sizeof(SCompBlock)) < 0) {
dError("vid:%d sid:%d meterId:%s, failed to write %s file, size:%ul, reason:%s", pVnode->vnode, pObj->sid,
pObj->meterId, pVnode->nfn, sizeof(SCompBlock), strerror(errno));
goto _error_merge;
}
importHandle.newNumOfBlocks++;
importHandle.driftOffset += sizeof(SCompBlock);
rowOffset = 0;
}
blockIter.slot++;
blockIter.oslot = blockIter.slot;
}
}
}
}
// Write the SCompInfo part
if (vnodeCloseImportFiles(pObj, &importHandle) < 0) {
code = TSDB_CODE_OTHERS;
goto _error_merge;
}
pImport->importedRows += pointsImported;
pthread_mutex_lock(&(pPool->vmutex));
if (pInfo->numOfBlocks > 0) {
int slot = (pInfo->currentSlot - pInfo->numOfBlocks + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
TSKEY firstKeyInCache = *((TSKEY *)(pInfo->cacheBlocks[slot]->offset[0]));
// data may be in the committed cache; the cache shall be released
if (lastKeyImported > firstKeyInCache) {
while (slot != pInfo->commitSlot) {
SCacheBlock *pCacheBlock = pInfo->cacheBlocks[slot];
vnodeFreeCacheBlock(pCacheBlock);
slot = (slot + 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
}
if (pInfo->commitPoint == pObj->pointsPerBlock) {
if (pInfo->cacheBlocks[pInfo->commitSlot]->pMeterObj == pObj) {
vnodeFreeCacheBlock(pInfo->cacheBlocks[pInfo->commitSlot]);
}
}
}
}
pthread_mutex_unlock(&(pPool->vmutex));
// TODO: free the allocated memory
tfree(buffer);
tfree(cbuffer);
tfree(importHandle.pHeader);
tfree(importHandle.pBlocks);
tfree(importHandle.pField);
tfree(importHandle.buffer);
tfree(importHandle.temp);
tfree(importHandle.tempBuffer);
return code;
_error_merge:
tfree(buffer);
tfree(cbuffer);
tfree(importHandle.pHeader);
tfree(importHandle.pBlocks);
tfree(importHandle.pField);
tfree(importHandle.buffer);
tfree(importHandle.temp);
tfree(importHandle.tempBuffer);
close(pVnode->dfd);
pVnode->dfd = 0;
close(pVnode->hfd);
pVnode->hfd = 0;
close(pVnode->lfd);
pVnode->lfd = 0;
if (pVnode->nfd > 0) {
close(pVnode->nfd);
pVnode->nfd = 0;
remove(pVnode->nfn);
}
return code;
}
#define FORWARD_ITER(iter, step, slotLimit, posLimit) \
{ \
if ((iter.pos) + (step) < (posLimit)) { \
(iter.pos) = (iter.pos) + (step); \
} else { \
(iter.pos) = 0; \
(iter.slot) = ((iter.slot) + 1) % (slotLimit); \
} \
}
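/*
 * Worked example (illustrative only): with slotLimit = maxBlocks = 4 and
 * posLimit = pointsPerBlock = 100, FORWARD_ITER(iter, 1, 4, 100) moves
 * (slot = 2, pos = 98) to (2, 99), then wraps (2, 99) to (3, 0), and wraps
 * (3, 99) around to (0, 0).
 */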
int isCacheEnd(SBlockIter iter, SMeterObj *pTable) {
SCacheInfo *pInfo = (SCacheInfo *)(pTable->pCache);
int slot = 0;
int pos = 0;
if (pInfo->cacheBlocks[pInfo->currentSlot]->numOfPoints == pTable->pointsPerBlock) {
slot = (pInfo->currentSlot + 1) % (pInfo->maxBlocks);
pos = 0;
} else {
slot = pInfo->currentSlot;
pos = pInfo->cacheBlocks[pInfo->currentSlot]->numOfPoints;
}
return ((iter.slot == slot) && (iter.pos == pos));
}
static void vnodeFlushMergeBuffer(SMergeBuffer *pBuffer, SBlockIter *pWriteIter, SBlockIter *pCacheIter,
SMeterObj *pObj, SCacheInfo *pInfo, int checkBound) {
// Function to flush the merge buffer data to cache
if (pWriteIter->pos == pObj->pointsPerBlock) {
pWriteIter->pos = 0;
pWriteIter->slot = (pWriteIter->slot + 1) % pInfo->maxBlocks;
}
while (pBuffer->spos != pBuffer->epos) {
if (checkBound && pWriteIter->slot == pCacheIter->slot && pWriteIter->pos == pCacheIter->pos) break;
for (int col = 0; col < pObj->numOfColumns; col++) {
memcpy(pInfo->cacheBlocks[pWriteIter->slot]->offset[col] + pObj->schema[col].bytes * pWriteIter->pos,
pBuffer->offset[col] + pObj->schema[col].bytes * pBuffer->spos, pObj->schema[col].bytes);
}
if (pWriteIter->pos + 1 < pObj->pointsPerBlock) {
(pWriteIter->pos)++;
} else {
pInfo->cacheBlocks[pWriteIter->slot]->numOfPoints = pWriteIter->pos + 1;
pWriteIter->slot = (pWriteIter->slot + 1) % pInfo->maxBlocks;
pWriteIter->pos = 0;
}
pBuffer->spos = (pBuffer->spos + 1) % pBuffer->totalRows;
}
if ((!checkBound) && pWriteIter->pos != 0) {
pInfo->cacheBlocks[pWriteIter->slot]->numOfPoints = pWriteIter->pos;
}
}
int vnodeImportDataToCache(SImportInfo *pImport, const char *payload, const int rows) {
SMeterObj * pObj = pImport->pObj;
SVnodeObj * pVnode = vnodeList + pObj->vnode;
int code = -1;
SCacheInfo * pInfo = (SCacheInfo *)(pObj->pCache);
int payloadIter;
SCachePool * pPool = (SCachePool *)(pVnode->pCachePool);
int isCacheIterEnd = 0;
int spayloadIter = 0;
int isAppendData = 0;
int rowsImported = 0;
int totalRows = 0;
size_t size = 0;
SMergeBuffer *pBuffer = NULL;
TSKEY firstKey = KEY_AT_INDEX(payload, pObj->bytesPerPoint, 0);
TSKEY lastKey = KEY_AT_INDEX(payload, pObj->bytesPerPoint, rows - 1);
assert(firstKey <= lastKey && firstKey > pObj->lastKeyOnFile);
// TODO: make this condition less strict
if (pObj->freePoints < rows || pObj->freePoints < (pObj->pointsPerBlock << 1)) { // No free room to hold the data
dError("vid:%d sid:%d id:%s, import failed, cache is full, freePoints:%d", pObj->vnode, pObj->sid, pObj->meterId,
pObj->freePoints);
pImport->importedRows = 0;
pImport->commit = 1;
code = TSDB_CODE_ACTION_IN_PROGRESS;
return code;
}
if (pInfo->numOfBlocks == 0) {
if (vnodeAllocateCacheBlock(pObj) < 0) {
pImport->importedRows = 0;
pImport->commit = 1;
code = TSDB_CODE_ACTION_IN_PROGRESS;
return code;
}
}
// Find the first importable record from payload
pImport->lastKey = lastKey;
for (payloadIter = 0; payloadIter < rows; payloadIter++) {
TSKEY key = KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter);
if (key == pObj->lastKey) {
if (tsAffectedRowsMod) rowsImported++;
continue;
}
if (key > pObj->lastKey) { // Just as insert
pImport->slot = pInfo->currentSlot;
pImport->pos = pInfo->cacheBlocks[pImport->slot]->numOfPoints;
isCacheIterEnd = 1;
break;
} else {
pImport->firstKey = key;
if (vnodeFindKeyInCache(pImport, 1) < 0) {
goto _exit;
}
if (pImport->firstKey != pImport->key) break;
if (tsAffectedRowsMod) rowsImported++;
}
}
if (payloadIter == rows) {
pImport->importedRows += rowsImported;
code = 0;
goto _exit;
}
spayloadIter = payloadIter;
if (pImport->pos == pObj->pointsPerBlock) assert(isCacheIterEnd);
// Allocate a merge buffer to serve as the staging area
totalRows = pObj->pointsPerBlock + rows - payloadIter + 1;
size = sizeof(SMergeBuffer) + sizeof(char *) * pObj->numOfColumns + pObj->bytesPerPoint * totalRows;
pBuffer = (SMergeBuffer *)malloc(size);
if (pBuffer == NULL) {
dError("vid:%d sid:%d meterId:%s, failed to allocate memory, size:%d", pObj->vnode, pObj->sid, pObj->meterId, size);
return TSDB_CODE_SERV_OUT_OF_MEMORY;
}
pBuffer->spos = 0;
pBuffer->epos = 0;
pBuffer->totalRows = totalRows;
pBuffer->offset[0] = (char *)pBuffer + sizeof(SMergeBuffer) + sizeof(char *) * pObj->numOfColumns;
for (int col = 1; col < pObj->numOfColumns; col++) {
pBuffer->offset[col] = pBuffer->offset[col - 1] + pObj->schema[col - 1].bytes * totalRows;
}
// TODO: take pImport->pos = pObj->pointsPerBlock into consideration
{ // Do the actual merge work
SBlockIter cacheIter = {pImport->slot, pImport->pos, 0, 0}; // Iter to traverse old cache data
SBlockIter writeIter = {pImport->slot, pImport->pos, 0, 0}; // Iter to write data to cache
int availPoints = pObj->pointsPerBlock - pInfo->cacheBlocks[pInfo->currentSlot]->numOfPoints;
assert(availPoints >= 0);
while (1) {
if ((payloadIter >= rows) && isCacheIterEnd) break;
if ((pBuffer->epos + 1) % pBuffer->totalRows == pBuffer->spos) { // merge buffer is full, flush
vnodeFlushMergeBuffer(pBuffer, &writeIter, &cacheIter, pObj, pInfo, 1);
}
TSKEY payloadKey = (payloadIter < rows) ? KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter) : INT64_MAX;
TSKEY cacheKey = (isCacheIterEnd) ? INT64_MAX : KEY_AT_INDEX(pInfo->cacheBlocks[cacheIter.slot]->offset[0], sizeof(TSKEY), cacheIter.pos);
if (cacheKey < payloadKey) { // payload ended, or cacheIter not at end and cacheKey < payloadKey: consume the cache record
for (int col = 0; col < pObj->numOfColumns; col++) {
memcpy(pBuffer->offset[col] + pObj->schema[col].bytes * pBuffer->epos,
pInfo->cacheBlocks[cacheIter.slot]->offset[col] + pObj->schema[col].bytes * cacheIter.pos,
pObj->schema[col].bytes);
}
FORWARD_ITER(cacheIter, 1, pInfo->maxBlocks, pObj->pointsPerBlock);
isCacheIterEnd = isCacheEnd(cacheIter, pObj);
} else if (cacheKey > payloadKey) { // cacheIter ended, or payloadIter not at end and payloadKey < cacheKey: consume the payload record
if (availPoints == 0) { // Need to allocate a new cache block
pthread_mutex_lock(&(pPool->vmutex));
// TODO: Need to check if there are enough slots to hold a new one
SCacheBlock *pNewBlock = vnodeGetFreeCacheBlock(pVnode);
if (pNewBlock == NULL) { // Failed to allocate a new cache block, need to commit and loop over the remaining cache records
pthread_mutex_unlock(&(pPool->vmutex));
payloadIter = rows;
code = TSDB_CODE_ACTION_IN_PROGRESS;
pImport->commit = 1;
continue;
}
assert(pInfo->numOfBlocks <= pInfo->maxBlocks);
if (pInfo->numOfBlocks == pInfo->maxBlocks) {
vnodeFreeCacheBlock(pInfo->cacheBlocks[(pInfo->currentSlot + 1) % pInfo->maxBlocks]);
}
pNewBlock->pMeterObj = pObj;
pNewBlock->offset[0] = (char *)pNewBlock + sizeof(SCacheBlock) + sizeof(char *) * pObj->numOfColumns;
for (int col = 1; col < pObj->numOfColumns; col++)
pNewBlock->offset[col] = pNewBlock->offset[col - 1] + pObj->schema[col - 1].bytes * pObj->pointsPerBlock;
int newSlot = (writeIter.slot + 1) % pInfo->maxBlocks;
pInfo->blocks++;
int tblockId = pInfo->blocks;
if (writeIter.slot != pInfo->currentSlot) {
for (int tslot = pInfo->currentSlot; tslot != writeIter.slot;) {
int nextSlot = (tslot + 1) % pInfo->maxBlocks;
pInfo->cacheBlocks[nextSlot] = pInfo->cacheBlocks[tslot];
pInfo->cacheBlocks[nextSlot]->slot = nextSlot;
pInfo->cacheBlocks[nextSlot]->blockId = tblockId--;
tslot = (tslot - 1 + pInfo->maxBlocks) % pInfo->maxBlocks;
}
}
int index = pNewBlock->index;
if (cacheIter.slot == writeIter.slot) {
pNewBlock->numOfPoints = pInfo->cacheBlocks[cacheIter.slot]->numOfPoints;
int pointsLeft = pInfo->cacheBlocks[cacheIter.slot]->numOfPoints - cacheIter.pos;
if (pointsLeft > 0) {
for (int col = 0; col < pObj->numOfColumns; col++) {
memcpy((void *)(pNewBlock->offset[col] + pObj->schema[col].bytes*cacheIter.pos),
pInfo->cacheBlocks[cacheIter.slot]->offset[col] + pObj->schema[col].bytes * cacheIter.pos,
pObj->schema[col].bytes * pointsLeft);
}
}
}
pNewBlock->blockId = tblockId;
pNewBlock->slot = newSlot;
pNewBlock->index = index;
pInfo->cacheBlocks[newSlot] = pNewBlock;
pInfo->numOfBlocks++;
pInfo->unCommittedBlocks++;
pInfo->currentSlot = (pInfo->currentSlot + 1) % pInfo->maxBlocks;
pthread_mutex_unlock(&(pPool->vmutex));
cacheIter.slot = (cacheIter.slot + 1) % pInfo->maxBlocks;
// the cached records were shifted one slot forward, so advance the cache iterator as well
availPoints = pObj->pointsPerBlock;
}
int offset = 0;
for (int col = 0; col < pObj->numOfColumns; col++) {
memcpy(pBuffer->offset[col] + pObj->schema[col].bytes * pBuffer->epos,
payload + pObj->bytesPerPoint * payloadIter + offset, pObj->schema[col].bytes);
offset += pObj->schema[col].bytes;
}
if (spayloadIter == payloadIter) {// update pVnode->firstKey
pthread_mutex_lock(&(pVnode->vmutex));
if (KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter) < pVnode->firstKey) pVnode->firstKey = firstKey;
pthread_mutex_unlock(&(pVnode->vmutex));
}
if (isCacheIterEnd) {
pObj->lastKey = KEY_AT_INDEX(payload, pObj->bytesPerPoint, payloadIter);
if (!isAppendData) isAppendData = 1;
}
rowsImported++;
availPoints--;
payloadIter++;
} else {
if (tsAffectedRowsMod) rowsImported++;
payloadIter++;
continue;
}
pBuffer->epos = (pBuffer->epos + 1) % pBuffer->totalRows;
}
if (pBuffer->spos != pBuffer->epos) { // Flush the remaining data in the merge buffer
vnodeFlushMergeBuffer(pBuffer, &writeIter, &cacheIter, pObj, pInfo, 0);
} else {
// Should never come here
assert(false);
}
if (isAppendData) {
pthread_mutex_lock(&(pVnode->vmutex));
if (pObj->lastKey > pVnode->lastKey) pVnode->lastKey = pObj->lastKey;
pthread_mutex_unlock(&(pVnode->vmutex));
}
}
pImport->importedRows += rowsImported;
atomic_fetch_sub_32(&(pObj->freePoints), rowsImported);
code = TSDB_CODE_SUCCESS;
_exit:
tfree(pBuffer);
return code;
}
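// Import path for rows older than the last key on file: the payload is split by file id
// (computed from daysPerFile and the vnode precision) and each slice is merged into its data file.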
int vnodeImportDataToFiles(SImportInfo *pImport, char *payload, const int rows) {
int code = 0;
// TODO : Check the correctness of pObj and pVnode
SMeterObj *pObj = (SMeterObj *)(pImport->pObj);
SVnodeObj *pVnode = vnodeList + pObj->vnode;
int64_t delta = pVnode->cfg.daysPerFile * tsMsPerDay[(uint8_t)pVnode->cfg.precision];
int sfid = KEY_AT_INDEX(payload, pObj->bytesPerPoint, 0) / delta;
int efid = KEY_AT_INDEX(payload, pObj->bytesPerPoint, rows - 1) / delta;
for (int fid = sfid; fid <= efid; fid++) {
TSKEY skey = fid * delta;
TSKEY ekey = skey + delta - 1;
int srow = 0, nrows = 0;
if (vnodeSearchKeyInRange(payload, pObj->bytesPerPoint, rows, skey, ekey, &srow, &nrows) < 0) continue;
assert(nrows > 0);
dTrace("vid:%d sid:%d meterId:%s, %d rows of data will be imported to file %d, srow:%d firstKey:%" PRId64 " lastKey:%" PRId64,
pObj->vnode, pObj->sid, pObj->meterId, nrows, fid, srow, KEY_AT_INDEX(payload, pObj->bytesPerPoint, srow),
KEY_AT_INDEX(payload, pObj->bytesPerPoint, (srow + nrows - 1)));
code = vnodeMergeDataIntoFile(pImport, payload + (srow * pObj->bytesPerPoint), nrows, fid);
if (code != TSDB_CODE_SUCCESS) break;
}
return code;
}
// TODO : add offset in pShell to make it avoid repeatedly deal with messages
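// Entry point for import: rows newer than pObj->lastKeyOnFile are imported into the cache, older
// rows are merged into the data files; a commit is triggered when the cache runs out of free blocks.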
int vnodeImportData(SMeterObj *pObj, SImportInfo *pImport) {
int code = 0;
int srow = 0, nrows = 0;
SVnodeObj * pVnode = vnodeList + pObj->vnode;
SCachePool *pPool = (SCachePool *)(pVnode->pCachePool);
// 1. import data in range (pObj->lastKeyOnFile, INT64_MAX) into cache
if (vnodeSearchKeyInRange(pImport->payload, pObj->bytesPerPoint, pImport->rows, pObj->lastKeyOnFile + 1, INT64_MAX,
&srow, &nrows) >= 0) {
assert(nrows > 0);
code = vnodeImportDataToCache(pImport, pImport->payload + pObj->bytesPerPoint * srow, nrows);
if (pImport->commit) { // Need to commit now
pPool->commitInProcess = 0;
vnodeProcessCommitTimer(pVnode, NULL);
return code;
}
if (code != TSDB_CODE_SUCCESS) return code;
}
// 2. import data (0, pObj->lastKeyOnFile) into files
if (vnodeSearchKeyInRange(pImport->payload, pObj->bytesPerPoint, pImport->rows, 0, pObj->lastKeyOnFile - 1, &srow,
&nrows) >= 0) {
assert(nrows > 0);
code = vnodeImportDataToFiles(pImport, pImport->payload + pObj->bytesPerPoint * srow, nrows);
}
pPool->commitInProcess = 0;
return code;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "trpc.h"
#include "tschemautil.h"
#include "ttime.h"
#include "tutil.h"
#include "vnode.h"
#include "vnodeShell.h"
#include "vnodeUtil.h"
#include "vnodeStatus.h"
#define VALID_TIMESTAMP(key, curKey, prec) (((key) >= 0) && ((key) <= ((curKey) + 36500 * tsMsPerDay[prec])))
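// A timestamp is accepted only if it is non-negative and no more than 36500 days (roughly 100 years,
// in the vnode's precision) beyond the current key.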
int tsMeterSizeOnFile;
void vnodeUpdateMeter(void *param, void *tmrId);
void vnodeRecoverMeterObjectFile(int vnode);
int (*vnodeProcessAction[])(SMeterObj *, char *, int, char, void *, int, int *, TSKEY) = {vnodeInsertPoints,
vnodeImportPoints};
void vnodeFreeMeterObj(SMeterObj *pObj) {
if (pObj == NULL) return;
dTrace("vid:%d sid:%d id:%s, meter is cleaned up", pObj->vnode, pObj->sid, pObj->meterId);
vnodeFreeCacheInfo(pObj);
if (vnodeList[pObj->vnode].meterList != NULL) {
vnodeList[pObj->vnode].meterList[pObj->sid] = NULL;
}
memset(pObj->meterId, 0, tListLen(pObj->meterId));
tfree(pObj);
}
int vnodeUpdateVnodeStatistic(FILE *fp, SVnodeObj *pVnode) {
fseek(fp, TSDB_FILE_HEADER_VERSION_SIZE, SEEK_SET);
fwrite(&(pVnode->vnodeStatistic), sizeof(SVnodeStatisticInfo), 1, fp);
return 0;
}
void vnodeUpdateVnodeFileHeader(FILE *fp, SVnodeObj *pVnode) {
fseek(fp, TSDB_FILE_HEADER_LEN * 1 / 4, SEEK_SET);
#ifdef _TD_ARM_32_
fprintf(fp, "%lld %lld %lld ", pVnode->lastCreate, pVnode->lastRemove, pVnode->version);
fprintf(fp, "%lld %d %d ", pVnode->lastKeyOnFile, pVnode->fileId, pVnode->numOfFiles);
#else
fprintf(fp, "%ld %ld %ld ", pVnode->lastCreate, pVnode->lastRemove, pVnode->version);
fprintf(fp, "%ld %d %d ", pVnode->lastKeyOnFile, pVnode->fileId, pVnode->numOfFiles);
#endif
}
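// Layout of meterObj.v<vnode>: a TSDB_FILE_HEADER_LEN (512 byte) header holding the version area,
// the vnode statistics right after it, the vnode header fields at offset LEN*1/4, SVnodeCfg at
// LEN*2/4 and the peer descriptors at LEN*3/4, followed by the SMeterObjHeader index array (one
// entry per session plus a checksum) and then the serialized meter objects themselves.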
int vnodeCreateMeterObjFile(int vnode) {
FILE * fp;
char fileName[TSDB_FILENAME_LEN];
int32_t size;
// SMeterObj *pObj;
sprintf(fileName, "%s/vnode%d/meterObj.v%d", tsDirectory, vnode, vnode);
fp = fopen(fileName, "w+");
if (fp == NULL) {
dError("failed to create vnode:%d file:%s, errno:%d, reason:%s", vnode, fileName, errno, strerror(errno));
if (errno == EACCES) {
return TSDB_CODE_NO_DISK_PERMISSIONS;
} else if (errno == ENOSPC) {
return TSDB_CODE_SERV_NO_DISKSPACE;
} else {
return TSDB_CODE_VG_INIT_FAILED;
}
} else {
vnodeCreateFileHeader(fp);
vnodeUpdateVnodeFileHeader(fp, vnodeList + vnode);
fseek(fp, TSDB_FILE_HEADER_LEN, SEEK_SET);
size = sizeof(SMeterObjHeader) * vnodeList[vnode].cfg.maxSessions + sizeof(TSCKSUM);
tfree(vnodeList[vnode].meterIndex);
vnodeList[vnode].meterIndex = calloc(1, size);
taosCalcChecksumAppend(0, (uint8_t *)(vnodeList[vnode].meterIndex), size);
fwrite(vnodeList[vnode].meterIndex, size, 1, fp);
fclose(fp);
}
return TSDB_CODE_SUCCESS;
}
FILE *vnodeOpenMeterObjFile(int vnode) {
FILE * fp;
char fileName[TSDB_FILENAME_LEN];
struct stat fstat;
// check if directory exists
sprintf(fileName, "%s/vnode%d", tsDirectory, vnode);
if (stat(fileName, &fstat) < 0) return NULL;
sprintf(fileName, "%s/vnode%d/meterObj.v%d", tsDirectory, vnode, vnode);
if (stat(fileName, &fstat) < 0) return NULL;
fp = fopen(fileName, "r+");
if (fp != NULL) {
if (vnodeCheckFileIntegrity(fp) < 0) {
dError("file:%s is corrupted, need to restore it first, exit program", fileName);
fclose(fp);
// todo: how to recover
exit(1);
}
} else {
dError("failed to open %s, reason:%s", fileName, strerror(errno));
}
return fp;
}
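// A meter object is serialized as the fixed part of SMeterObj (up to the `reserved` field), followed
// by its column schema, the saved SQL text and a trailing checksum; the record is located through the
// SMeterObjHeader (offset/length) entry indexed by sid.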
int vnodeSaveMeterObjToFile(SMeterObj *pObj) {
int64_t offset, length, new_length, new_offset;
FILE * fp;
SVnodeObj *pVnode = &vnodeList[pObj->vnode];
char * buffer = NULL;
fp = vnodeOpenMeterObjFile(pObj->vnode);
if (fp == NULL) return -1;
buffer = (char *)malloc(tsMeterSizeOnFile);
if (buffer == NULL) {
dError("Failed to allocate memory while saving meter object to file, meterId", pObj->meterId);
fclose(fp);
return -1;
}
offset = pVnode->meterIndex[pObj->sid].offset;
length = pVnode->meterIndex[pObj->sid].length;
new_length = offsetof(SMeterObj, reserved) + pObj->numOfColumns * sizeof(SColumn) + pObj->sqlLen + sizeof(TSCKSUM);
memcpy(buffer, pObj, offsetof(SMeterObj, reserved));
memcpy(buffer + offsetof(SMeterObj, reserved), pObj->schema, pObj->numOfColumns * sizeof(SColumn));
memcpy(buffer + offsetof(SMeterObj, reserved) + pObj->numOfColumns * sizeof(SColumn), pObj->pSql, pObj->sqlLen);
taosCalcChecksumAppend(0, (uint8_t *)buffer, new_length);
if (offset == 0 || length < new_length) { // New, append to file end
fseek(fp, 0, SEEK_END);
new_offset = ftell(fp);
fwrite(buffer, new_length, 1, fp);
pVnode->meterIndex[pObj->sid].offset = new_offset;
pVnode->meterIndex[pObj->sid].length = new_length;
} else if (offset < 0) { // previously deleted meter, reuse its old slot in the file
fseek(fp, -offset, SEEK_SET);
fwrite(buffer, new_length, 1, fp);
pVnode->meterIndex[pObj->sid].offset = -offset;
pVnode->meterIndex[pObj->sid].length = new_length;
} else { // meter exists, overwrite it, offset > 0
fseek(fp, offset, SEEK_SET);
fwrite(buffer, new_length, 1, fp);
pVnode->meterIndex[pObj->sid].offset = (pObj->meterId[0] == 0) ? -offset : offset;
pVnode->meterIndex[pObj->sid].length = new_length;
}
// taosCalcChecksumAppend(0, pVnode->meterIndex, sizeof(SMeterObjHeader)*pVnode->cfg.maxSessions+sizeof(TSCKSUM));
// NOTE: no checksum, since it makes creating table slow
fseek(fp, TSDB_FILE_HEADER_LEN + sizeof(SMeterObjHeader) * pObj->sid, SEEK_SET);
fwrite(&(pVnode->meterIndex[pObj->sid]), sizeof(SMeterObjHeader), 1, fp);
// update checksum
// fseek(fp, TSDB_FILE_HEADER_LEN+sizeof(SMeterObjHeader)*(pVnode->cfg.maxSessions), SEEK_SET);
// fwrite(((char *)(pVnode->meterIndex) + sizeof(SMeterObjHeader)*(pVnode->cfg.maxSessions)), sizeof(TSCKSUM), 1, fp);
tfree(buffer);
vnodeUpdateVnodeStatistic(fp, pVnode);
vnodeUpdateVnodeFileHeader(fp, pVnode);
/* vnodeUpdateFileCheckSum(fp); */
fclose(fp);
return 0;
}
int vnodeSaveAllMeterObjToFile(int vnode) {
int64_t offset, length, new_length, new_offset;
FILE * fp;
SMeterObj *pObj;
SVnodeObj *pVnode = &vnodeList[vnode];
char * buffer = NULL;
fp = vnodeOpenMeterObjFile(vnode);
if (fp == NULL) return -1;
buffer = (char *)malloc(tsMeterSizeOnFile);
if (buffer == NULL) {
dError("Failed to allocate memory while saving all meter objects to file");
return -1;
}
for (int sid = 0; sid < pVnode->cfg.maxSessions; ++sid) {
pObj = pVnode->meterList[sid];
if (pObj == NULL) continue;
offset = pVnode->meterIndex[sid].offset;
length = pVnode->meterIndex[sid].length;
new_length = offsetof(SMeterObj, reserved) + pObj->numOfColumns * sizeof(SColumn) + pObj->sqlLen + sizeof(TSCKSUM);
memcpy(buffer, pObj, offsetof(SMeterObj, reserved));
memcpy(buffer + offsetof(SMeterObj, reserved), pObj->schema, pObj->numOfColumns * sizeof(SColumn));
memcpy(buffer + offsetof(SMeterObj, reserved) + pObj->numOfColumns * sizeof(SColumn), pObj->pSql, pObj->sqlLen);
taosCalcChecksumAppend(0, (uint8_t *)buffer, new_length);
if (offset == 0 || length < new_length) { // New or existing slot too small, append to file end
fseek(fp, 0, SEEK_END);
new_offset = ftell(fp);
fwrite(buffer, new_length, 1, fp);
pVnode->meterIndex[sid].offset = new_offset;
pVnode->meterIndex[sid].length = new_length;
} else if (offset < 0) { // previously deleted meter, reuse its old slot in the file
fseek(fp, -offset, SEEK_SET);
fwrite(buffer, new_length, 1, fp);
pVnode->meterIndex[sid].offset = -offset;
pVnode->meterIndex[sid].length = new_length;
} else { // meter exists, overwrite it, offset > 0
fseek(fp, offset, SEEK_SET);
fwrite(buffer, new_length, 1, fp);
pVnode->meterIndex[sid].offset = offset;
pVnode->meterIndex[sid].length = new_length;
}
}
// taosCalcChecksumAppend(0, pVnode->meterIndex, sizeof(SMeterObjHeader)*pVnode->cfg.maxSessions+sizeof(TSCKSUM));
fseek(fp, TSDB_FILE_HEADER_LEN, SEEK_SET);
fwrite(pVnode->meterIndex, sizeof(SMeterObjHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM), 1, fp);
tfree(buffer);
vnodeUpdateVnodeStatistic(fp, pVnode);
vnodeUpdateVnodeFileHeader(fp, pVnode);
/* vnodeUpdateFileCheckSum(fp); */
fclose(fp);
return 0;
}
int vnodeSaveVnodeCfg(int vnode, SVnodeCfg *pCfg, SVPeerDesc *pDesc) {
FILE *fp;
fp = vnodeOpenMeterObjFile(vnode);
if (fp == NULL) {
dError("failed to open vnode:%d file", vnode);
return -1;
}
fseek(fp, TSDB_FILE_HEADER_LEN * 2 / 4, SEEK_SET);
fwrite(pCfg, sizeof(SVnodeCfg), 1, fp);
char temp[TSDB_FILE_HEADER_LEN / 4];
memset(temp, 0, sizeof(temp));
fseek(fp, TSDB_FILE_HEADER_LEN * 3 / 4, SEEK_SET);
fwrite(temp, sizeof(temp), 1, fp);
if (pCfg->replications >= 1) {
fseek(fp, TSDB_FILE_HEADER_LEN * 3 / 4, SEEK_SET);
fwrite(pDesc, sizeof(SVPeerDesc), pCfg->replications, fp);
}
/* vnodeUpdateFileCheckSum(fp); */
fclose(fp);
return TSDB_CODE_SUCCESS;
}
int vnodeSaveVnodeInfo(int vnode) {
FILE * fp;
SVnodeObj *pVnode = &vnodeList[vnode];
fp = vnodeOpenMeterObjFile(vnode);
if (fp == NULL) return -1;
vnodeUpdateVnodeFileHeader(fp, pVnode);
/* vnodeUpdateFileCheckSum(fp); */
fclose(fp);
return 0;
}
int vnodeRestoreMeterObj(char *buffer, int64_t length) {
SMeterObj *pSavedObj, *pObj;
int size;
pSavedObj = (SMeterObj *)buffer;
if (pSavedObj->vnode < 0 || pSavedObj->vnode >= TSDB_MAX_VNODES) {
dTrace("vid:%d is out of range, corrupted meter obj file", pSavedObj->vnode);
return -1;
}
SVnodeCfg *pCfg = &vnodeList[pSavedObj->vnode].cfg;
if (pSavedObj->sid < 0 || pSavedObj->sid >= pCfg->maxSessions) {
dTrace("vid:%d, sid:%d is larger than max:%d", pSavedObj->vnode, pSavedObj->sid, pCfg->maxSessions);
return -1;
}
if (pSavedObj->meterId[0] == 0) return TSDB_CODE_SUCCESS;
size = sizeof(SMeterObj) + pSavedObj->sqlLen + 1;
pObj = (SMeterObj *)malloc(size);
if (pObj == NULL) {
dError("vid:%d sid:%d, no memory to allocate", pSavedObj->vnode, pSavedObj->sid);
return TSDB_CODE_SERV_OUT_OF_MEMORY;
}
pObj->schema = (SColumn *)malloc(pSavedObj->numOfColumns * sizeof(SColumn));
if (NULL == pObj->schema){
dError("vid:%d sid:%d, no memory to allocate for schema", pSavedObj->vnode, pSavedObj->sid);
free(pObj);
return TSDB_CODE_SERV_OUT_OF_MEMORY;
}
memcpy(pObj, pSavedObj, offsetof(SMeterObj, reserved));
pObj->numOfQueries = 0;
pObj->pCache = vnodeAllocateCacheInfo(pObj);
if (NULL == pObj->pCache){
dError("vid:%d sid:%d, no memory to allocate for cache", pSavedObj->vnode, pSavedObj->sid);
tfree(pObj->schema);
tfree(pObj);
return TSDB_CODE_SERV_OUT_OF_MEMORY;
}
vnodeList[pSavedObj->vnode].meterList[pSavedObj->sid] = pObj;
pObj->pStream = NULL;
memcpy(pObj->schema, buffer + offsetof(SMeterObj, reserved), pSavedObj->numOfColumns * sizeof(SColumn));
pObj->state = TSDB_METER_STATE_READY;
if (pObj->sqlLen > 0)
memcpy((char *)pObj + sizeof(SMeterObj),
((char *)pSavedObj) + offsetof(SMeterObj, reserved) + sizeof(SColumn) * pSavedObj->numOfColumns,
pSavedObj->sqlLen);
pObj->pSql = (char *)pObj + sizeof(SMeterObj);
pObj->lastKey = pObj->lastKeyOnFile;
if (pObj->lastKey > vnodeList[pObj->vnode].lastKey) vnodeList[pObj->vnode].lastKey = pObj->lastKey;
// taosSetSecurityInfo(pObj->vnode, pObj->sid, pObj->meterId, pObj->spi, pObj->encrypt, pObj->secret, pObj->cipheringKey);
dTrace("vid:%d sid:%d id:%s, meter is restored, uid:%" PRIu64 "", pObj->vnode, pObj->sid, pObj->meterId, pObj->uid);
return TSDB_CODE_SUCCESS;
}
int vnodeOpenMetersVnode(int vnode) {
FILE * fp;
char * buffer;
int64_t sid;
int64_t offset, length;
SVnodeObj *pVnode = &vnodeList[vnode];
fp = vnodeOpenMeterObjFile(vnode);
if (fp == NULL) return 0;
fseek(fp, TSDB_FILE_HEADER_VERSION_SIZE, SEEK_SET);
fread(&(pVnode->vnodeStatistic), sizeof(SVnodeStatisticInfo), 1, fp);
fseek(fp, TSDB_FILE_HEADER_LEN * 1 / 4, SEEK_SET);
#ifdef _TD_ARM_32_
fscanf(fp, "%lld %lld %lld ", &(pVnode->lastCreate), &(pVnode->lastRemove), &(pVnode->version));
fscanf(fp, "%lld %d %d ", &(pVnode->lastKeyOnFile), &(pVnode->fileId), &(pVnode->numOfFiles));
#else
fscanf(fp, "%ld %ld %ld ", &(pVnode->lastCreate), &(pVnode->lastRemove), &(pVnode->version));
fscanf(fp, "%ld %d %d ", &(pVnode->lastKeyOnFile), &(pVnode->fileId), &(pVnode->numOfFiles));
#endif
fseek(fp, TSDB_FILE_HEADER_LEN * 2 / 4, SEEK_SET);
fread(&pVnode->cfg, sizeof(SVnodeCfg), 1, fp);
if (vnodeIsValidVnodeCfg(&pVnode->cfg) == false) {
dError("vid:%d, maxSessions:%d cacheBlockSize:%d replications:%d daysPerFile:%d daysToKeep:%d invalid, clear it",
vnode, pVnode->cfg.maxSessions, pVnode->cfg.cacheBlockSize, pVnode->cfg.replications,
pVnode->cfg.daysPerFile, pVnode->cfg.daysToKeep);
pVnode->cfg.maxSessions = 0; // error in vnode file
return 0;
}
fseek(fp, TSDB_FILE_HEADER_LEN * 3 / 4, SEEK_SET);
fread(&pVnode->vpeers, sizeof(SVPeerDesc), TSDB_VNODES_SUPPORT, fp);
fseek(fp, TSDB_FILE_HEADER_LEN, SEEK_SET);
tsMeterSizeOnFile = sizeof(SMeterObj) + TSDB_MAX_COLUMNS * sizeof(SColumn) + TSDB_MAX_SAVED_SQL_LEN + sizeof(TSCKSUM);
int size = sizeof(SMeterObj *) * pVnode->cfg.maxSessions;
pVnode->meterList = (void *)malloc(size);
if (pVnode->meterList == NULL) return -1;
memset(pVnode->meterList, 0, size);
size = sizeof(SMeterObjHeader) * pVnode->cfg.maxSessions + sizeof(TSCKSUM);
pVnode->meterIndex = (SMeterObjHeader *)calloc(1, size);
if (pVnode->meterIndex == NULL) {
tfree(pVnode->meterList);
return -1;
}
// Read SMeterObjHeader list from file
if (fread(pVnode->meterIndex, size, 1, fp) < 1) return -1;
// if (!taosCheckChecksumWhole(pVnode->meterIndex, size)) {
// dError("vid: %d meter obj file header is broken since checksum mismatch", vnode);
// return -1;
// }
// Read the meter object from file and recover the structure
buffer = malloc(tsMeterSizeOnFile);
memset(buffer, 0, tsMeterSizeOnFile);
for (sid = 0; sid < pVnode->cfg.maxSessions; ++sid) {
offset = pVnode->meterIndex[sid].offset;
length = pVnode->meterIndex[sid].length;
if (offset <= 0 || length <= 0) continue;
fseek(fp, offset, SEEK_SET);
if (fread(buffer, length, 1, fp) <= 0) break;
if (taosCheckChecksumWhole((uint8_t *)buffer, length)) {
vnodeRestoreMeterObj(buffer, length - sizeof(TSCKSUM));
} else {
dError("meter object file is broken since checksum mismatch, vnode: %d sid: %d, try to recover", vnode, sid);
continue;
/* vnodeRecoverMeterObjectFile(vnode); */
}
}
tfree(buffer);
fclose(fp);
return 0;
}
void vnodeCloseMetersVnode(int vnode) {
SVnodeObj *pVnode = vnodeList + vnode;
SMeterObj *pObj;
if (pVnode->meterList) {
for (int sid = 0; sid < pVnode->cfg.maxSessions; ++sid) {
pObj = pVnode->meterList[sid];
if (pObj == NULL) continue;
vnodeFreeCacheInfo(pObj);
tfree(pObj->schema);
tfree(pObj);
}
tfree(pVnode->meterList);
}
pVnode->meterList = NULL;
}
int vnodeCreateMeterObj(SMeterObj *pNew, SConnSec *pSec) {
SMeterObj *pObj;
int code;
pObj = vnodeList[pNew->vnode].meterList[pNew->sid];
code = TSDB_CODE_SUCCESS;
if (pObj && pObj->uid == pNew->uid) {
if (pObj->sversion == pNew->sversion) {
dTrace("vid:%d sid:%d id:%s sversion:%d, identical meterObj, ignore create", pNew->vnode, pNew->sid,
pNew->meterId, pNew->sversion);
return -1;
}
dTrace("vid:%d sid:%d id:%s, update schema", pNew->vnode, pNew->sid, pNew->meterId);
if (!vnodeIsMeterState(pObj, TSDB_METER_STATE_UPDATING)) vnodeUpdateMeter(pNew, NULL);
return TSDB_CODE_SUCCESS;
}
if (pObj) {
dWarn("vid:%d sid:%d id:%s, old meter is there, remove it", pNew->vnode, pNew->sid, pNew->meterId);
vnodeRemoveMeterObj(pNew->vnode, pNew->sid);
}
pNew->pCache = vnodeAllocateCacheInfo(pNew);
if (pNew->pCache == NULL) {
code = TSDB_CODE_NO_RESOURCE;
} else {
vnodeList[pNew->vnode].meterList[pNew->sid] = pNew;
pNew->state = TSDB_METER_STATE_READY;
if (pNew->timeStamp > vnodeList[pNew->vnode].lastCreate) vnodeList[pNew->vnode].lastCreate = pNew->timeStamp;
vnodeSaveMeterObjToFile(pNew);
// vnodeCreateMeterMgmt(pNew, pSec);
vnodeCreateStream(pNew);
dTrace("vid:%d, sid:%d id:%s, meterObj is created, uid:%" PRIu64 "", pNew->vnode, pNew->sid, pNew->meterId, pNew->uid);
}
return code;
}
int vnodeRemoveMeterObj(int vnode, int sid) {
SMeterObj *pObj;
if (vnode < 0 || vnode >= TSDB_MAX_VNODES) {
dError("vid:%d is out of range", vnode);
return 0;
}
SVnodeCfg *pCfg = &vnodeList[vnode].cfg;
if (sid < 0 || sid >= pCfg->maxSessions) {
dError("vid:%d, sid:%d is larger than max:%d or less than 0", vnode, sid, pCfg->maxSessions);
return 0;
}
// vnode has been closed, no meters in this vnode
if (vnodeList[vnode].meterList == NULL) return 0;
pObj = vnodeList[vnode].meterList[sid];
if (pObj == NULL) {
return TSDB_CODE_SUCCESS;
}
if (!vnodeIsSafeToDeleteMeter(&vnodeList[vnode], sid)) {
return TSDB_CODE_ACTION_IN_PROGRESS;
}
// after remove this meter, change its state to DELETED
pObj->state = TSDB_METER_STATE_DROPPED;
pObj->timeStamp = taosGetTimestampMs();
vnodeList[vnode].lastRemove = pObj->timeStamp;
vnodeRemoveStream(pObj);
vnodeSaveMeterObjToFile(pObj);
vnodeFreeMeterObj(pObj);
return 0;
}
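// Insert path: validate the submit message length, auto-fill zero timestamps, optionally forward the
// block to peers and write the commit log, verify the schema version, check that all keys fall inside
// the allowed file range, and finally insert the rows one by one into the cache.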
int vnodeInsertPoints(SMeterObj *pObj, char *cont, int contLen, char source, void *param, int sversion,
int *numOfInsertPoints, TSKEY now) {
int expectedLen, i;
short numOfPoints;
SSubmitMsg *pSubmit = (SSubmitMsg *)cont;
char * pData;
TSKEY tsKey;
int points = 0;
int code = TSDB_CODE_SUCCESS;
SVnodeObj * pVnode = vnodeList + pObj->vnode;
numOfPoints = htons(pSubmit->numOfRows);
expectedLen = numOfPoints * pObj->bytesPerPoint + sizeof(pSubmit->numOfRows);
if (expectedLen != contLen) {
dError("vid:%d sid:%d id:%s, invalid submit msg length:%d, expected:%d, bytesPerPoint: %d",
pObj->vnode, pObj->sid, pObj->meterId, contLen, expectedLen, pObj->bytesPerPoint);
code = TSDB_CODE_WRONG_MSG_SIZE;
goto _over;
}
// to guarantee time stamp is the same for all vnodes
pData = pSubmit->payLoad;
tsKey = now;
if (*((TSKEY *)pData) == 0) {
for (i = 0; i < numOfPoints; ++i) {
*((TSKEY *)pData) = tsKey++;
pData += pObj->bytesPerPoint;
}
}
if (numOfPoints >= (pVnode->cfg.blocksPerMeter - 2) * pObj->pointsPerBlock) {
code = TSDB_CODE_BATCH_SIZE_TOO_BIG;
dError("vid:%d sid:%d id:%s, batch size too big, insert points:%d, it shall be smaller than:%d", pObj->vnode, pObj->sid,
pObj->meterId, numOfPoints, (pVnode->cfg.blocksPerMeter - 2) * pObj->pointsPerBlock);
return code;
}
/*
* please refer to TBASE-926, data may be lost when the cache is full
*/
if (source == TSDB_DATA_SOURCE_SHELL && pVnode->cfg.replications > 1) {
code = vnodeForwardToPeer(pObj, cont, contLen, TSDB_ACTION_INSERT, sversion);
if (code != TSDB_CODE_SUCCESS) return code;
}
SCachePool *pPool = (SCachePool *)pVnode->pCachePool;
if (pObj->freePoints < numOfPoints || pObj->freePoints < (pObj->pointsPerBlock << 1) ||
pPool->notFreeSlots > pVnode->cfg.cacheNumOfBlocks.totalBlocks - 2) {
code = TSDB_CODE_ACTION_IN_PROGRESS;
dTrace("vid:%d sid:%d id:%s, cache is full, freePoints:%d, notFreeSlots:%d", pObj->vnode, pObj->sid, pObj->meterId,
pObj->freePoints, pPool->notFreeSlots);
vnodeProcessCommitTimer(pVnode, NULL);
return code;
}
// FIXME: Here should be after the comparison of sversions.
if (pVnode->cfg.commitLog && source != TSDB_DATA_SOURCE_LOG) {
if (pVnode->logFd < 0) return TSDB_CODE_INVALID_COMMIT_LOG;
code = vnodeWriteToCommitLog(pObj, TSDB_ACTION_INSERT, cont, contLen, sversion);
if (code != TSDB_CODE_SUCCESS) return code;
}
if (pObj->sversion < sversion) {
dTrace("vid:%d sid:%d id:%s, schema is changed, new:%d old:%d", pObj->vnode, pObj->sid, pObj->meterId, sversion,
pObj->sversion);
vnodeSendMeterCfgMsg(pObj->vnode, pObj->sid);
code = TSDB_CODE_ACTION_IN_PROGRESS;
return code;
} else if (pObj->sversion > sversion) {
dTrace("vid:%d sid:%d id:%s, client schema out of date, sql is invalid. client sversion:%d vnode sversion:%d",
pObj->vnode, pObj->sid, pObj->meterId, pObj->sversion, sversion);
code = TSDB_CODE_INVALID_SQL;
return code;
}
pData = pSubmit->payLoad;
TSKEY firstKey = *((TSKEY *)pData);
TSKEY lastKey = *((TSKEY *)(pData + pObj->bytesPerPoint * (numOfPoints - 1)));
int cfid = now/pVnode->cfg.daysPerFile/tsMsPerDay[(uint8_t)pVnode->cfg.precision];
TSKEY minAllowedKey = (cfid - pVnode->maxFiles + 1)*pVnode->cfg.daysPerFile*tsMsPerDay[(uint8_t)pVnode->cfg.precision];
TSKEY maxAllowedKey = (cfid + 2)*pVnode->cfg.daysPerFile*tsMsPerDay[(uint8_t)pVnode->cfg.precision] - 2;
if (firstKey < minAllowedKey || firstKey > maxAllowedKey || lastKey < minAllowedKey || lastKey > maxAllowedKey) {
dError("vid:%d sid:%d id:%s, vnode lastKeyOnFile:%" PRId64 ", data is out of range, numOfPoints:%d firstKey:%" PRId64 " lastKey:%" PRId64 " minAllowedKey:%" PRId64 " maxAllowedKey:%" PRId64,
pObj->vnode, pObj->sid, pObj->meterId, pVnode->lastKeyOnFile, numOfPoints,firstKey, lastKey, minAllowedKey, maxAllowedKey);
return TSDB_CODE_TIMESTAMP_OUT_OF_RANGE;
}
if ((code = vnodeSetMeterInsertImportStateEx(pObj, TSDB_METER_STATE_INSERTING)) != TSDB_CODE_SUCCESS) {
goto _over;
}
for (i = 0; i < numOfPoints; ++i) { // meter will be dropped, abort current insertion
if (vnodeIsMeterState(pObj, TSDB_METER_STATE_DROPPING)) {
dWarn("vid:%d sid:%d id:%s, meter is dropped, abort insert, state:%d", pObj->vnode, pObj->sid, pObj->meterId,
pObj->state);
code = TSDB_CODE_NOT_ACTIVE_TABLE;
break;
}
if (*((TSKEY *)pData) <= pObj->lastKey) {
dWarn("vid:%d sid:%d id:%s, received key:%" PRId64 " not larger than lastKey:%" PRId64, pObj->vnode, pObj->sid, pObj->meterId,
*((TSKEY *)pData), pObj->lastKey);
pData += pObj->bytesPerPoint;
continue;
}
if (!VALID_TIMESTAMP(*((TSKEY *)pData), tsKey, (uint8_t)pVnode->cfg.precision)) {
code = TSDB_CODE_TIMESTAMP_OUT_OF_RANGE;
break;
}
if (vnodeInsertPointToCache(pObj, pData) < 0) {
code = TSDB_CODE_ACTION_IN_PROGRESS;
break;
}
pObj->lastKey = *((TSKEY *)pData);
pData += pObj->bytesPerPoint;
points++;
}
atomic_fetch_add_64(&(pVnode->vnodeStatistic.pointsWritten), points * (pObj->numOfColumns - 1));
atomic_fetch_add_64(&(pVnode->vnodeStatistic.totalStorage), points * pObj->bytesPerPoint);
pthread_mutex_lock(&(pVnode->vmutex));
if (pObj->lastKey > pVnode->lastKey) pVnode->lastKey = pObj->lastKey;
if (firstKey < pVnode->firstKey) pVnode->firstKey = firstKey;
assert(pVnode->firstKey > 0);
pVnode->version++;
pthread_mutex_unlock(&(pVnode->vmutex));
vnodeClearMeterState(pObj, TSDB_METER_STATE_INSERTING);
_over:
dTrace("vid:%d sid:%d id:%s, %d out of %d points are inserted, lastKey:%" PRId64 " source:%d, vnode total storage: %" PRId64 "",
pObj->vnode, pObj->sid, pObj->meterId, points, numOfPoints, pObj->lastKey, source,
pVnode->vnodeStatistic.totalStorage);
*numOfInsertPoints = points;
return code;
}
/**
* continuing to run this function after the vnode has been freed is very likely to crash
* todo: fix it by setting a flag to disable commit in such cases
*
* @param param
* @param tmrId
*/
void vnodeProcessUpdateSchemaTimer(void *param, void *tmrId) {
SMeterObj * pObj = (SMeterObj *)param;
SVnodeObj * pVnode = vnodeList + pObj->vnode;
/*
* vnode may have been dropped, check it in the first place
* if the vnode is freed, pObj is no longer valid and pObj->vnode is meaningless,
* so maybe the vid should be passed into this function as a parameter?
*/
if (pVnode->meterList == NULL) {
dTrace("vnode is deleted, abort update schema");
return;
}
SCachePool *pPool = (SCachePool *)pVnode->pCachePool;
pthread_mutex_lock(&pPool->vmutex);
if (pPool->commitInProcess) {
dTrace("vid:%d sid:%d mid:%s, committing in process, commit later", pObj->vnode, pObj->sid, pObj->meterId);
if (taosTmrStart(vnodeProcessUpdateSchemaTimer, 10, pObj, vnodeTmrCtrl) == NULL) {
vnodeClearMeterState(pObj, TSDB_METER_STATE_UPDATING);
}
pthread_mutex_unlock(&pPool->vmutex);
return;
}
pPool->commitInProcess = 1;
pthread_mutex_unlock(&pPool->vmutex);
vnodeCommitMultiToFile(pVnode, pObj->sid, pObj->sid);
}
void vnodeUpdateMeter(void *param, void *tmrId) {
SMeterObj *pNew = (SMeterObj *)param;
if (pNew == NULL || pNew->vnode < 0 || pNew->sid < 0) return;
SVnodeObj* pVnode = &vnodeList[pNew->vnode];
if (pVnode->meterList == NULL) {
dTrace("vid:%d sid:%d id:%s, vnode is deleted, status:%s, abort update schema",
pNew->vnode, pNew->sid, pNew->meterId, taosGetVnodeStatusStr(vnodeList[pNew->vnode].vnodeStatus));
free(pNew->schema);
free(pNew);
return;
}
SMeterObj *pObj = pVnode->meterList[pNew->sid];
if (pObj == NULL || vnodeIsMeterState(pObj, TSDB_METER_STATE_DROPPING)) {
dTrace("vid:%d sid:%d id:%s, meter is deleted, abort update schema", pNew->vnode, pNew->sid, pNew->meterId);
free(pNew->schema);
free(pNew);
return;
}
int32_t state = vnodeSetMeterState(pObj, TSDB_METER_STATE_UPDATING);
if (state >= TSDB_METER_STATE_DROPPING) {
dError("vid:%d sid:%d id:%s, meter is deleted, failed to update, state:%d",
pObj->vnode, pObj->sid, pObj->meterId, state);
return;
}
int32_t num = 0;
pthread_mutex_lock(&pVnode->vmutex);
num = pObj->numOfQueries;
pthread_mutex_unlock(&pVnode->vmutex);
if (num > 0 || state != TSDB_METER_STATE_READY) {
// the state may have been changed by vnodeSetMeterState, recover it in the first place
vnodeClearMeterState(pObj, TSDB_METER_STATE_UPDATING);
dTrace("vid:%d sid:%d id:%s, update failed, retry later, numOfQueries:%d, state:%d",
pNew->vnode, pNew->sid, pNew->meterId, num, state);
// retry update meter in 50ms
if (taosTmrStart(vnodeUpdateMeter, 50, pNew, vnodeTmrCtrl) == NULL) {
dError("vid:%d sid:%d id:%s, failed to start update timer, no retry", pNew->vnode, pNew->sid, pNew->meterId);
free(pNew->schema);
free(pNew);
}
return;
}
// commit first
if (!vnodeIsCacheCommitted(pObj)) {
// commit data first
if (taosTmrStart(vnodeProcessUpdateSchemaTimer, 0, pObj, vnodeTmrCtrl) == NULL) {
dError("vid:%d sid:%d id:%s, failed to start commit timer", pObj->vnode, pObj->sid, pObj->meterId);
vnodeClearMeterState(pObj, TSDB_METER_STATE_UPDATING);
free(pNew->schema);
free(pNew);
return;
}
if (taosTmrStart(vnodeUpdateMeter, 50, pNew, vnodeTmrCtrl) == NULL) {
dError("vid:%d sid:%d id:%s, failed to start update timer", pNew->vnode, pNew->sid, pNew->meterId);
vnodeClearMeterState(pObj, TSDB_METER_STATE_UPDATING);
free(pNew->schema);
free(pNew);
}
dTrace("vid:%d sid:%d meterId:%s, there are data in cache, commit first, update later",
pNew->vnode, pNew->sid, pNew->meterId);
vnodeClearMeterState(pObj, TSDB_METER_STATE_UPDATING);
return;
}
strcpy(pObj->meterId, pNew->meterId);
pObj->numOfColumns = pNew->numOfColumns;
pObj->timeStamp = pNew->timeStamp;
pObj->bytesPerPoint = pNew->bytesPerPoint;
pObj->maxBytes = pNew->maxBytes;
if (pObj->timeStamp > vnodeList[pObj->vnode].lastCreate) vnodeList[pObj->vnode].lastCreate = pObj->timeStamp;
tfree(pObj->schema);
pObj->schema = pNew->schema;
vnodeFreeCacheInfo(pObj);
pObj->pCache = vnodeAllocateCacheInfo(pObj);
pObj->sversion = pNew->sversion;
vnodeSaveMeterObjToFile(pObj);
vnodeClearMeterState(pObj, TSDB_METER_STATE_UPDATING);
dTrace("vid:%d sid:%d id:%s, schema is updated, state:%d", pObj->vnode, pObj->sid, pObj->meterId, pObj->state);
free(pNew);
}
void vnodeRecoverMeterObjectFile(int vnode) {
// TODO: start the recovery process
assert(0);
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "qextbuffer.h"
#include "taosmsg.h"
#include "tscJoinProcess.h"
#include "ttime.h"
#include "vnode.h"
#include "vnodeRead.h"
#include "vnodeUtil.h"
#include "vnodeQueryImpl.h"
#define ALL_CACHE_BLOCKS_CHECKED(q) \
(((q)->slot == (q)->currentSlot && QUERY_IS_ASC_QUERY(q)) || \
((q)->slot == (q)->firstSlot && (!QUERY_IS_ASC_QUERY(q))))
#define FORWARD_CACHE_BLOCK_CHECK_SLOT(slot, step, maxblocks) (slot) = ((slot) + (step) + (maxblocks)) % (maxblocks);
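// ALL_CACHE_BLOCKS_CHECKED stops the scan once the slot reaches currentSlot for ascending queries
// (or firstSlot for descending ones); FORWARD_CACHE_BLOCK_CHECK_SLOT advances the slot by `step` with
// wrap-around, e.g. with maxblocks = 5, slot = 4 and step = 1 the next slot is (4 + 1 + 5) % 5 = 0.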
static bool isGroupbyEachTable(SSqlGroupbyExpr *pGroupbyExpr, tSidSet *pSidset) {
if (pGroupbyExpr == NULL || pGroupbyExpr->numOfGroupCols == 0) {
return false;
}
for (int32_t i = 0; i < pGroupbyExpr->numOfGroupCols; ++i) {
SColIndexEx *pColIndex = &pGroupbyExpr->columnInfo[i];
if (pColIndex->flag == TSDB_COL_TAG) {
assert(pSidset->numOfSids == pSidset->numOfSubSet);
return true;
}
}
return false;
}
static bool doCheckWithPrevQueryRange(SQuery *pQuery, TSKEY nextKey) {
if ((nextKey > pQuery->ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
(nextKey < pQuery->ekey && !QUERY_IS_ASC_QUERY(pQuery))) {
return false;
}
return true;
}
/**
* The start position of the first check cache block is located before starting the loop.
* And the start position for next cache blocks needs to be decided before checking each cache block.
*/
static void setStartPositionForCacheBlock(SQuery *pQuery, SCacheBlock *pBlock, bool *firstCheckSlot) {
if (!(*firstCheckSlot)) {
if (QUERY_IS_ASC_QUERY(pQuery)) {
pQuery->pos = 0;
} else {
pQuery->pos = pBlock->numOfPoints - 1;
}
} else {
(*firstCheckSlot) = false;
}
}
static void enableExecutionForNextTable(SQueryRuntimeEnv *pRuntimeEnv) {
SQuery *pQuery = pRuntimeEnv->pQuery;
for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
SResultInfo *pResInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[i]);
if (pResInfo != NULL) {
pResInfo->complete = false;
}
}
}
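// Cache scan for a super table query: for every meter in every group, restore its interval query
// range, locate the start position in the cache, then walk the cache blocks slot by slot and apply
// the query functions on each qualified block.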
static void queryOnMultiDataCache(SQInfo *pQInfo, SMeterDataInfo *pMeterDataInfo) {
SQuery * pQuery = &pQInfo->query;
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQueryRuntimeEnv * pRuntimeEnv = &pQInfo->pTableQuerySupporter->runtimeEnv;
SMeterSidExtInfo **pMeterSidExtInfo = pSupporter->pMeterSidExtInfo;
SMeterObj *pTempMeterObj = getMeterObj(pSupporter->pMetersHashTable, pMeterSidExtInfo[0]->sid);
assert(pTempMeterObj != NULL);
__block_search_fn_t searchFn = vnodeSearchKeyFunc[pTempMeterObj->searchAlgorithm];
int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
dTrace("QInfo:%p start to query data in cache", pQInfo);
int64_t st = taosGetTimestampUs();
int32_t totalBlocks = 0;
for (int32_t groupIdx = 0; groupIdx < pSupporter->pSidSet->numOfSubSet; ++groupIdx) {
int32_t start = pSupporter->pSidSet->starterPos[groupIdx];
int32_t end = pSupporter->pSidSet->starterPos[groupIdx + 1] - 1;
if (isQueryKilled(pQInfo)) {
return;
}
for (int32_t k = start; k <= end; ++k) {
SMeterObj *pMeterObj = getMeterObj(pSupporter->pMetersHashTable, pMeterSidExtInfo[k]->sid);
if (pMeterObj == NULL) {
dError("QInfo:%p failed to find meterId:%d, continue", pQInfo, pMeterSidExtInfo[k]->sid);
continue;
}
pQInfo->pObj = pMeterObj;
pRuntimeEnv->pMeterObj = pMeterObj;
if (pMeterDataInfo[k].pMeterQInfo == NULL) {
pMeterDataInfo[k].pMeterQInfo =
createMeterQueryInfo(pSupporter, pMeterObj->sid, pSupporter->rawSKey, pSupporter->rawEKey);
}
if (pMeterDataInfo[k].pMeterObj == NULL) { // no data in disk for this meter, set its pointer
setMeterDataInfo(&pMeterDataInfo[k], pMeterObj, k, groupIdx);
}
assert(pMeterDataInfo[k].meterOrderIdx == k && pMeterObj == pMeterDataInfo[k].pMeterObj);
SMeterQueryInfo *pMeterQueryInfo = pMeterDataInfo[k].pMeterQInfo;
restoreIntervalQueryRange(pRuntimeEnv, pMeterQueryInfo);
/*
* Update the query meter column index and the corresponding filter column index
* the original column index info may be inconsistent with current meter in cache.
*
* The stable schema has been changed, but the meter schema, along with the data in cache,
* will not be updated until data with new schema arrive.
*/
vnodeUpdateQueryColumnIndex(pQuery, pMeterObj);
vnodeUpdateFilterColumnIndex(pQuery);
if ((pQuery->lastKey > pQuery->ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
(pQuery->lastKey < pQuery->ekey && !QUERY_IS_ASC_QUERY(pQuery))) {
dTrace("QInfo:%p vid:%d sid:%d id:%s, query completed, ignore data in cache. qrange:%" PRId64 "-%" PRId64
", lastKey:%" PRId64,
pQInfo, pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->skey, pQuery->ekey,
pQuery->lastKey);
continue;
}
qTrace("QInfo:%p vid:%d sid:%d id:%s, query in cache, qrange:%" PRId64 "-%" PRId64 ", lastKey:%" PRId64, pQInfo,
pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->skey, pQuery->ekey, pQuery->lastKey);
/*
* find the appropriated start position in cache
* NOTE: (taking ascending order query for example)
* for the specific query range [pQuery->lastKey, pQuery->ekey], there may be no qualified result in cache.
* Therefore, we need the first point that is greater(less) than the pQuery->lastKey, so the boundary check
* should be ignored (the fourth parameter).
*/
TSKEY nextKey = getQueryStartPositionInCache(pRuntimeEnv, &pQuery->slot, &pQuery->pos, true);
if (nextKey < 0 || !doCheckWithPrevQueryRange(pQuery, nextKey)) {
qTrace("QInfo:%p vid:%d sid:%d id:%s, no data qualified in cache, cache blocks:%d, lastKey:%" PRId64, pQInfo,
pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->numOfBlocks, pQuery->lastKey);
continue;
}
// data in this block may have been flushed to disk and the block re-allocated to another meter
// todo: retry with the remaining cache blocks
SCacheBlock *pBlock = getCacheDataBlock(pMeterObj, pRuntimeEnv, pQuery->slot);
if (pBlock == NULL) {
continue;
}
bool firstCheckSlot = true;
SCacheInfo *pCacheInfo = (SCacheInfo *)pMeterObj->pCache;
for (int32_t i = 0; i < pCacheInfo->maxBlocks; ++i) {
pBlock = getCacheDataBlock(pMeterObj, pRuntimeEnv, pQuery->slot);
/*
* 1. pBlock == NULL. The cache block may have been flushed to disk, so it is not available; skip it and try the next one.
* The check for an empty block has been refactored into the getCacheDataBlock function.
*/
if (pBlock == NULL) {
if (ALL_CACHE_BLOCKS_CHECKED(pQuery)) {
break;
}
FORWARD_CACHE_BLOCK_CHECK_SLOT(pQuery->slot, step, pCacheInfo->maxBlocks);
continue;
}
setStartPositionForCacheBlock(pQuery, pBlock, &firstCheckSlot);
TSKEY *primaryKeys = (TSKEY *)pRuntimeEnv->primaryColBuffer->data;
TSKEY key = primaryKeys[pQuery->pos];
// in handling file data block, the timestamp range validation is done during fetching candidate file blocks
if ((key > pSupporter->rawEKey && QUERY_IS_ASC_QUERY(pQuery)) ||
(key < pSupporter->rawEKey && !QUERY_IS_ASC_QUERY(pQuery))) {
break;
}
if (pQuery->intervalTime == 0) {
setExecutionContext(pSupporter, pMeterQueryInfo, k, pMeterDataInfo[k].groupIdx, key);
} else {
setIntervalQueryRange(pMeterQueryInfo, pSupporter, key);
int32_t ret = setAdditionalInfo(pSupporter, k, pMeterQueryInfo);
if (ret != TSDB_CODE_SUCCESS) {
pQInfo->killed = 1;
return;
}
}
qTrace("QInfo:%p vid:%d sid:%d id:%s, query in cache, qrange:%" PRId64 "-%" PRId64 ", lastKey:%" PRId64, pQInfo,
pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->skey, pQuery->ekey, pQuery->lastKey);
// only record the key on last block
SET_CACHE_BLOCK_FLAG(pRuntimeEnv->blockStatus);
SBlockInfo binfo = getBlockInfo(pRuntimeEnv);
dTrace("QInfo:%p check data block, brange:%" PRId64 "-%" PRId64 ", fileId:%d, slot:%d, pos:%d, bstatus:%d",
GET_QINFO_ADDR(pQuery), binfo.keyFirst, binfo.keyLast, pQuery->fileId, pQuery->slot, pQuery->pos,
pRuntimeEnv->blockStatus);
totalBlocks++;
stableApplyFunctionsOnBlock(pSupporter, &pMeterDataInfo[k], &binfo, NULL, searchFn);
if (ALL_CACHE_BLOCKS_CHECKED(pQuery)) {
break;
}
FORWARD_CACHE_BLOCK_CHECK_SLOT(pQuery->slot, step, pCacheInfo->maxBlocks);
}
}
}
int64_t time = taosGetTimestampUs() - st;
SQueryCostSummary *pSummary = &pRuntimeEnv->summary;
pSummary->blocksInCache += totalBlocks;
pSummary->cacheTimeUs += time;
pSummary->numOfTables = pSupporter->pSidSet->numOfSids;
dTrace("QInfo:%p complete check %d cache blocks, elapsed time:%.3fms", pQInfo, totalBlocks, time / 1000.0);
setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
}
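// File scan for a super table query: header files are visited in query order; for each file the
// qualified meters are filtered, their compressed block metadata is collected, and every candidate
// block is loaded on demand (or skipped) before the query functions are applied to it.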
static void queryOnMultiDataFiles(SQInfo *pQInfo, SMeterDataInfo *pMeterDataInfo) {
SQuery * pQuery = &pQInfo->query;
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQueryRuntimeEnv * pRuntimeEnv = &pSupporter->runtimeEnv;
SMeterDataBlockInfoEx *pDataBlockInfoEx = NULL;
int32_t nAllocBlocksInfoSize = 0;
SMeterObj * pTempMeter = getMeterObj(pSupporter->pMetersHashTable, pSupporter->pMeterSidExtInfo[0]->sid);
__block_search_fn_t searchFn = vnodeSearchKeyFunc[pTempMeter->searchAlgorithm];
int32_t vnodeId = pTempMeter->vnode;
SQueryFilesInfo *pVnodeFileInfo = &pRuntimeEnv->vnodeFileInfo;
dTrace("QInfo:%p start to check data blocks in %d files", pQInfo, pVnodeFileInfo->numOfFiles);
int32_t fid = QUERY_IS_ASC_QUERY(pQuery) ? -1 : INT32_MAX;
int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
SQueryCostSummary *pSummary = &pRuntimeEnv->summary;
int64_t totalBlocks = 0;
int64_t st = taosGetTimestampUs();
while (1) {
if (isQueryKilled(pQInfo)) {
break;
}
int32_t fileIdx = vnodeGetVnodeHeaderFileIndex(&fid, pRuntimeEnv, pQuery->order.order);
if (fileIdx < 0) { // no valid file, abort current search
break;
}
pRuntimeEnv->startPos.fileId = fid;
pQuery->fileId = fid;
pSummary->numOfFiles++;
if (vnodeGetHeaderFile(pRuntimeEnv, fileIdx) != TSDB_CODE_SUCCESS) {
fid += step;
continue;
}
int32_t numOfQualifiedMeters = 0;
assert(fileIdx == pRuntimeEnv->vnodeFileInfo.current);
SMeterDataInfo **pReqMeterDataInfo = NULL;
int32_t ret = vnodeFilterQualifiedMeters(pQInfo, vnodeId, pSupporter->pSidSet, pMeterDataInfo,
&numOfQualifiedMeters, &pReqMeterDataInfo);
if (ret != TSDB_CODE_SUCCESS) {
dError("QInfo:%p failed to create meterdata struct to perform query processing, abort", pQInfo);
tfree(pReqMeterDataInfo);
pQInfo->code = -ret;
pQInfo->killed = 1;
return;
}
dTrace("QInfo:%p file:%s, %d meters qualified", pQInfo, pVnodeFileInfo->dataFilePath, numOfQualifiedMeters);
// none of the meters in the query set has pHeaderFileData in this file, try the next file
if (numOfQualifiedMeters == 0) {
fid += step;
tfree(pReqMeterDataInfo);
continue;
}
uint32_t numOfBlocks = 0;
ret = getDataBlocksForMeters(pSupporter, pQuery, numOfQualifiedMeters, pVnodeFileInfo->headerFilePath,
pReqMeterDataInfo, &numOfBlocks);
if (ret != TSDB_CODE_SUCCESS) {
dError("QInfo:%p failed to get data block before scan data blocks, abort", pQInfo);
tfree(pReqMeterDataInfo);
pQInfo->code = -ret;
pQInfo->killed = 1;
return;
}
dTrace("QInfo:%p file:%s, %d meters contains %d blocks to be checked", pQInfo, pVnodeFileInfo->dataFilePath,
numOfQualifiedMeters, numOfBlocks);
if (numOfBlocks == 0) {
fid += step;
tfree(pReqMeterDataInfo);
continue;
}
ret = createDataBlocksInfoEx(pReqMeterDataInfo, numOfQualifiedMeters, &pDataBlockInfoEx, numOfBlocks,
&nAllocBlocksInfoSize, (int64_t)pQInfo);
if (ret != TSDB_CODE_SUCCESS) { // failed to create data blocks
dError("QInfo:%p build blockInfoEx failed, abort", pQInfo);
tfree(pReqMeterDataInfo);
pQInfo->code = -ret;
pQInfo->killed = 1;
return;
}
dTrace("QInfo:%p start to load %d blocks and check", pQInfo, numOfBlocks);
int64_t TRACE_OUTPUT_BLOCK_CNT = 10000;
int64_t stimeUnit = 0;
int64_t etimeUnit = 0;
totalBlocks += numOfBlocks;
// sequentially scan the data blocks collected from the header file
int32_t j = QUERY_IS_ASC_QUERY(pQuery) ? 0 : numOfBlocks - 1;
for (; j < numOfBlocks && j >= 0; j += step) {
if (isQueryKilled(pQInfo)) {
break;
}
/* output elapsed time for log every TRACE_OUTPUT_BLOCK_CNT blocks */
if (j == 0) {
stimeUnit = taosGetTimestampMs();
} else if ((j % TRACE_OUTPUT_BLOCK_CNT) == 0) {
etimeUnit = taosGetTimestampMs();
dTrace("QInfo:%p load and check %" PRId64 " blocks, and continue. elapsed:%" PRId64 " ms", pQInfo,
TRACE_OUTPUT_BLOCK_CNT, etimeUnit - stimeUnit);
stimeUnit = taosGetTimestampMs();
}
SMeterDataBlockInfoEx *pInfoEx = &pDataBlockInfoEx[j];
SMeterDataInfo * pOneMeterDataInfo = pInfoEx->pMeterDataInfo;
SMeterQueryInfo * pMeterQueryInfo = pOneMeterDataInfo->pMeterQInfo;
SMeterObj * pMeterObj = pOneMeterDataInfo->pMeterObj;
pQInfo->pObj = pMeterObj;
pRuntimeEnv->pMeterObj = pMeterObj;
restoreIntervalQueryRange(pRuntimeEnv, pMeterQueryInfo);
if ((pQuery->lastKey > pQuery->ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
(pQuery->lastKey < pQuery->ekey && !QUERY_IS_ASC_QUERY(pQuery))) {
qTrace("QInfo:%p vid:%d sid:%d id:%s, query completed, no need to scan this data block. qrange:%" PRId64
"-%" PRId64 ", lastKey:%" PRId64,
pQInfo, pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->skey, pQuery->ekey,
pQuery->lastKey);
continue;
}
SCompBlock *pBlock = pInfoEx->pBlock.compBlock;
bool ondemandLoad = onDemandLoadDatablock(pQuery, pMeterQueryInfo->queryRangeSet);
ret = LoadDatablockOnDemand(pBlock, &pInfoEx->pBlock.fields, &pRuntimeEnv->blockStatus, pRuntimeEnv, fileIdx,
pInfoEx->blockIndex, searchFn, ondemandLoad);
if (ret != DISK_DATA_LOADED) {
pSummary->skippedFileBlocks++;
continue;
}
SBlockInfo binfo = getBlockBasicInfo(pRuntimeEnv, pBlock, BLK_FILE_BLOCK);
int64_t nextKey = -1;
assert(pQuery->pos >= 0 && pQuery->pos < pBlock->numOfPoints);
TSKEY *primaryKeys = (TSKEY *)pRuntimeEnv->primaryColBuffer->data;
if (IS_DATA_BLOCK_LOADED(pRuntimeEnv->blockStatus) && needPrimaryTimestampCol(pQuery, &binfo)) {
nextKey = primaryKeys[pQuery->pos];
if (!doCheckWithPrevQueryRange(pQuery, nextKey)) {
qTrace("QInfo:%p vid:%d sid:%d id:%s, no data qualified in data file, lastKey:%" PRId64, pQInfo,
pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->numOfBlocks, pQuery->lastKey);
continue;
}
} else {
// if data block is not loaded, it must be the intermediate blocks
assert((pBlock->keyFirst >= pQuery->lastKey && pBlock->keyLast <= pQuery->ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
(pBlock->keyFirst >= pQuery->ekey && pBlock->keyLast <= pQuery->lastKey && !QUERY_IS_ASC_QUERY(pQuery)));
nextKey = QUERY_IS_ASC_QUERY(pQuery) ? pBlock->keyFirst : pBlock->keyLast;
}
if (pQuery->intervalTime == 0) {
setExecutionContext(pSupporter, pMeterQueryInfo, pOneMeterDataInfo->meterOrderIdx, pOneMeterDataInfo->groupIdx,
nextKey);
} else { // interval query
setIntervalQueryRange(pMeterQueryInfo, pSupporter, nextKey);
ret = setAdditionalInfo(pSupporter, pOneMeterDataInfo->meterOrderIdx, pMeterQueryInfo);
if (ret != TSDB_CODE_SUCCESS) {
tfree(pReqMeterDataInfo); // error code has been set
pQInfo->killed = 1;
return;
}
}
stableApplyFunctionsOnBlock(pSupporter, pOneMeterDataInfo, &binfo, pInfoEx->pBlock.fields, searchFn);
}
tfree(pReqMeterDataInfo);
// try next file
fid += step;
}
int64_t time = taosGetTimestampUs() - st;
dTrace("QInfo:%p complete check %d files, %d blocks, elapsed time:%.3fms", pQInfo, pVnodeFileInfo->numOfFiles,
totalBlocks, time / 1000.0);
pSummary->fileTimeUs += time;
pSummary->readDiskBlocks += totalBlocks;
pSummary->numOfTables = pSupporter->pSidSet->numOfSids;
setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
freeMeterBlockInfoEx(pDataBlockInfoEx, nAllocBlocksInfoSize);
}
static bool multimeterMultioutputHelper(SQInfo *pQInfo, bool *dataInDisk, bool *dataInCache, int32_t index,
int32_t start) {
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SMeterSidExtInfo **pMeterSidExtInfo = pSupporter->pMeterSidExtInfo;
SQueryRuntimeEnv * pRuntimeEnv = &pSupporter->runtimeEnv;
SQuery * pQuery = &pQInfo->query;
setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
SMeterObj *pMeterObj = getMeterObj(pSupporter->pMetersHashTable, pMeterSidExtInfo[index]->sid);
if (pMeterObj == NULL) {
dError("QInfo:%p do not find required meter id: %d, all meterObjs id is:", pQInfo, pMeterSidExtInfo[index]->sid);
return false;
}
vnodeSetTagValueInParam(pSupporter->pSidSet, pRuntimeEnv, pMeterSidExtInfo[index]);
dTrace("QInfo:%p query on (%d): vid:%d sid:%d meterId:%s, qrange:%" PRId64 "-%" PRId64, pQInfo, index - start,
pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->skey, pQuery->ekey);
pQInfo->pObj = pMeterObj;
pQuery->lastKey = pQuery->skey;
pRuntimeEnv->pMeterObj = pMeterObj;
vnodeUpdateQueryColumnIndex(pQuery, pRuntimeEnv->pMeterObj);
vnodeUpdateFilterColumnIndex(pQuery);
vnodeCheckIfDataExists(pRuntimeEnv, pMeterObj, dataInDisk, dataInCache);
// data in file or cache is not qualified for the query. abort
if (!(dataInCache || dataInDisk)) {
dTrace("QInfo:%p vid:%d sid:%d meterId:%s, qrange:%" PRId64 "-%" PRId64 ", nores, %p", pQInfo, pMeterObj->vnode,
pMeterObj->sid, pMeterObj->meterId, pQuery->skey, pQuery->ekey, pQuery);
return false;
}
if (pRuntimeEnv->pTSBuf != NULL) {
if (pRuntimeEnv->cur.vnodeIndex == -1) {
int64_t tag = pRuntimeEnv->pCtx[0].tag.i64Key;
STSElem elem = tsBufGetElemStartPos(pRuntimeEnv->pTSBuf, 0, tag);
// failed to find data with the specified tag value
if (elem.vnode < 0) {
return false;
}
} else {
tsBufSetCursor(pRuntimeEnv->pTSBuf, &pRuntimeEnv->cur);
}
}
initCtxOutputBuf(pRuntimeEnv);
return true;
}
static int64_t doCheckMetersInGroup(SQInfo *pQInfo, int32_t index, int32_t start) {
SQuery * pQuery = &pQInfo->query;
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQueryRuntimeEnv * pRuntimeEnv = &pSupporter->runtimeEnv;
bool dataInDisk = true;
bool dataInCache = true;
if (!multimeterMultioutputHelper(pQInfo, &dataInDisk, &dataInCache, index, start)) {
return 0;
}
#if DEFAULT_IO_ENGINE == IO_ENGINE_MMAP
for (int32_t i = 0; i < pRuntimeEnv->numOfFiles; ++i) {
resetMMapWindow(&pRuntimeEnv->pVnodeFiles[i]);
}
#endif
SPointInterpoSupporter pointInterpSupporter = {0};
pointInterpSupporterInit(pQuery, &pointInterpSupporter);
if (!normalizedFirstQueryRange(dataInDisk, dataInCache, pSupporter, &pointInterpSupporter, NULL)) {
pointInterpSupporterDestroy(&pointInterpSupporter);
return 0;
}
/*
* here we set the value for before and after the specified time into the
* parameter for interpolation query
*/
pointInterpSupporterSetData(pQInfo, &pointInterpSupporter);
pointInterpSupporterDestroy(&pointInterpSupporter);
vnodeScanAllData(pRuntimeEnv);
// first/last_row query, do not invoke the finalize for super table query
doFinalizeResult(pRuntimeEnv);
int64_t numOfRes = getNumOfResult(pRuntimeEnv);
assert(numOfRes == 1 || numOfRes == 0);
// accumulate the point interpolation result
if (numOfRes > 0) {
pQuery->pointsRead += numOfRes;
forwardCtxOutputBuf(pRuntimeEnv, numOfRes);
}
return numOfRes;
}
/**
* super table query handler
* 1. super table projection query, group-by on normal columns query, ts-comp query
* 2. point interpolation query, last row query
*
* @param pQInfo
*/
static void vnodeSTableSeqProcessor(SQInfo *pQInfo) {
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SMeterSidExtInfo **pMeterSidExtInfo = pSupporter->pMeterSidExtInfo;
SQueryRuntimeEnv * pRuntimeEnv = &pSupporter->runtimeEnv;
SQuery * pQuery = &pQInfo->query;
tSidSet *pSids = pSupporter->pSidSet;
int32_t vid = getMeterObj(pSupporter->pMetersHashTable, pMeterSidExtInfo[0]->sid)->vnode;
if (isPointInterpoQuery(pQuery)) {
resetCtxOutputBuf(pRuntimeEnv);
assert(pQuery->limit.offset == 0 && pQuery->limit.limit != 0);
while (pSupporter->subgroupIdx < pSids->numOfSubSet) {
int32_t start = pSids->starterPos[pSupporter->subgroupIdx];
int32_t end = pSids->starterPos[pSupporter->subgroupIdx + 1] - 1;
if (isFirstLastRowQuery(pQuery)) {
dTrace("QInfo:%p last_row query on vid:%d, numOfGroups:%d, current group:%d", pQInfo, vid, pSids->numOfSubSet,
pSupporter->subgroupIdx);
TSKEY key = -1;
int32_t index = -1;
// choose the last key for one group
pSupporter->meterIdx = start;
for (int32_t k = start; k <= end; ++k, pSupporter->meterIdx++) {
if (isQueryKilled(pQInfo)) {
setQueryStatus(pQuery, QUERY_NO_DATA_TO_CHECK);
return;
}
// get the last key of meters that belongs to this group
SMeterObj *pMeterObj = getMeterObj(pSupporter->pMetersHashTable, pMeterSidExtInfo[k]->sid);
if (pMeterObj != NULL) {
if (key < pMeterObj->lastKey) {
key = pMeterObj->lastKey;
index = k;
}
}
}
pQuery->skey = key;
pQuery->ekey = key;
pSupporter->rawSKey = key;
pSupporter->rawEKey = key;
int64_t num = doCheckMetersInGroup(pQInfo, index, start);
assert(num >= 0);
} else {
dTrace("QInfo:%p interp query on vid:%d, numOfGroups:%d, current group:%d", pQInfo, vid, pSids->numOfSubSet,
pSupporter->subgroupIdx);
for (int32_t k = start; k <= end; ++k) {
if (isQueryKilled(pQInfo)) {
setQueryStatus(pQuery, QUERY_NO_DATA_TO_CHECK);
return;
}
pQuery->skey = pSupporter->rawSKey;
pQuery->ekey = pSupporter->rawEKey;
int64_t num = doCheckMetersInGroup(pQInfo, k, start);
if (num == 1) {
break;
}
}
}
pSupporter->subgroupIdx++;
// output buffer is full, return to client
if (pQuery->pointsRead >= pQuery->pointsToRead) {
break;
}
}
} else {
/*
* 1. super table projection query, 2. group-by on normal columns query, 3. ts-comp query
*/
assert(pSupporter->meterIdx >= 0);
/*
* if the subgroup index is larger than 0, results generated by group by tbname already exist,
* and we need to return them to the client first.
*/
if (pSupporter->subgroupIdx > 0) {
copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
pQInfo->pointsRead += pQuery->pointsRead;
if (pQuery->pointsRead > 0) {
return;
}
}
if (pSupporter->meterIdx >= pSids->numOfSids) {
return;
}
resetCtxOutputBuf(pRuntimeEnv);
resetTimeWindowInfo(pRuntimeEnv, &pRuntimeEnv->windowResInfo);
while (pSupporter->meterIdx < pSupporter->numOfMeters) {
int32_t k = pSupporter->meterIdx;
if (isQueryKilled(pQInfo)) {
setQueryStatus(pQuery, QUERY_NO_DATA_TO_CHECK);
return;
}
TSKEY skey = pQInfo->pTableQuerySupporter->pMeterSidExtInfo[k]->key;
if (skey > 0) {
pQuery->skey = skey;
}
bool dataInDisk = true;
bool dataInCache = true;
if (!multimeterMultioutputHelper(pQInfo, &dataInDisk, &dataInCache, k, 0)) {
pQuery->skey = pSupporter->rawSKey;
pQuery->ekey = pSupporter->rawEKey;
pSupporter->meterIdx++;
continue;
}
#if DEFAULT_IO_ENGINE == IO_ENGINE_MMAP
for (int32_t i = 0; i < pRuntimeEnv->numOfFiles; ++i) {
resetMMapWindow(&pRuntimeEnv->pVnodeFiles[i]);
}
#endif
SPointInterpoSupporter pointInterpSupporter = {0};
if (normalizedFirstQueryRange(dataInDisk, dataInCache, pSupporter, &pointInterpSupporter, NULL) == false) {
pQuery->skey = pSupporter->rawSKey;
pQuery->ekey = pSupporter->rawEKey;
pSupporter->meterIdx++;
continue;
}
// TODO handle the limit problem
if (pQuery->numOfFilterCols == 0 && pQuery->limit.offset > 0) {
forwardQueryStartPosition(pRuntimeEnv);
if (Q_STATUS_EQUAL(pQuery->over, QUERY_NO_DATA_TO_CHECK | QUERY_COMPLETED)) {
pQuery->skey = pSupporter->rawSKey;
pQuery->ekey = pSupporter->rawEKey;
pSupporter->meterIdx++;
continue;
}
}
vnodeScanAllData(pRuntimeEnv);
pQuery->pointsRead = getNumOfResult(pRuntimeEnv);
doSkipResults(pRuntimeEnv);
// the limitation of output result is reached, set the query completed
if (doRevisedResultsByLimit(pQInfo)) {
pSupporter->meterIdx = pSupporter->pSidSet->numOfSids;
break;
}
// enable execution for next table, when handling the projection query
enableExecutionForNextTable(pRuntimeEnv);
if (Q_STATUS_EQUAL(pQuery->over, QUERY_NO_DATA_TO_CHECK | QUERY_COMPLETED)) {
/*
* query range is identical in terms of all meters involved in query,
* so we need to restore them at the *beginning* of query on each meter,
* not when resuming a query on a meter that was aborted due to the output buffer limitation.
* To ensure that, we reset the query range once the query on a meter is completed.
*/
pQuery->skey = pSupporter->rawSKey;
pQuery->ekey = pSupporter->rawEKey;
pSupporter->meterIdx++;
pQInfo->pTableQuerySupporter->pMeterSidExtInfo[k]->key = pQuery->lastKey;
// if the buffer is full or group by each table, we need to jump out of the loop
if (Q_STATUS_EQUAL(pQuery->over, QUERY_RESBUF_FULL) ||
isGroupbyEachTable(pQuery->pGroupbyExpr, pSupporter->pSidSet)) {
break;
}
} else { // forward query range
pQuery->skey = pQuery->lastKey;
// all data in the result buffer are skipped due to the offset, continue to retrieve data from current meter
if (pQuery->pointsRead == 0) {
assert(!Q_STATUS_EQUAL(pQuery->over, QUERY_RESBUF_FULL));
continue;
} else {
pQInfo->pTableQuerySupporter->pMeterSidExtInfo[k]->key = pQuery->lastKey;
// buffer is full, wait for the next round to retrieve data from current meter
assert(Q_STATUS_EQUAL(pQuery->over, QUERY_RESBUF_FULL));
break;
}
}
}
}
/*
* 1. super table projection query, group-by on normal columns query, ts-comp query
* 2. point interpolation query, last row query
*
* group-by on normal columns query and last_row query do NOT invoke the finalizer here,
* since the finalize stage will be done at the client side.
*
* projection query, point interpolation query do not need the finalizer.
*
* Only the ts-comp query requires the finalizer function to be executed here.
*/
if (isTSCompQuery(pQuery)) {
doFinalizeResult(pRuntimeEnv);
}
if (pRuntimeEnv->pTSBuf != NULL) {
pRuntimeEnv->cur = pRuntimeEnv->pTSBuf->cur;
}
// todo refactor
if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;
for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
SWindowStatus *pStatus = &pWindowResInfo->pResult[i].status;
pStatus->closed = true; // enable return all results for group by normal columns
SWindowResult *pResult = &pWindowResInfo->pResult[i];
for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
pResult->numOfRows = MAX(pResult->numOfRows, pResult->resultInfo[j].numOfRes);
}
}
pQInfo->pTableQuerySupporter->subgroupIdx = 0;
pQuery->pointsRead = 0;
copyFromWindowResToSData(pQInfo, pWindowResInfo->pResult);
}
pQInfo->pointsRead += pQuery->pointsRead;
pQuery->pointsOffset = pQuery->pointsToRead;
dTrace(
"QInfo %p vid:%d, numOfMeters:%d, index:%d, numOfGroups:%d, %d points returned, totalRead:%d totalReturn:%d,"
"next skey:%" PRId64 ", offset:%" PRId64,
pQInfo, vid, pSids->numOfSids, pSupporter->meterIdx, pSids->numOfSubSet, pQuery->pointsRead, pQInfo->pointsRead,
pQInfo->pointsReturned, pQuery->skey, pQuery->limit.offset);
}
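// For ascending queries the data files (older data) are scanned before the cache (newer data); for
// descending queries the order is reversed, so blocks are visited in the time order of the query.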
static void doOrderedScan(SQInfo *pQInfo) {
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQuery * pQuery = &pQInfo->query;
if (QUERY_IS_ASC_QUERY(pQuery)) {
queryOnMultiDataFiles(pQInfo, pSupporter->pMeterDataInfo);
if (pQInfo->code != TSDB_CODE_SUCCESS) {
return;
}
queryOnMultiDataCache(pQInfo, pSupporter->pMeterDataInfo);
} else {
queryOnMultiDataCache(pQInfo, pSupporter->pMeterDataInfo);
if (pQInfo->code != TSDB_CODE_SUCCESS) {
return;
}
queryOnMultiDataFiles(pQInfo, pSupporter->pMeterDataInfo);
}
}
static void setupMeterQueryInfoForSupplementQuery(STableQuerySupportObj *pSupporter) {
SQuery *pQuery = pSupporter->runtimeEnv.pQuery;
for (int32_t i = 0; i < pSupporter->numOfMeters; ++i) {
SMeterQueryInfo *pMeterQueryInfo = pSupporter->pMeterDataInfo[i].pMeterQInfo;
changeMeterQueryInfoForSuppleQuery(pQuery, pMeterQueryInfo, pSupporter->rawSKey, pSupporter->rawEKey);
}
}
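/*
 * Supplementary scan: for functions that need a second pass over the data, the runtime switches
 * to the supplement-scan flag, disables the functions that do not take part in this pass
 * (disableFunctForSuppleScan), reverses the traversal order of the ts buffer, swaps
 * rawSKey/rawEKey, and re-runs the ordered scan. The environment is then restored for the
 * master scan; only the per-meter query info is intentionally left as-is.
 */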
static void doMultiMeterSupplementaryScan(SQInfo *pQInfo) {
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQueryRuntimeEnv *pRuntimeEnv = &pSupporter->runtimeEnv;
SQuery * pQuery = &pQInfo->query;
if (!needSupplementaryScan(pQuery)) {
dTrace("QInfo:%p no need to do supplementary scan, query completed", pQInfo);
return;
}
SET_SUPPLEMENT_SCAN_FLAG(pRuntimeEnv);
disableFunctForSuppleScan(pSupporter, pQuery->order.order);
if (pRuntimeEnv->pTSBuf != NULL) {
pRuntimeEnv->pTSBuf->cur.order = pRuntimeEnv->pTSBuf->cur.order ^ 1u;
}
SWAP(pSupporter->rawSKey, pSupporter->rawEKey, TSKEY);
setupMeterQueryInfoForSupplementQuery(pSupporter);
int64_t st = taosGetTimestampMs();
doOrderedScan(pQInfo);
int64_t et = taosGetTimestampMs();
dTrace("QInfo:%p supplementary scan completed, elapsed time: %lldms", pQInfo, et - st);
/*
* restore the env
* the meter query info is not reset to the original state
*/
SWAP(pSupporter->rawSKey, pSupporter->rawEKey, TSKEY);
enableFunctForMasterScan(pRuntimeEnv, pQuery->order.order);
if (pRuntimeEnv->pTSBuf != NULL) {
pRuntimeEnv->pTSBuf->cur.order = pRuntimeEnv->pTSBuf->cur.order ^ 1;
}
SET_MASTER_SCAN_FLAG(pRuntimeEnv);
}
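/*
 * Multi-meter (super table) aggregation/interval query flow:
 *   1. if subgroupIdx > 0, the scan has already finished; just copy the buffered results out;
 *   2. otherwise run the main ordered scan over all meters, close the time windows, and run the
 *      supplementary scan when required;
 *   3. for interval (or sum/avg/rate) queries, merge the per-meter results into group results
 *      before copying them to the output buffer.
 */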
static void vnodeMultiMeterQueryProcessor(SQInfo *pQInfo) {
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQueryRuntimeEnv * pRuntimeEnv = &pSupporter->runtimeEnv;
SQuery * pQuery = &pQInfo->query;
if (pSupporter->subgroupIdx > 0) {
    /*
     * if subgroupIdx > 0, the query process must have been completed already; we only need to
     * copy the data into the output buffer
     */
if (pQuery->intervalTime > 0) {
copyResToQueryResultBuf(pSupporter, pQuery);
#ifdef _DEBUG_VIEW
displayInterResult(pQuery->sdata, pQuery, pQuery->sdata[0]->len);
#endif
} else {
copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
}
pQInfo->pointsRead += pQuery->pointsRead;
if (pQuery->pointsRead == 0) {
vnodePrintQueryStatistics(pSupporter);
}
dTrace("QInfo:%p points returned:%d, totalRead:%d totalReturn:%d", pQInfo, pQuery->pointsRead, pQInfo->pointsRead,
pQInfo->pointsReturned);
return;
}
pSupporter->pMeterDataInfo = (SMeterDataInfo *)calloc(1, sizeof(SMeterDataInfo) * pSupporter->numOfMeters);
if (pSupporter->pMeterDataInfo == NULL) {
dError("QInfo:%p failed to allocate memory, %s", pQInfo, strerror(errno));
pQInfo->code = -TSDB_CODE_SERV_OUT_OF_MEMORY;
return;
}
dTrace("QInfo:%p query start, qrange:%" PRId64 "-%" PRId64 ", order:%d, group:%d", pQInfo, pSupporter->rawSKey,
pSupporter->rawEKey, pQuery->order.order, pSupporter->pSidSet->numOfSubSet);
dTrace("QInfo:%p main query scan start", pQInfo);
int64_t st = taosGetTimestampMs();
doOrderedScan(pQInfo);
int64_t et = taosGetTimestampMs();
dTrace("QInfo:%p main scan completed, elapsed time: %lldms, supplementary scan start, order:%d", pQInfo, et - st,
pQuery->order.order ^ 1u);
if (pQuery->intervalTime > 0) {
for (int32_t i = 0; i < pSupporter->numOfMeters; ++i) {
SMeterQueryInfo *pMeterQueryInfo = pSupporter->pMeterDataInfo[i].pMeterQInfo;
closeAllTimeWindow(&pMeterQueryInfo->windowResInfo);
}
} else { // close results for group result
closeAllTimeWindow(&pRuntimeEnv->windowResInfo);
}
doMultiMeterSupplementaryScan(pQInfo);
if (isQueryKilled(pQInfo)) {
dTrace("QInfo:%p query killed, abort", pQInfo);
return;
}
if (pQuery->intervalTime > 0 || isSumAvgRateQuery(pQuery)) {
assert(pSupporter->subgroupIdx == 0 && pSupporter->numOfGroupResultPages == 0);
if (mergeMetersResultToOneGroups(pSupporter) == TSDB_CODE_SUCCESS) {
copyResToQueryResultBuf(pSupporter, pQuery);
#ifdef _DEBUG_VIEW
displayInterResult(pQuery->sdata, pQuery, pQuery->sdata[0]->len);
#endif
}
  } else {  // not an interval query
copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
}
// handle the limitation of output buffer
pQInfo->pointsRead += pQuery->pointsRead;
dTrace("QInfo:%p points returned:%d, totalRead:%d totalReturn:%d", pQInfo, pQuery->pointsRead, pQInfo->pointsRead,
pQInfo->pointsReturned);
}
/*
* in each query, this function will be called only once, no retry for further result.
*
* select count(*)/top(field,k)/avg(field name) from table_name [where ts>now-1a];
* select count(*) from table_name group by status_column;
*/
static void vnodeSingleTableFixedOutputProcessor(SQInfo *pQInfo) {
SQuery * pQuery = &pQInfo->query;
SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->pTableQuerySupporter->runtimeEnv;
assert(pQuery->slot >= 0 && pQuery->pos >= 0);
vnodeScanAllData(pRuntimeEnv);
doFinalizeResult(pRuntimeEnv);
if (isQueryKilled(pQInfo)) {
return;
}
  // the numOfOutputElems must be identical for all sql functions that are allowed to be executed simultaneously.
pQuery->pointsRead = getNumOfResult(pRuntimeEnv);
assert(pQuery->pointsRead <= pQuery->pointsToRead &&
Q_STATUS_EQUAL(pQuery->over, QUERY_COMPLETED | QUERY_NO_DATA_TO_CHECK));
// must be top/bottom query if offset > 0
if (pQuery->limit.offset > 0) {
assert(isTopBottomQuery(pQuery));
}
doSkipResults(pRuntimeEnv);
doRevisedResultsByLimit(pQInfo);
pQInfo->pointsRead = pQuery->pointsRead;
}
static void vnodeSingleTableMultiOutputProcessor(SQInfo *pQInfo) {
SQuery * pQuery = &pQInfo->query;
SMeterObj *pMeterObj = pQInfo->pObj;
SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->pTableQuerySupporter->runtimeEnv;
  // for the ts_comp query, re-initialization is not allowed
if (!isTSCompQuery(pQuery)) {
resetCtxOutputBuf(pRuntimeEnv);
}
while (1) {
vnodeScanAllData(pRuntimeEnv);
doFinalizeResult(pRuntimeEnv);
if (isQueryKilled(pQInfo)) {
return;
}
pQuery->pointsRead = getNumOfResult(pRuntimeEnv);
if (pQuery->limit.offset > 0 && pQuery->numOfFilterCols > 0 && pQuery->pointsRead > 0) {
doSkipResults(pRuntimeEnv);
}
/*
* 1. if pQuery->pointsRead == 0, pQuery->limit.offset >= 0, still need to check data
* 2. if pQuery->pointsRead > 0, pQuery->limit.offset must be 0
*/
if (pQuery->pointsRead > 0 || Q_STATUS_EQUAL(pQuery->over, QUERY_COMPLETED | QUERY_NO_DATA_TO_CHECK)) {
break;
}
TSKEY nextTimestamp = loadRequiredBlockIntoMem(pRuntimeEnv, &pRuntimeEnv->nextPos);
assert(nextTimestamp > 0 || ((nextTimestamp < 0) && Q_STATUS_EQUAL(pQuery->over, QUERY_NO_DATA_TO_CHECK)));
dTrace("QInfo:%p vid:%d sid:%d id:%s, skip current result, offset:%" PRId64 ", next qrange:%" PRId64 "-%" PRId64,
pQInfo, pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->limit.offset, pQuery->lastKey,
pQuery->ekey);
resetCtxOutputBuf(pRuntimeEnv);
}
doRevisedResultsByLimit(pQInfo);
pQInfo->pointsRead += pQuery->pointsRead;
if (Q_STATUS_EQUAL(pQuery->over, QUERY_RESBUF_FULL)) {
TSKEY nextTimestamp = loadRequiredBlockIntoMem(pRuntimeEnv, &pRuntimeEnv->nextPos);
assert(nextTimestamp > 0 || ((nextTimestamp < 0) && Q_STATUS_EQUAL(pQuery->over, QUERY_NO_DATA_TO_CHECK)));
dTrace("QInfo:%p vid:%d sid:%d id:%s, query abort due to buffer limitation, next qrange:%" PRId64 "-%" PRId64,
pQInfo, pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->lastKey, pQuery->ekey);
}
dTrace("QInfo:%p vid:%d sid:%d id:%s, %d points returned, totalRead:%d totalReturn:%d", pQInfo, pMeterObj->vnode,
pMeterObj->sid, pMeterObj->meterId, pQuery->pointsRead, pQInfo->pointsRead, pQInfo->pointsReturned);
pQuery->pointsOffset = pQuery->pointsToRead; // restore the available buffer
if (!isTSCompQuery(pQuery)) {
assert(pQuery->pointsRead <= pQuery->pointsToRead);
}
}
static void vnodeSingleMeterIntervalMainLooper(STableQuerySupportObj *pSupporter, SQueryRuntimeEnv *pRuntimeEnv) {
SQuery *pQuery = pRuntimeEnv->pQuery;
while (1) {
initCtxOutputBuf(pRuntimeEnv);
vnodeScanAllData(pRuntimeEnv);
if (isQueryKilled(pQInfo)) {
return;
}
assert(!Q_STATUS_EQUAL(pQuery->over, QUERY_NOT_COMPLETED));
doFinalizeResult(pRuntimeEnv);
// here we can ignore the records in case of no interpolation
// todo handle offset, in case of top/bottom interval query
if ((pQuery->numOfFilterCols > 0 || pRuntimeEnv->pTSBuf != NULL) && pQuery->limit.offset > 0 &&
pQuery->interpoType == TSDB_INTERPO_NONE) {
      // maxOutput <= 0 means that the current query does not generate any results
int32_t numOfClosed = numOfClosedTimeWindow(&pRuntimeEnv->windowResInfo);
int32_t c = MIN(numOfClosed, pQuery->limit.offset);
clearFirstNTimeWindow(pRuntimeEnv, c);
pQuery->limit.offset -= c;
}
if (Q_STATUS_EQUAL(pQuery->over, QUERY_NO_DATA_TO_CHECK | QUERY_COMPLETED)) {
break;
}
// load the data block for the next retrieve
loadRequiredBlockIntoMem(pRuntimeEnv, &pRuntimeEnv->nextPos);
if (Q_STATUS_EQUAL(pQuery->over, QUERY_RESBUF_FULL)) {
break;
}
}
}
/* handle time interval query on single table */
static void vnodeSingleTableIntervalProcessor(SQInfo *pQInfo) {
SQuery * pQuery = &(pQInfo->query);
SMeterObj *pMeterObj = pQInfo->pObj;
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQueryRuntimeEnv * pRuntimeEnv = &pSupporter->runtimeEnv;
int32_t numOfInterpo = 0;
while (1) {
resetCtxOutputBuf(pRuntimeEnv);
vnodeSingleMeterIntervalMainLooper(pSupporter, pRuntimeEnv);
if (pQuery->intervalTime > 0) {
pSupporter->subgroupIdx = 0; // always start from 0
pQuery->pointsRead = 0;
copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
clearFirstNTimeWindow(pRuntimeEnv, pSupporter->subgroupIdx);
}
// the offset is handled at prepare stage if no interpolation involved
if (pQuery->interpoType == TSDB_INTERPO_NONE) {
doRevisedResultsByLimit(pQInfo);
break;
} else {
taosInterpoSetStartInfo(&pRuntimeEnv->interpoInfo, pQuery->pointsRead, pQuery->interpoType);
SData **pInterpoBuf = pRuntimeEnv->pInterpoBuf;
for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
memcpy(pInterpoBuf[i]->data, pQuery->sdata[i]->data, pQuery->pointsRead * pQuery->pSelectExpr[i].resBytes);
}
numOfInterpo = 0;
pQuery->pointsRead = vnodeQueryResultInterpolate(pQInfo, (tFilePage **)pQuery->sdata, (tFilePage **)pInterpoBuf,
pQuery->pointsRead, &numOfInterpo);
dTrace("QInfo: %p interpo completed, final:%d", pQInfo, pQuery->pointsRead);
if (pQuery->pointsRead > 0 || Q_STATUS_EQUAL(pQuery->over, QUERY_COMPLETED | QUERY_NO_DATA_TO_CHECK)) {
doRevisedResultsByLimit(pQInfo);
break;
}
// no result generated yet, continue retrieve data
pQuery->pointsRead = 0;
}
}
  // all data has been scanned, so the group-by on normal columns query can now return its results
  if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {  // todo refactor with the merged interval time result
pSupporter->subgroupIdx = 0;
pQuery->pointsRead = 0;
copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
clearFirstNTimeWindow(pRuntimeEnv, pSupporter->subgroupIdx);
}
pQInfo->pointsRead += pQuery->pointsRead;
pQInfo->pointsInterpo += numOfInterpo;
dTrace("%p vid:%d sid:%d id:%s, %d points returned %d points interpo, totalRead:%d totalInterpo:%d totalReturn:%d",
pQInfo, pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->pointsRead, numOfInterpo,
pQInfo->pointsRead - pQInfo->pointsInterpo, pQInfo->pointsInterpo, pQInfo->pointsReturned);
}
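/*
 * Scheduled entry point for a single-table, non-projection query. The handling order below is:
 * (1) return results still pending from interpolation; (2) if the scan is already complete,
 * flush any remaining group/interval results and mark the query as over; (3) otherwise dispatch
 * to the interval, fixed-output, or multi-output processor.
 */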
void vnodeSingleTableQuery(SSchedMsg *pMsg) {
SQInfo *pQInfo = (SQInfo *)pMsg->ahandle;
if (pQInfo == NULL || pQInfo->pTableQuerySupporter == NULL) {
dTrace("%p freed abort query", pQInfo);
return;
}
if (pQInfo->killed) {
dTrace("QInfo:%p it is already killed, abort", pQInfo);
vnodeDecRefCount(pQInfo);
return;
}
assert(pQInfo->refCount >= 1);
SQuery * pQuery = &pQInfo->query;
SMeterObj * pMeterObj = pQInfo->pObj;
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
SQueryRuntimeEnv * pRuntimeEnv = &pSupporter->runtimeEnv;
assert(pRuntimeEnv->pMeterObj == pMeterObj);
dTrace("vid:%d sid:%d id:%s, query thread is created, numOfQueries:%d, QInfo:%p", pMeterObj->vnode, pMeterObj->sid,
pMeterObj->meterId, pMeterObj->numOfQueries, pQInfo);
if (vnodeHasRemainResults(pQInfo)) {
    /*
     * There are remaining results that have not been returned due to result interpolation,
     * so we stay in this procedure instead of launching the retrieve procedure for the next results.
     */
int32_t numOfInterpo = 0;
int32_t remain = taosNumOfRemainPoints(&pRuntimeEnv->interpoInfo);
pQuery->pointsRead = vnodeQueryResultInterpolate(pQInfo, (tFilePage **)pQuery->sdata,
(tFilePage **)pRuntimeEnv->pInterpoBuf, remain, &numOfInterpo);
doRevisedResultsByLimit(pQInfo);
pQInfo->pointsInterpo += numOfInterpo;
pQInfo->pointsRead += pQuery->pointsRead;
dTrace(
"QInfo:%p vid:%d sid:%d id:%s, %d points returned %d points interpo, totalRead:%d totalInterpo:%d "
"totalReturn:%d",
pQInfo, pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->pointsRead, numOfInterpo,
pQInfo->pointsRead, pQInfo->pointsInterpo, pQInfo->pointsReturned);
sem_post(&pQInfo->dataReady);
vnodeDecRefCount(pQInfo);
return;
}
  // at this point, all qualified data in both the data files and the cache has been scanned
if (Q_STATUS_EQUAL(pQuery->over, QUERY_NO_DATA_TO_CHECK | QUERY_COMPLETED)) {
    // continue to push data from the group result
if (isGroupbyNormalCol(pQuery->pGroupbyExpr) ||
(pQuery->intervalTime > 0 && pQInfo->pointsReturned < pQuery->limit.limit)) {
//todo limit the output for interval query?
pQuery->pointsRead = 0;
pSupporter->subgroupIdx = 0; // always start from 0
if (pRuntimeEnv->windowResInfo.size > 0) {
copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
pQInfo->pointsRead += pQuery->pointsRead;
clearFirstNTimeWindow(pRuntimeEnv, pSupporter->subgroupIdx);
if (pQuery->pointsRead > 0) {
dTrace("QInfo:%p vid:%d sid:%d id:%s, %d points returned %d from group results, totalRead:%d totalReturn:%d",
pQInfo, pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->pointsRead, pQInfo->pointsRead,
pQInfo->pointsInterpo, pQInfo->pointsReturned);
sem_post(&pQInfo->dataReady);
vnodeDecRefCount(pQInfo);
return;
}
}
}
pQInfo->over = 1;
dTrace("QInfo:%p vid:%d sid:%d id:%s, query over, %d points are returned", pQInfo, pMeterObj->vnode, pMeterObj->sid,
pMeterObj->meterId, pQInfo->pointsRead);
vnodePrintQueryStatistics(pSupporter);
sem_post(&pQInfo->dataReady);
vnodeDecRefCount(pQInfo);
return;
}
/* number of points returned during this query */
pQuery->pointsRead = 0;
assert(pQuery->pos >= 0 && pQuery->slot >= 0);
int64_t st = taosGetTimestampUs();
  // group-by on normal columns, sliding window queries and interval queries are all handled by the interval query processor
if (pQuery->intervalTime != 0 || isGroupbyNormalCol(pQuery->pGroupbyExpr)) { // interval (down sampling operation)
assert(pQuery->checkBufferInLoop == 0 && pQuery->pointsOffset == pQuery->pointsToRead);
vnodeSingleTableIntervalProcessor(pQInfo);
} else {
if (isFixedOutputQuery(pQuery)) {
assert(pQuery->checkBufferInLoop == 0);
vnodeSingleTableFixedOutputProcessor(pQInfo);
} else { // diff/add/multiply/subtract/division
assert(pQuery->checkBufferInLoop == 1);
vnodeSingleTableMultiOutputProcessor(pQInfo);
}
}
// record the total elapsed time
pQInfo->useconds += (taosGetTimestampUs() - st);
/* check if query is killed or not */
if (isQueryKilled(pQInfo)) {
dTrace("QInfo:%p query is killed", pQInfo);
pQInfo->over = 1;
} else {
dTrace("QInfo:%p vid:%d sid:%d id:%s, meter query thread completed, %d points are returned", pQInfo,
pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId, pQuery->pointsRead);
}
sem_post(&pQInfo->dataReady);
vnodeDecRefCount(pQInfo);
}
void vnodeMultiMeterQuery(SSchedMsg *pMsg) {
SQInfo *pQInfo = (SQInfo *)pMsg->ahandle;
if (pQInfo == NULL || pQInfo->pTableQuerySupporter == NULL) {
return;
}
if (pQInfo->killed) {
vnodeDecRefCount(pQInfo);
dTrace("QInfo:%p it is already killed, abort", pQInfo);
return;
}
assert(pQInfo->refCount >= 1);
SQuery *pQuery = &pQInfo->query;
pQuery->pointsRead = 0;
int64_t st = taosGetTimestampUs();
if (pQuery->intervalTime > 0 ||
(isFixedOutputQuery(pQuery) && (!isPointInterpoQuery(pQuery)) && !isGroupbyNormalCol(pQuery->pGroupbyExpr))) {
assert(pQuery->checkBufferInLoop == 0);
vnodeMultiMeterQueryProcessor(pQInfo);
} else {
assert((pQuery->checkBufferInLoop == 1 && pQuery->intervalTime == 0) || isPointInterpoQuery(pQuery) ||
isGroupbyNormalCol(pQuery->pGroupbyExpr));
vnodeSTableSeqProcessor(pQInfo);
}
/* record the total elapsed time */
pQInfo->useconds += (taosGetTimestampUs() - st);
pQInfo->over = isQueryKilled(pQInfo) ? 1 : 0;
taosInterpoSetStartInfo(&pQInfo->pTableQuerySupporter->runtimeEnv.interpoInfo, pQuery->pointsRead,
pQInfo->query.interpoType);
STableQuerySupportObj *pSupporter = pQInfo->pTableQuerySupporter;
if (pQuery->pointsRead == 0) {
pQInfo->over = 1;
dTrace("QInfo:%p over, %d meters queried, %d points are returned", pQInfo, pSupporter->numOfMeters,
pQInfo->pointsRead);
vnodePrintQueryStatistics(pSupporter);
}
sem_post(&pQInfo->dataReady);
vnodeDecRefCount(pQInfo);
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "hash.h"
#include "hashfunc.h"
#include "ihash.h"
#include "qast.h"
#include "qextbuffer.h"
#include "taosmsg.h"
#include "tscJoinProcess.h"
#include "tscompression.h"
#include "vnode.h"
#include "vnodeRead.h"
#include "vnodeUtil.h"
int (*pQueryFunc[])(SMeterObj *, SQuery *) = {vnodeQueryFromCache, vnodeQueryFromFile};
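/*
 * Interpolation search over a sorted TSKEY array: instead of probing the middle element, the
 * probe position is estimated proportionally to where the key falls between keyList[firstPos]
 * and keyList[lastPos]. This converges faster than plain binary search when timestamps are
 * roughly evenly spaced. The order argument selects the same descending/ascending semantics as
 * vnodeBinarySearchKey below.
 */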
int vnodeInterpolationSearchKey(char *pValue, int num, TSKEY key, int order) {
int firstPos, lastPos, midPos = -1;
int delta, numOfPoints;
TSKEY *keyList;
keyList = (TSKEY *)pValue;
firstPos = 0;
lastPos = num - 1;
if (order == 0) {
// from latest to oldest
while (1) {
if (key >= keyList[lastPos]) return lastPos;
if (key == keyList[firstPos]) return firstPos;
if (key < keyList[firstPos]) return firstPos - 1;
numOfPoints = lastPos - firstPos + 1;
delta = keyList[lastPos] - keyList[firstPos];
midPos = (key - keyList[firstPos]) / delta * numOfPoints + firstPos;
if (key < keyList[midPos]) {
lastPos = midPos - 1;
} else if (key > keyList[midPos]) {
firstPos = midPos + 1;
} else {
break;
}
}
} else {
// from oldest to latest
while (1) {
if (key <= keyList[firstPos]) return firstPos;
if (key == keyList[lastPos]) return lastPos;
if (key > keyList[lastPos]) {
lastPos = lastPos + 1;
if (lastPos >= num) return -1;
}
numOfPoints = lastPos - firstPos + 1;
delta = keyList[lastPos] - keyList[firstPos];
midPos = (key - keyList[firstPos]) / delta * numOfPoints + firstPos;
if (key < keyList[midPos]) {
lastPos = midPos - 1;
} else if (key > keyList[midPos]) {
firstPos = midPos + 1;
} else {
break;
}
}
}
return midPos;
}
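/*
 * Binary search over a sorted TSKEY array. With order == 0 it returns the last position whose
 * key is less than or equal to the given key (or -1 if the key precedes all entries); otherwise
 * it returns the first position whose key is greater than or equal to the given key (or -1 if
 * the key is beyond all entries).
 */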
int vnodeBinarySearchKey(char *pValue, int num, TSKEY key, int order) {
int firstPos, lastPos, midPos = -1;
int numOfPoints;
TSKEY *keyList;
if (num <= 0) return -1;
keyList = (TSKEY *)pValue;
firstPos = 0;
lastPos = num - 1;
if (order == 0) {
    // return the last position whose key is less than or equal to the given key
while (1) {
if (key >= keyList[lastPos]) return lastPos;
if (key == keyList[firstPos]) return firstPos;
if (key < keyList[firstPos]) return firstPos - 1;
numOfPoints = lastPos - firstPos + 1;
midPos = (numOfPoints >> 1) + firstPos;
if (key < keyList[midPos]) {
lastPos = midPos - 1;
} else if (key > keyList[midPos]) {
firstPos = midPos + 1;
} else {
break;
}
}
} else {
    // return the first position whose key is greater than or equal to the given key
while (1) {
if (key <= keyList[firstPos]) return firstPos;
if (key == keyList[lastPos]) return lastPos;
if (key > keyList[lastPos]) {
lastPos = lastPos + 1;
if (lastPos >= num)
return -1;
else
return lastPos;
}
numOfPoints = lastPos - firstPos + 1;
midPos = (numOfPoints >> 1) + firstPos;
if (key < keyList[midPos]) {
lastPos = midPos - 1;
} else if (key > keyList[midPos]) {
firstPos = midPos + 1;
} else {
break;
}
}
}
return midPos;
}
int (*vnodeSearchKeyFunc[])(char *pValue, int num, TSKEY key, int order) = {vnodeBinarySearchKey,
vnodeInterpolationSearchKey};
static SQInfo *vnodeAllocateQInfoCommon(SQueryMeterMsg *pQueryMsg, SMeterObj *pMeterObj, SSqlFunctionExpr *pExprs) {
SQInfo *pQInfo = (SQInfo *)calloc(1, sizeof(SQInfo));
if (pQInfo == NULL) {
return NULL;
}
SQuery *pQuery = &(pQInfo->query);
SColumnInfo *colList = pQueryMsg->colList;
short numOfCols = pQueryMsg->numOfCols;
short numOfOutputCols = pQueryMsg->numOfOutputCols;
pQuery->numOfCols = numOfCols;
pQuery->numOfOutputCols = numOfOutputCols;
pQuery->limit.limit = pQueryMsg->limit;
pQuery->limit.offset = pQueryMsg->offset;
pQuery->order.order = pQueryMsg->order;
pQuery->order.orderColId = pQueryMsg->orderColId;
pQuery->colList = calloc(1, sizeof(SSingleColumnFilterInfo) * numOfCols);
if (pQuery->colList == NULL) {
goto _clean_memory;
}
for (int16_t i = 0; i < numOfCols; ++i) {
    pQuery->colList[i].req[0] = 1;  // column required during the master scan of data blocks
pQuery->colList[i].colIdxInBuf = i;
pQuery->colList[i].data = colList[i];
SColumnInfo *pColInfo = &pQuery->colList[i].data;
pColInfo->filters = NULL;
if (colList[i].numOfFilters > 0) {
pColInfo->filters = calloc(1, colList[i].numOfFilters * sizeof(SColumnFilterInfo));
for (int32_t j = 0; j < colList[i].numOfFilters; ++j) {
tscColumnFilterInfoCopy(&pColInfo->filters[j], &colList[i].filters[j]);
}
} else {
pQuery->colList[i].data.filters = NULL;
}
}
vnodeUpdateQueryColumnIndex(pQuery, pMeterObj);
for (int16_t col = 0; col < numOfOutputCols; ++col) {
assert(pExprs[col].resBytes > 0);
pQuery->rowSize += pExprs[col].resBytes;
if (TSDB_COL_IS_TAG(pExprs[col].pBase.colInfo.flag)) {
continue;
}
int16_t colId = pExprs[col].pBase.colInfo.colId;
int16_t functId = pExprs[col].pBase.functionId;
    // map this output column to its position in the data buffer and to the real column index
for (int32_t k = 0; k < numOfCols; ++k) {
if (pQuery->colList[k].data.colId == colId) {
pExprs[col].pBase.colInfo.colIdxInBuf = (int16_t)k;
pExprs[col].pBase.colInfo.colIdx = pQuery->colList[k].colIdx;
if (((functId == TSDB_FUNC_FIRST_DST || functId == TSDB_FUNC_FIRST) && pQuery->order.order == TSQL_SO_DESC) ||
((functId == TSDB_FUNC_LAST_DST || functId == TSDB_FUNC_LAST) && pQuery->order.order == TSQL_SO_ASC)) {
pQuery->colList[k].req[1] = 1;
} else if (functId == TSDB_FUNC_STDDEV) {
pQuery->colList[k].req[1] = 1;
}
break;
}
}
}
pQuery->pSelectExpr = pExprs;
int32_t ret = vnodeCreateFilterInfo(pQInfo, pQuery);
if (ret != TSDB_CODE_SUCCESS) {
goto _clean_memory;
}
vnodeUpdateFilterColumnIndex(pQuery);
pQuery->precision = vnodeList[pMeterObj->vnode].cfg.precision;
return pQInfo;
_clean_memory:
tfree(pQuery->pFilterInfo);
tfree(pQuery->colList);
tfree(pQInfo);
return NULL;
}
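/*
 * Extended QInfo allocator, used for aggregation/interval queries and for any query that carries
 * a compressed timestamp list (see the dispatch in vnodeQueryOnSingleTable): besides the common
 * setup it copies the group-by/interval/interpolation parameters and allocates one output buffer
 * per output column, sized to also hold the usually larger intermediate results.
 */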
static SQInfo *vnodeAllocateQInfoEx(SQueryMeterMsg *pQueryMsg, SSqlGroupbyExpr *pGroupbyExpr, SSqlFunctionExpr *pExprs,
SMeterObj *pMeterObj) {
SQInfo *pQInfo = vnodeAllocateQInfoCommon(pQueryMsg, pMeterObj, pExprs);
if (pQInfo == NULL) {
tfree(pExprs);
tfree(pGroupbyExpr);
return NULL;
}
SQuery *pQuery = &(pQInfo->query);
/* pQuery->sdata is the results output buffer. */
pQuery->sdata = (SData **)calloc(pQuery->numOfOutputCols, sizeof(SData *));
if (pQuery->sdata == NULL) {
goto sign_clean_memory;
}
pQuery->pGroupbyExpr = pGroupbyExpr;
pQuery->intervalTime = pQueryMsg->intervalTime;
pQuery->slidingTime = pQueryMsg->slidingTime;
pQuery->interpoType = pQueryMsg->interpoType;
pQuery->intervalTimeUnit = pQueryMsg->intervalTimeUnit;
pQInfo->query.pointsToRead = vnodeList[pMeterObj->vnode].cfg.rowsInFileBlock;
for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
assert(pExprs[col].interResBytes >= pExprs[col].resBytes);
    // allocate additional memory for intermediate results, which are usually larger than the final results
size_t size = (pQInfo->query.pointsToRead + 1) * pExprs[col].resBytes + pExprs[col].interResBytes + sizeof(SData);
pQuery->sdata[col] = (SData *)calloc(1, size);
if (pQuery->sdata[col] == NULL) {
goto sign_clean_memory;
}
}
if (pQuery->interpoType != TSDB_INTERPO_NONE) {
pQuery->defaultVal = malloc(sizeof(int64_t) * pQuery->numOfOutputCols);
if (pQuery->defaultVal == NULL) {
goto sign_clean_memory;
}
// the first column is the timestamp
memcpy(pQuery->defaultVal, (char *)pQueryMsg->defaultVal, pQuery->numOfOutputCols * sizeof(int64_t));
}
// to make sure third party won't overwrite this structure
pQInfo->signature = (uint64_t)pQInfo;
pQInfo->pObj = pMeterObj;
pQuery->slot = -1;
pQuery->pos = -1;
pQuery->hfd = -1;
pQuery->dfd = -1;
pQuery->lfd = -1;
dTrace("vid:%d sid:%d meterId:%s, QInfo is allocated:%p", pMeterObj->vnode, pMeterObj->sid, pMeterObj->meterId,
pQInfo);
return pQInfo;
sign_clean_memory:
tfree(pQuery->defaultVal);
if (pQuery->sdata != NULL) {
for (int16_t col = 0; col < pQuery->numOfOutputCols; ++col) {
tfree(pQuery->sdata[col]);
}
}
tfree(pQuery->sdata);
tfree(pQuery->pFilterInfo);
tfree(pQuery->colList);
tfree(pExprs);
tfree(pGroupbyExpr);
tfree(pQInfo);
return NULL;
}
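/*
 * Plain QInfo allocator for simple projection queries that read directly from cache/file.
 * Each output column gets a buffer of twice the block size; judging from vnodeSaveQueryResult,
 * which toggles pQInfo->bufIndex between 0 and 1, the two halves act as a double buffer so that
 * one half can be returned to the client while the next batch is being produced.
 */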
SQInfo *vnodeAllocateQInfo(SQueryMeterMsg *pQueryMsg, SMeterObj *pObj, SSqlFunctionExpr *pExprs) {
SQInfo *pQInfo = vnodeAllocateQInfoCommon(pQueryMsg, pObj, pExprs);
if (pQInfo == NULL) {
tfree(pExprs);
return NULL;
}
SQuery *pQuery = &(pQInfo->query);
pQuery->sdata = (SData **)calloc(1, sizeof(SData *) * pQuery->numOfOutputCols);
if (pQuery->sdata == NULL) {
goto __clean_memory;
}
size_t size = 0;
int32_t numOfRows = vnodeList[pObj->vnode].cfg.rowsInFileBlock;
for (int col = 0; col < pQuery->numOfOutputCols; ++col) {
size = 2 * (numOfRows * pQuery->pSelectExpr[col].resBytes + sizeof(SData));
pQuery->sdata[col] = (SData *)malloc(size);
if (pQuery->sdata[col] == NULL) {
goto __clean_memory;
}
}
if (pQuery->colList[0].data.colId != PRIMARYKEY_TIMESTAMP_COL_INDEX) {
size = 2 * (numOfRows * TSDB_KEYSIZE + sizeof(SData));
pQuery->tsData = (SData *)malloc(size);
if (pQuery->tsData == NULL) {
goto __clean_memory;
}
}
// to make sure third party won't overwrite this structure
pQInfo->signature = (uint64_t)pQInfo;
pQInfo->pObj = pObj;
pQuery->slot = -1;
pQuery->hfd = -1;
pQuery->dfd = -1;
pQuery->lfd = -1;
pQuery->pos = -1;
pQuery->interpoType = TSDB_INTERPO_NONE;
dTrace("vid:%d sid:%d meterId:%s, QInfo is allocated:%p", pObj->vnode, pObj->sid, pObj->meterId, pQInfo);
return pQInfo;
__clean_memory:
tfree(pQuery->tsData);
if (pQuery->sdata != NULL) {
for (int col = 0; col < pQuery->numOfOutputCols; ++col) {
tfree(pQuery->sdata[col]);
}
}
tfree(pQuery->sdata);
tfree(pQuery->pFilterInfo);
tfree(pQuery->colList);
tfree(pExprs);
tfree(pQInfo);
return NULL;
}
void vnodeFreeQInfoInQueue(void *param) {
SQInfo *pQInfo = (SQInfo *)param;
if (!vnodeIsQInfoValid(pQInfo)) return;
pQInfo->killed = 1;
dTrace("QInfo:%p set kill flag to free QInfo");
vnodeDecRefCount(pQInfo);
}
void vnodeFreeQInfo(void *param, bool decQueryRef) {
SQInfo *pQInfo = (SQInfo *)param;
if (!vnodeIsQInfoValid(param)) return;
pQInfo->killed = 1;
SMeterObj *pObj = pQInfo->pObj;
dTrace("QInfo:%p start to free SQInfo", pQInfo);
if (decQueryRef) {
vnodeDecMeterRefcnt(pQInfo);
}
SQuery *pQuery = &(pQInfo->query);
tclose(pQuery->hfd);
tclose(pQuery->dfd);
tclose(pQuery->lfd);
vnodeFreeFields(pQuery);
tfree(pQuery->pBlock);
for (int col = 0; col < pQuery->numOfOutputCols; ++col) {
tfree(pQuery->sdata[col]);
}
for (int col = 0; col < pQuery->numOfCols; ++col) {
vnodeFreeColumnInfo(&pQuery->colList[col].data);
}
if (pQuery->colList[0].colIdx != PRIMARYKEY_TIMESTAMP_COL_INDEX) {
tfree(pQuery->tsData);
}
sem_destroy(&(pQInfo->dataReady));
vnodeQueryFreeQInfoEx(pQInfo);
for (int32_t i = 0; i < pQuery->numOfFilterCols; ++i) {
SSingleColumnFilterInfo *pColFilter = &pQuery->pFilterInfo[i];
if (pColFilter->numOfFilters > 0) {
tfree(pColFilter->pFilters);
}
}
tfree(pQuery->pFilterInfo);
tfree(pQuery->colList);
tfree(pQuery->sdata);
if (pQuery->pSelectExpr != NULL) {
for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
SSqlBinaryExprInfo *pBinExprInfo = &pQuery->pSelectExpr[i].pBinExprInfo;
if (pBinExprInfo->numOfCols > 0) {
tfree(pBinExprInfo->pReqColumns);
tSQLBinaryExprDestroy(&pBinExprInfo->pBinExpr, NULL);
}
}
tfree(pQuery->pSelectExpr);
}
if (pQuery->defaultVal != NULL) {
tfree(pQuery->defaultVal);
}
tfree(pQuery->pGroupbyExpr);
dTrace("QInfo:%p vid:%d sid:%d meterId:%s, QInfo is freed", pQInfo, pObj->vnode, pObj->sid, pObj->meterId);
  // destroy the signature so that the query process cannot pass the object safety check afterwards
memset(pQInfo, 0, sizeof(SQInfo));
tfree(pQInfo);
}
bool vnodeIsQInfoValid(void *param) {
SQInfo *pQInfo = (SQInfo *)param;
if (pQInfo == NULL) {
return false;
}
/*
* pQInfo->signature may be changed by another thread, so we assign value of signature
* into local variable, then compare by using local variable
*/
uint64_t sig = pQInfo->signature;
return (sig == (uint64_t)pQInfo);
}
void vnodeDecRefCount(void *param) {
SQInfo *pQInfo = (SQInfo*) param;
assert(vnodeIsQInfoValid(pQInfo));
int32_t ref = atomic_sub_fetch_32(&pQInfo->refCount, 1);
assert(ref >= 0);
dTrace("QInfo:%p decrease obj refcount, %d", pQInfo, ref);
if (ref == 0) {
vnodeFreeQInfo(pQInfo, true);
}
}
void vnodeAddRefCount(void *param) {
SQInfo *pQInfo = (SQInfo*) param;
assert(vnodeIsQInfoValid(pQInfo));
int32_t ref = atomic_add_fetch_32(&pQInfo->refCount, 1);
dTrace("QInfo:%p add refcount, %d", pQInfo, ref);
}
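/*
 * Worker routine for simple projection queries. It repeatedly calls pQInfo->fp (the cache or
 * file reader selected from pQueryFunc), caps pointsToRead so that the LIMIT clause is honored,
 * and, once the first medium is exhausted, switches to the other one exactly once, tracked by
 * pQInfo->changed.
 */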
void vnodeQueryData(SSchedMsg *pMsg) {
SQuery *pQuery;
SQInfo *pQInfo;
pQInfo = (SQInfo *)pMsg->ahandle;
if (pQInfo->killed) {
dTrace("QInfo:%p it is already killed, abort", pQInfo);
vnodeDecRefCount(pQInfo);
return;
}
pQuery = &(pQInfo->query);
SMeterObj *pObj = pQInfo->pObj;
dTrace("QInfo:%p vid:%d sid:%d id:%s, query thread is created, numOfQueries:%d, func:%s", pQInfo, pObj->vnode,
pObj->sid, pObj->meterId, pObj->numOfQueries, __FUNCTION__);
pQuery->pointsToRead = vnodeList[pObj->vnode].cfg.rowsInFileBlock;
pQuery->pointsOffset = pQInfo->bufIndex * pQuery->pointsToRead;
int64_t st = taosGetTimestampUs();
while (1) {
int64_t potentNumOfRes = pQInfo->pointsRead + pQuery->pointsToRead;
    /* cap pointsToRead so that the result does not exceed the LIMIT clause */
if (pQuery->limit.limit > 0 && potentNumOfRes > pQuery->limit.limit) {
pQuery->pointsToRead = pQuery->limit.limit - pQInfo->pointsRead;
if (pQuery->pointsToRead == 0) {
/* reach the limitation, abort */
pQuery->pointsRead = 0;
pQInfo->over = 1;
break;
}
}
pQInfo->code = (*pQInfo->fp)(pObj, pQuery); // <0:error
// has read at least one point
if (pQuery->pointsRead > 0 || pQInfo->code < 0) break;
if (pQuery->pointsRead == 0 && pQuery->over == 0) continue;
if (pQInfo->changed) {
pQInfo->over = 1;
break;
}
// has read all data in file, check data in cache
pQInfo->fp = pQueryFunc[pQuery->order.order ^ 1];
pQInfo->changed = 1;
pQuery->slot = -1; // reset the handle
pQuery->over = 0;
dTrace("vid:%d sid:%d id:%s, query in other media, order:%d, skey:%" PRId64 " query:%p", pObj->vnode, pObj->sid,
pObj->meterId, pQuery->order.order, pQuery->skey, pQuery);
}
pQInfo->pointsRead += pQuery->pointsRead;
dTrace("vid:%d sid:%d id:%s, %d points returned, totalRead:%d totalReturn:%d last key:%" PRId64 ", query:%p", pObj->vnode,
pObj->sid, pObj->meterId, pQuery->pointsRead, pQInfo->pointsRead, pQInfo->pointsReturned, pQuery->lastKey,
pQuery);
int64_t et = taosGetTimestampUs();
pQInfo->useconds += et - st;
// close FDs as soon as possible
if (pQInfo->over) {
dTrace("vid:%d sid:%d id:%s, query over, %d points are returned", pObj->vnode, pObj->sid, pObj->meterId,
pQInfo->pointsRead);
tclose(pQInfo->query.hfd);
tclose(pQInfo->query.dfd);
tclose(pQInfo->query.lfd);
}
sem_post(&pQInfo->dataReady);
vnodeDecRefCount(pQInfo);
}
void *vnodeQueryOnSingleTable(SMeterObj **pMetersObj, SSqlGroupbyExpr *pGroupbyExpr, SSqlFunctionExpr *pSqlExprs,
SQueryMeterMsg *pQueryMsg, int32_t *code) {
SQInfo *pQInfo;
SQuery *pQuery;
SMeterObj *pMeterObj = pMetersObj[0];
bool isProjQuery = vnodeIsProjectionQuery(pSqlExprs, pQueryMsg->numOfOutputCols);
// todo pass the correct error code
if (isProjQuery && pQueryMsg->tsLen == 0) {
pQInfo = vnodeAllocateQInfo(pQueryMsg, pMeterObj, pSqlExprs);
} else {
pQInfo = vnodeAllocateQInfoEx(pQueryMsg, pGroupbyExpr, pSqlExprs, pMetersObj[0]);
}
if (pQInfo == NULL) {
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
goto _error;
}
pQuery = &(pQInfo->query);
dTrace("qmsg:%p create QInfo:%p, QInfo created", pQueryMsg, pQInfo);
SMeterSidExtInfo** pSids = (SMeterSidExtInfo**)pQueryMsg->pSidExtInfo;
if (pSids != NULL && pSids[0]->key > 0) {
pQuery->skey = pSids[0]->key;
} else {
pQuery->skey = pQueryMsg->skey;
}
pQuery->ekey = pQueryMsg->ekey;
pQuery->lastKey = pQuery->skey;
pQInfo->fp = pQueryFunc[pQueryMsg->order];
if (sem_init(&(pQInfo->dataReady), 0, 0) != 0) {
dError("QInfo:%p vid:%d sid:%d meterId:%s, init dataReady sem failed, reason:%s", pQInfo, pMeterObj->vnode,
pMeterObj->sid, pMeterObj->meterId, strerror(errno));
*code = TSDB_CODE_APP_ERROR;
goto _error;
}
SSchedMsg schedMsg = {0};
if (isProjQuery && pQueryMsg->tsLen == 0) {
schedMsg.fp = vnodeQueryData;
} else {
if (vnodeParametersSafetyCheck(pQuery) == false) {
*code = TSDB_CODE_APP_ERROR;
goto _error;
}
STableQuerySupportObj *pSupporter = (STableQuerySupportObj *)calloc(1, sizeof(STableQuerySupportObj));
pSupporter->numOfMeters = 1;
pSupporter->pMetersHashTable = taosHashInit(pSupporter->numOfMeters, taosIntHash_32, false);
taosHashPut(pSupporter->pMetersHashTable, (const char*) &pMetersObj[0]->sid, sizeof(pMeterObj[0].sid),
(char *)&pMetersObj[0], POINTER_BYTES);
pSupporter->pSidSet = NULL;
pSupporter->subgroupIdx = -1;
pSupporter->pMeterSidExtInfo = NULL;
pQInfo->pTableQuerySupporter = pSupporter;
STSBuf *pTSBuf = NULL;
if (pQueryMsg->tsLen > 0) {
// open new file to save the result
char *tsBlock = (char *)pQueryMsg + pQueryMsg->tsOffset;
pTSBuf = tsBufCreateFromCompBlocks(tsBlock, pQueryMsg->tsNumOfBlocks, pQueryMsg->tsLen, pQueryMsg->tsOrder);
tsBufResetPos(pTSBuf);
tsBufNextPos(pTSBuf);
}
if (((*code) = vnodeQueryTablePrepare(pQInfo, pQInfo->pObj, pSupporter, pTSBuf)) != TSDB_CODE_SUCCESS) {
goto _error;
}
if (pQInfo->over == 1) {
vnodeAddRefCount(pQInfo); // for retrieve procedure
return pQInfo;
}
schedMsg.fp = vnodeSingleTableQuery;
}
  /*
   * The reference count, which is 2, covers both the current query thread and the future retrieve request,
   * which will always be issued by the client to acquire data or to free the SQInfo struct.
   */
vnodeAddRefCount(pQInfo);
vnodeAddRefCount(pQInfo);
schedMsg.msg = NULL;
schedMsg.thandle = (void *)1;
schedMsg.ahandle = pQInfo;
dTrace("QInfo:%p set query flag and prepare runtime environment completed, ref:%d, wait for schedule", pQInfo,
pQInfo->refCount);
taosScheduleTask(tsQueryQhandle, &schedMsg);
return pQInfo;
_error:
  // the table query ref will be decreased during error handling
vnodeFreeQInfo(pQInfo, false);
return NULL;
}
/*
* query on multi-meters
*/
void *vnodeQueryOnMultiMeters(SMeterObj **pMetersObj, SSqlGroupbyExpr *pGroupbyExpr, SSqlFunctionExpr *pSqlExprs,
SQueryMeterMsg *pQueryMsg, int32_t *code) {
SQInfo *pQInfo;
SQuery *pQuery;
assert(QUERY_IS_STABLE_QUERY(pQueryMsg->queryType) && pQueryMsg->numOfCols > 0 && pQueryMsg->pSidExtInfo != 0 &&
pQueryMsg->numOfSids >= 1);
pQInfo = vnodeAllocateQInfoEx(pQueryMsg, pGroupbyExpr, pSqlExprs, *pMetersObj);
if (pQInfo == NULL) {
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
goto _error;
}
pQuery = &(pQInfo->query);
dTrace("qmsg:%p create QInfo:%p, QInfo created", pQueryMsg, pQInfo);
pQuery->skey = pQueryMsg->skey;
pQuery->ekey = pQueryMsg->ekey;
pQInfo->fp = pQueryFunc[pQueryMsg->order];
if (sem_init(&(pQInfo->dataReady), 0, 0) != 0) {
dError("QInfo:%p vid:%d sid:%d id:%s, init dataReady sem failed, reason:%s", pQInfo, pMetersObj[0]->vnode,
pMetersObj[0]->sid, pMetersObj[0]->meterId, strerror(errno));
*code = TSDB_CODE_APP_ERROR;
goto _error;
}
SSchedMsg schedMsg = {0};
STableQuerySupportObj *pSupporter = (STableQuerySupportObj *)calloc(1, sizeof(STableQuerySupportObj));
pSupporter->numOfMeters = pQueryMsg->numOfSids;
pSupporter->pMetersHashTable = taosHashInit(pSupporter->numOfMeters, taosIntHash_32, false);
for (int32_t i = 0; i < pSupporter->numOfMeters; ++i) {
taosHashPut(pSupporter->pMetersHashTable, (const char*) &pMetersObj[i]->sid, sizeof(pMetersObj[i]->sid), (char *)&pMetersObj[i],
POINTER_BYTES);
}
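  /*
   * copy the sid/tag list out of the request message into one private block: a pointer array of
   * numOfSids entries, followed by the packed records, each being an SMeterSidExtInfo header
   * immediately followed by its tag values (tagLength bytes)
   */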
int32_t sidElemLen = pQueryMsg->tagLength + sizeof(SMeterSidExtInfo);
int32_t size = POINTER_BYTES * pQueryMsg->numOfSids + sidElemLen * pQueryMsg->numOfSids;
pSupporter->pMeterSidExtInfo = (SMeterSidExtInfo **)malloc(size);
if (pSupporter->pMeterSidExtInfo == NULL) {
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
dError("QInfo:%p failed to allocate memory for meterSid info, size:%d, abort", pQInfo, size);
goto _error;
}
char *px = ((char *)pSupporter->pMeterSidExtInfo) + POINTER_BYTES * pQueryMsg->numOfSids;
for (int32_t i = 0; i < pQueryMsg->numOfSids; ++i) {
SMeterSidExtInfo* pSrc = ((SMeterSidExtInfo **)pQueryMsg->pSidExtInfo)[i];
SMeterSidExtInfo* pDst = (SMeterSidExtInfo *)px;
pSupporter->pMeterSidExtInfo[i] = pDst;
pDst->sid = pSrc->sid;
pDst->uid = pSrc->uid;
pDst->key = pSrc->key;
if (pQueryMsg->tagLength > 0) {
memcpy(pDst->tags, pSrc->tags, pQueryMsg->tagLength);
}
px += sidElemLen;
}
if (pGroupbyExpr != NULL && pGroupbyExpr->numOfGroupCols > 0) {
pSupporter->pSidSet =
tSidSetCreate(pSupporter->pMeterSidExtInfo, pQueryMsg->numOfSids, (SSchema *)pQueryMsg->pTagSchema,
pQueryMsg->numOfTagsCols, pGroupbyExpr->columnInfo, pGroupbyExpr->numOfGroupCols);
} else {
pSupporter->pSidSet = tSidSetCreate(pSupporter->pMeterSidExtInfo, pQueryMsg->numOfSids,
(SSchema *)pQueryMsg->pTagSchema, pQueryMsg->numOfTagsCols, NULL, 0);
}
pQInfo->pTableQuerySupporter = pSupporter;
STSBuf *pTSBuf = NULL;
if (pQueryMsg->tsLen > 0) {
// open new file to save the result
char *tsBlock = (char *)pQueryMsg + pQueryMsg->tsOffset;
pTSBuf = tsBufCreateFromCompBlocks(tsBlock, pQueryMsg->tsNumOfBlocks, pQueryMsg->tsLen, pQueryMsg->tsOrder);
tsBufResetPos(pTSBuf);
}
if (((*code) = vnodeSTableQueryPrepare(pQInfo, pQuery, pTSBuf)) != TSDB_CODE_SUCCESS) {
goto _error;
}
vnodeAddRefCount(pQInfo);
if (pQInfo->over == 1) {
return pQInfo;
}
vnodeAddRefCount(pQInfo);
schedMsg.msg = NULL;
schedMsg.thandle = (void *)1;
schedMsg.ahandle = pQInfo;
schedMsg.fp = vnodeMultiMeterQuery;
dTrace("QInfo:%p set query flag and prepare runtime environment completed, wait for schedule", pQInfo);
taosScheduleTask(tsQueryQhandle, &schedMsg);
return pQInfo;
_error:
  // the table query ref will be decreased during error handling
vnodeFreeQInfo(pQInfo, false);
return NULL;
}
/*
 * The engine provides the storage; the application has to save the data before the next retrieve.
 * *numOfRows is set to the number of points retrieved and *rowSize to the size of each row.
 */
int vnodeRetrieveQueryInfo(void *handle, int *numOfRows, int *rowSize, int16_t *timePrec) {
SQInfo *pQInfo;
SQuery *pQuery;
*numOfRows = 0;
*rowSize = 0;
pQInfo = (SQInfo *)handle;
if (pQInfo == NULL) {
return TSDB_CODE_INVALID_QHANDLE;
}
pQuery = &(pQInfo->query);
if (!vnodeIsQInfoValid(pQInfo) || (pQuery->sdata == NULL)) {
dError("QInfo:%p %p retrieve memory is corrupted!!! QInfo:%p, sign:%p, sdata:%p", pQInfo, pQuery, pQInfo,
pQInfo->signature, pQuery->sdata);
return TSDB_CODE_INVALID_QHANDLE;
}
if (pQInfo->killed) {
dTrace("QInfo:%p query is killed, %p, code:%d", pQInfo, pQuery, pQInfo->code);
if (pQInfo->code == TSDB_CODE_SUCCESS) {
return TSDB_CODE_QUERY_CANCELLED;
} else { // in case of not TSDB_CODE_SUCCESS, return the code to client
return abs(pQInfo->code);
}
}
sem_wait(&pQInfo->dataReady);
*numOfRows = pQInfo->pointsRead - pQInfo->pointsReturned;
*rowSize = pQuery->rowSize;
*timePrec = vnodeList[pQInfo->pObj->vnode].cfg.precision;
dTrace("QInfo:%p, retrieve data info completed, precision:%d, rowsize:%d, rows:%d, code:%d", pQInfo, *timePrec,
*rowSize, *numOfRows, pQInfo->code);
if (pQInfo->code < 0) { // less than 0 means there are error existed.
return -pQInfo->code;
}
return TSDB_CODE_SUCCESS;
}
// vnodeRetrieveQueryInfo must be called first
int vnodeSaveQueryResult(void *handle, char *data, int32_t *size) {
SQInfo *pQInfo = (SQInfo *)handle;
  // the remaining number of retrieved rows, not the interpolated result
int numOfRows = pQInfo->pointsRead - pQInfo->pointsReturned;
int32_t numOfFinal = vnodeCopyQueryResultToMsg(pQInfo, data, numOfRows);
pQInfo->pointsReturned += numOfFinal;
dTrace("QInfo:%p %d are returned, totalReturned:%d totalRead:%d", pQInfo, numOfFinal, pQInfo->pointsReturned,
pQInfo->pointsRead);
if (pQInfo->over == 0) {
#ifdef _TD_ARM_
dTrace("QInfo:%p set query flag, sig:%" PRIu64 ", func:vnodeSaveQueryResult", pQInfo, pQInfo->signature);
#else
dTrace("QInfo:%p set query flag, sig:%" PRIu64 ", func:%s", pQInfo, pQInfo->signature, __FUNCTION__);
#endif
if (pQInfo->killed == 1) {
dTrace("%p freed or killed, abort query", pQInfo);
} else {
vnodeAddRefCount(pQInfo);
dTrace("%p add query into task queue for schedule", pQInfo);
SSchedMsg schedMsg = {0};
if (pQInfo->pTableQuerySupporter != NULL) {
if (pQInfo->pTableQuerySupporter->pSidSet == NULL) {
schedMsg.fp = vnodeSingleTableQuery;
} else { // group by tag
schedMsg.fp = vnodeMultiMeterQuery;
}
} else {
pQInfo->bufIndex = pQInfo->bufIndex ^ 1; // exchange between 0 and 1
schedMsg.fp = vnodeQueryData;
}
schedMsg.msg = NULL;
schedMsg.thandle = (void *)1;
schedMsg.ahandle = pQInfo;
taosScheduleTask(tsQueryQhandle, &schedMsg);
}
}
return numOfFinal;
}
static int32_t validateQueryMeterMsg(SQueryMeterMsg *pQueryMsg) {
if (pQueryMsg->intervalTime < 0) {
dError("qmsg:%p illegal value of aggTimeInterval %" PRId64 "", pQueryMsg, pQueryMsg->intervalTime);
return -1;
}
if (pQueryMsg->numOfTagsCols < 0 || pQueryMsg->numOfTagsCols > TSDB_MAX_TAGS + 1) {
dError("qmsg:%p illegal value of numOfTagsCols %d", pQueryMsg, pQueryMsg->numOfTagsCols);
return -1;
}
if (pQueryMsg->numOfCols <= 0 || pQueryMsg->numOfCols > TSDB_MAX_COLUMNS) {
dError("qmsg:%p illegal value of numOfCols %d", pQueryMsg, pQueryMsg->numOfCols);
return -1;
}
if (pQueryMsg->numOfSids <= 0) {
dError("qmsg:%p illegal value of numOfSids %d", pQueryMsg, pQueryMsg->numOfSids);
return -1;
}
if (pQueryMsg->numOfGroupCols < 0) {
dError("qmsg:%p illegal value of numOfGroupbyCols %d", pQueryMsg, pQueryMsg->numOfGroupCols);
return -1;
}
if (pQueryMsg->numOfOutputCols > TSDB_MAX_COLUMNS || pQueryMsg->numOfOutputCols <= 0) {
dError("qmsg:%p illegal value of output columns %d", pQueryMsg, pQueryMsg->numOfOutputCols);
return -1;
}
if (pQueryMsg->tagLength < 0) {
dError("qmsg:%p illegal value of tag length %d", pQueryMsg, pQueryMsg->tagLength);
return -1;
}
return 0;
}
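/*
 * Convert an incoming SQueryMeterMsg from network to host byte order in place and fix up the
 * variable-length sections that follow the fixed header: per-column filters, the SQL function
 * expressions, the sid/tag list, the optional tag schema and group-by columns, and the
 * interpolation default values. pMsg walks through the payload as each section is decoded.
 */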
int32_t vnodeConvertQueryMeterMsg(SQueryMeterMsg *pQueryMsg) {
pQueryMsg->vnode = htons(pQueryMsg->vnode);
pQueryMsg->numOfSids = htonl(pQueryMsg->numOfSids);
#ifdef TSKEY32
pQueryMsg->skey = htonl(pQueryMsg->skey);
pQueryMsg->ekey = htonl(pQueryMsg->ekey);
#else
pQueryMsg->skey = htobe64(pQueryMsg->skey);
pQueryMsg->ekey = htobe64(pQueryMsg->ekey);
#endif
pQueryMsg->order = htons(pQueryMsg->order);
pQueryMsg->orderColId = htons(pQueryMsg->orderColId);
pQueryMsg->queryType = htons(pQueryMsg->queryType);
pQueryMsg->intervalTime = htobe64(pQueryMsg->intervalTime);
pQueryMsg->slidingTime = htobe64(pQueryMsg->slidingTime);
pQueryMsg->numOfTagsCols = htons(pQueryMsg->numOfTagsCols);
pQueryMsg->numOfCols = htons(pQueryMsg->numOfCols);
pQueryMsg->numOfOutputCols = htons(pQueryMsg->numOfOutputCols);
pQueryMsg->numOfGroupCols = htons(pQueryMsg->numOfGroupCols);
pQueryMsg->tagLength = htons(pQueryMsg->tagLength);
pQueryMsg->limit = htobe64(pQueryMsg->limit);
pQueryMsg->offset = htobe64(pQueryMsg->offset);
pQueryMsg->tsOffset = htonl(pQueryMsg->tsOffset);
pQueryMsg->tsLen = htonl(pQueryMsg->tsLen);
pQueryMsg->tsNumOfBlocks = htonl(pQueryMsg->tsNumOfBlocks);
pQueryMsg->tsOrder = htonl(pQueryMsg->tsOrder);
// query msg safety check
if (validateQueryMeterMsg(pQueryMsg) != 0) {
return TSDB_CODE_INVALID_QUERY_MSG;
}
SMeterSidExtInfo **pSids = NULL;
char * pMsg = (char *)(pQueryMsg->colList) + sizeof(SColumnInfo) * pQueryMsg->numOfCols;
for (int32_t col = 0; col < pQueryMsg->numOfCols; ++col) {
pQueryMsg->colList[col].colId = htons(pQueryMsg->colList[col].colId);
pQueryMsg->colList[col].type = htons(pQueryMsg->colList[col].type);
pQueryMsg->colList[col].bytes = htons(pQueryMsg->colList[col].bytes);
pQueryMsg->colList[col].numOfFilters = htons(pQueryMsg->colList[col].numOfFilters);
assert(pQueryMsg->colList[col].type >= TSDB_DATA_TYPE_BOOL && pQueryMsg->colList[col].type <= TSDB_DATA_TYPE_NCHAR);
int32_t numOfFilters = pQueryMsg->colList[col].numOfFilters;
if (numOfFilters > 0) {
pQueryMsg->colList[col].filters = calloc(numOfFilters, sizeof(SColumnFilterInfo));
}
for (int32_t f = 0; f < numOfFilters; ++f) {
SColumnFilterInfo *pFilterInfo = (SColumnFilterInfo *)pMsg;
SColumnFilterInfo *pDestFilterInfo = &pQueryMsg->colList[col].filters[f];
pDestFilterInfo->filterOnBinary = htons(pFilterInfo->filterOnBinary);
pMsg += sizeof(SColumnFilterInfo);
if (pDestFilterInfo->filterOnBinary) {
pDestFilterInfo->len = htobe64(pFilterInfo->len);
pDestFilterInfo->pz = (int64_t)calloc(1, pDestFilterInfo->len + 1);
memcpy((void*)pDestFilterInfo->pz, pMsg, pDestFilterInfo->len + 1);
pMsg += (pDestFilterInfo->len + 1);
} else {
pDestFilterInfo->lowerBndi = htobe64(pFilterInfo->lowerBndi);
pDestFilterInfo->upperBndi = htobe64(pFilterInfo->upperBndi);
}
pDestFilterInfo->lowerRelOptr = htons(pFilterInfo->lowerRelOptr);
pDestFilterInfo->upperRelOptr = htons(pFilterInfo->upperRelOptr);
}
}
bool hasArithmeticFunction = false;
/*
 * 1. for a simple projection query on meters, only the pSqlFuncExprs[i].colIdx value is recorded.
 * 2. for complex queries, the whole SqlExprs object is required.
*/
pQueryMsg->pSqlFuncExprs = (int64_t)malloc(POINTER_BYTES * pQueryMsg->numOfOutputCols);
SSqlFuncExprMsg *pExprMsg = (SSqlFuncExprMsg *)pMsg;
for (int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
((SSqlFuncExprMsg **)pQueryMsg->pSqlFuncExprs)[i] = pExprMsg;
pExprMsg->colInfo.colIdx = htons(pExprMsg->colInfo.colIdx);
pExprMsg->colInfo.colId = htons(pExprMsg->colInfo.colId);
pExprMsg->colInfo.flag = htons(pExprMsg->colInfo.flag);
pExprMsg->functionId = htons(pExprMsg->functionId);
pExprMsg->numOfParams = htons(pExprMsg->numOfParams);
pMsg += sizeof(SSqlFuncExprMsg);
for (int32_t j = 0; j < pExprMsg->numOfParams; ++j) {
pExprMsg->arg[j].argType = htons(pExprMsg->arg[j].argType);
pExprMsg->arg[j].argBytes = htons(pExprMsg->arg[j].argBytes);
if (pExprMsg->arg[j].argType == TSDB_DATA_TYPE_BINARY) {
pExprMsg->arg[j].argValue.pz = pMsg;
        pMsg += pExprMsg->arg[j].argBytes + 1;  // one more byte for the terminating null character
} else {
pExprMsg->arg[j].argValue.i64 = htobe64(pExprMsg->arg[j].argValue.i64);
}
}
if (pExprMsg->functionId == TSDB_FUNC_ARITHM) {
hasArithmeticFunction = true;
} else if (pExprMsg->functionId == TSDB_FUNC_TAG ||
pExprMsg->functionId == TSDB_FUNC_TAGPRJ ||
pExprMsg->functionId == TSDB_FUNC_TAG_DUMMY) {
if (pExprMsg->colInfo.flag != TSDB_COL_TAG) { // ignore the column index check for arithmetic expression.
return TSDB_CODE_INVALID_QUERY_MSG;
}
} else {
if (!vnodeValidateExprColumnInfo(pQueryMsg, pExprMsg)) {
return TSDB_CODE_INVALID_QUERY_MSG;
}
}
pExprMsg = (SSqlFuncExprMsg *)pMsg;
}
pQueryMsg->colNameLen = htonl(pQueryMsg->colNameLen);
if (hasArithmeticFunction) { // column name array
assert(pQueryMsg->colNameLen > 0);
pQueryMsg->colNameList = (int64_t)pMsg;
pMsg += pQueryMsg->colNameLen;
}
pSids = (SMeterSidExtInfo **)calloc(pQueryMsg->numOfSids, sizeof(SMeterSidExtInfo *));
pQueryMsg->pSidExtInfo = (uint64_t)pSids;
pSids[0] = (SMeterSidExtInfo *)pMsg;
pSids[0]->sid = htonl(pSids[0]->sid);
pSids[0]->uid = htobe64(pSids[0]->uid);
pSids[0]->key = htobe64(pSids[0]->key);
for (int32_t j = 1; j < pQueryMsg->numOfSids; ++j) {
pSids[j] = (SMeterSidExtInfo *)((char *)pSids[j - 1] + sizeof(SMeterSidExtInfo) + pQueryMsg->tagLength);
pSids[j]->sid = htonl(pSids[j]->sid);
pSids[j]->uid = htobe64(pSids[j]->uid);
pSids[j]->key = htobe64(pSids[j]->key);
}
pMsg = (char *)pSids[pQueryMsg->numOfSids - 1];
pMsg += sizeof(SMeterSidExtInfo) + pQueryMsg->tagLength;
if (pQueryMsg->numOfGroupCols > 0 || pQueryMsg->numOfTagsCols > 0) { // group by tag columns
pQueryMsg->pTagSchema = (uint64_t)pMsg;
SSchema *pTagSchema = (SSchema *)pQueryMsg->pTagSchema;
pMsg += sizeof(SSchema) * pQueryMsg->numOfTagsCols;
if (pQueryMsg->numOfGroupCols > 0) {
pQueryMsg->groupbyTagIds = (uint64_t) & (pTagSchema[pQueryMsg->numOfTagsCols]);
} else {
pQueryMsg->groupbyTagIds = 0;
}
pQueryMsg->orderByIdx = htons(pQueryMsg->orderByIdx);
pQueryMsg->orderType = htons(pQueryMsg->orderType);
pMsg += sizeof(SColIndexEx) * pQueryMsg->numOfGroupCols;
} else {
pQueryMsg->pTagSchema = 0;
pQueryMsg->groupbyTagIds = 0;
}
pQueryMsg->interpoType = htons(pQueryMsg->interpoType);
if (pQueryMsg->interpoType != TSDB_INTERPO_NONE) {
pQueryMsg->defaultVal = (uint64_t)(pMsg);
int64_t *v = (int64_t *)pMsg;
for (int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
v[i] = htobe64(v[i]);
}
}
dTrace("qmsg:%p query on %d meter(s), qrange:%" PRId64 "-%" PRId64 ", numOfGroupbyTagCols:%d, numOfTagCols:%d, timestamp order:%d, "
"tags order:%d, tags order col:%d, numOfOutputCols:%d, numOfCols:%d, interval:%" PRId64 ", fillType:%d, comptslen:%d, limit:%" PRId64 ", "
"offset:%" PRId64,
pQueryMsg, pQueryMsg->numOfSids, pQueryMsg->skey, pQueryMsg->ekey, pQueryMsg->numOfGroupCols,
pQueryMsg->numOfTagsCols, pQueryMsg->order, pQueryMsg->orderType, pQueryMsg->orderByIdx,
pQueryMsg->numOfOutputCols, pQueryMsg->numOfCols, pQueryMsg->intervalTime, pQueryMsg->interpoType,
pQueryMsg->tsLen, pQueryMsg->limit, pQueryMsg->offset);
return 0;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "dnodeSystem.h"
#include "trpc.h"
#include "ttime.h"
#include "vnode.h"
#include "vnodeStore.h"
#include "vnodeUtil.h"
#include "vnodeStatus.h"
int tsMaxVnode = -1;
int tsOpenVnodes = 0;
SVnodeObj *vnodeList = NULL;
static int vnodeInitStoreVnode(int vnode) {
SVnodeObj *pVnode = vnodeList + vnode;
pVnode->vnode = vnode;
vnodeOpenMetersVnode(vnode);
if (pVnode->cfg.maxSessions <= 0) {
return TSDB_CODE_SUCCESS;
}
pVnode->firstKey = taosGetTimestamp(pVnode->cfg.precision);
pVnode->pCachePool = vnodeOpenCachePool(vnode);
if (pVnode->pCachePool == NULL) {
dError("vid:%d, cache pool init failed.", pVnode->vnode);
return TSDB_CODE_SERV_OUT_OF_MEMORY;
}
if (vnodeInitFile(vnode) != TSDB_CODE_SUCCESS) {
dError("vid:%d, files init failed.", pVnode->vnode);
return TSDB_CODE_VG_INIT_FAILED;
}
if (vnodeInitCommit(vnode) != TSDB_CODE_SUCCESS) {
dError("vid:%d, commit init failed.", pVnode->vnode);
return TSDB_CODE_VG_INIT_FAILED;
}
pthread_mutex_init(&(pVnode->vmutex), NULL);
dPrint("vid:%d, storage initialized, version:%" PRIu64 " fileId:%d numOfFiles:%d", vnode, pVnode->version, pVnode->fileId,
pVnode->numOfFiles);
return TSDB_CODE_SUCCESS;
}
int vnodeOpenVnode(int vnode) {
int32_t code = TSDB_CODE_SUCCESS;
SVnodeObj *pVnode = vnodeList + vnode;
pVnode->vnode = vnode;
pVnode->accessState = TSDB_VN_ALL_ACCCESS;
// vnode is empty
if (pVnode->cfg.maxSessions <= 0) {
return TSDB_CODE_SUCCESS;
}
if (!(pVnode->vnodeStatus == TSDB_VN_STATUS_OFFLINE || pVnode->vnodeStatus == TSDB_VN_STATUS_CREATING)) {
dError("vid:%d, status:%s, cannot enter open operation", vnode, taosGetVnodeStatusStr(pVnode->vnodeStatus));
return TSDB_CODE_INVALID_VNODE_STATUS;
}
dPrint("vid:%d, status:%s, start to open", vnode, taosGetVnodeStatusStr(pVnode->vnodeStatus));
pthread_mutex_lock(&dmutex);
// not enough memory, abort
if ((code = vnodeOpenShellVnode(vnode)) != TSDB_CODE_SUCCESS) {
pthread_mutex_unlock(&dmutex);
return code;
}
vnodeOpenPeerVnode(vnode);
if (vnode > tsMaxVnode) tsMaxVnode = vnode;
vnodeCalcOpenVnodes();
pthread_mutex_unlock(&dmutex);
#ifndef CLUSTER
vnodeOpenStreams(pVnode, NULL);
#endif
dPrint("vid:%d, vnode is opened, openVnodes:%d, status:%s", vnode, tsOpenVnodes, taosGetVnodeStatusStr(pVnode->vnodeStatus));
return TSDB_CODE_SUCCESS;
}
static int32_t vnodeMarkAllMetersDropped(SVnodeObj* pVnode) {
if (pVnode->meterList == NULL) {
return TSDB_CODE_SUCCESS;
}
bool ready = true;
for (int sid = 0; sid < pVnode->cfg.maxSessions; ++sid) {
if (!vnodeIsSafeToDeleteMeter(pVnode, sid)) {
ready = false;
    } else {  // mark this meter as dropped
SMeterObj* pObj = pVnode->meterList[sid];
if (pObj != NULL) {
pObj->state = TSDB_METER_STATE_DROPPED;
}
}
}
return ready? TSDB_CODE_SUCCESS:TSDB_CODE_ACTION_IN_PROGRESS;
}
static int vnodeCloseVnode(int vnode) {
if (vnodeList == NULL) return TSDB_CODE_SUCCESS;
SVnodeObj* pVnode = &vnodeList[vnode];
pthread_mutex_lock(&dmutex);
if (pVnode->cfg.maxSessions == 0) {
pthread_mutex_unlock(&dmutex);
return TSDB_CODE_SUCCESS;
}
if (pVnode->vnodeStatus == TSDB_VN_STATUS_DELETING) {
dPrint("vid:%d, status:%s, another thread performed delete operation", vnode, taosGetVnodeStatusStr(pVnode->vnodeStatus));
return TSDB_CODE_SUCCESS;
} else {
dPrint("vid:%d, status:%s, enter close operation", vnode, taosGetVnodeStatusStr(pVnode->vnodeStatus));
pVnode->vnodeStatus = TSDB_VN_STATUS_CLOSING;
}
  // mark all meters in this vnode as dropped
if (vnodeMarkAllMetersDropped(pVnode) != TSDB_CODE_SUCCESS) {
pthread_mutex_unlock(&dmutex);
return TSDB_CODE_ACTION_IN_PROGRESS;
}
dPrint("vid:%d, status:%s, enter delete operation", vnode, taosGetVnodeStatusStr(pVnode->vnodeStatus));
pVnode->vnodeStatus = TSDB_VN_STATUS_DELETING;
vnodeCloseStream(vnodeList + vnode);
vnodeCancelCommit(vnodeList + vnode);
vnodeClosePeerVnode(vnode);
vnodeCloseMetersVnode(vnode);
vnodeCloseShellVnode(vnode);
vnodeCloseCachePool(vnode);
vnodeCleanUpCommit(vnode);
pthread_mutex_destroy(&(vnodeList[vnode].vmutex));
if (tsMaxVnode == vnode) tsMaxVnode = vnode - 1;
tfree(vnodeList[vnode].meterIndex);
pthread_mutex_unlock(&dmutex);
return TSDB_CODE_SUCCESS;
}
int vnodeCreateVnode(int vnode, SVnodeCfg *pCfg, SVPeerDesc *pDesc) {
char fileName[128];
if (vnodeList[vnode].vnodeStatus != TSDB_VN_STATUS_OFFLINE) {
dError("vid:%d, status:%s, cannot enter create operation", vnode, taosGetVnodeStatusStr(vnodeList[vnode].vnodeStatus));
return TSDB_CODE_INVALID_VNODE_STATUS;
}
vnodeList[vnode].vnodeStatus = TSDB_VN_STATUS_CREATING;
sprintf(fileName, "%s/vnode%d", tsDirectory, vnode);
if (mkdir(fileName, 0755) != 0) {
dError("failed to create vnode:%d directory:%s, errno:%d, reason:%s", vnode, fileName, errno, strerror(errno));
if (errno == EACCES) {
return TSDB_CODE_NO_DISK_PERMISSIONS;
} else if (errno == ENOSPC) {
return TSDB_CODE_SERV_NO_DISKSPACE;
} else if (errno == EEXIST) {
} else {
return TSDB_CODE_VG_INIT_FAILED;
}
}
sprintf(fileName, "%s/vnode%d/db", tsDirectory, vnode);
if (mkdir(fileName, 0755) != 0) {
dError("failed to create vnode:%d directory:%s, errno:%d, reason:%s", vnode, fileName, errno, strerror(errno));
if (errno == EACCES) {
return TSDB_CODE_NO_DISK_PERMISSIONS;
} else if (errno == ENOSPC) {
return TSDB_CODE_SERV_NO_DISKSPACE;
} else if (errno == EEXIST) {
} else {
return TSDB_CODE_VG_INIT_FAILED;
}
}
vnodeList[vnode].cfg = *pCfg;
int code = vnodeCreateMeterObjFile(vnode);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
code = vnodeSaveVnodeCfg(vnode, pCfg, pDesc);
if (code != TSDB_CODE_SUCCESS) {
return TSDB_CODE_VG_INIT_FAILED;
}
code = vnodeInitStoreVnode(vnode);
if (code != TSDB_CODE_SUCCESS) {
return code;
}
return vnodeOpenVnode(vnode);
}
static void vnodeRemoveDataFiles(int vnode) {
char vnodeDir[TSDB_FILENAME_LEN];
char dfilePath[TSDB_FILENAME_LEN];
char linkFile[TSDB_FILENAME_LEN];
struct dirent *de = NULL;
DIR * dir = NULL;
sprintf(vnodeDir, "%s/vnode%d/db", tsDirectory, vnode);
dir = opendir(vnodeDir);
if (dir == NULL) return;
while ((de = readdir(dir)) != NULL) {
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) continue;
if ((strcmp(de->d_name + strlen(de->d_name) - strlen(".head"), ".head") == 0 ||
strcmp(de->d_name + strlen(de->d_name) - strlen(".data"), ".data") == 0 ||
strcmp(de->d_name + strlen(de->d_name) - strlen(".last"), ".last") == 0) &&
(de->d_type & DT_LNK)) {
sprintf(linkFile, "%s/%s", vnodeDir, de->d_name);
if (!vnodeRemoveDataFileFromLinkFile(linkFile, de->d_name)) {
continue;
}
memset(dfilePath, 0, TSDB_FILENAME_LEN);
int tcode = readlink(linkFile, dfilePath, TSDB_FILENAME_LEN);
remove(linkFile);
if (tcode >= 0) {
remove(dfilePath);
dPrint("Data file %s is removed, link file %s", dfilePath, linkFile);
}
} else {
remove(de->d_name);
}
}
closedir(dir);
rmdir(vnodeDir);
sprintf(vnodeDir, "%s/vnode%d/meterObj.v%d", tsDirectory, vnode, vnode);
remove(vnodeDir);
sprintf(vnodeDir, "%s/vnode%d", tsDirectory, vnode);
rmdir(vnodeDir);
dPrint("vid:%d, vnode is removed, status:%s", vnode, taosGetVnodeStatusStr(vnodeList[vnode].vnodeStatus));
}
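/*
 * Drop a vnode. If it is currently creating, offline or deleting, the request
 * is rejected with "action in progress"; otherwise the vnode is closed first
 * and then its data files are removed. A vnode with maxSessions <= 0 is
 * treated as already dropped.
 */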
int vnodeRemoveVnode(int vnode) {
if (vnodeList == NULL) return TSDB_CODE_SUCCESS;
if (vnodeList[vnode].cfg.maxSessions > 0) {
SVnodeObj* pVnode = &vnodeList[vnode];
if (pVnode->vnodeStatus == TSDB_VN_STATUS_CREATING
|| pVnode->vnodeStatus == TSDB_VN_STATUS_OFFLINE
|| pVnode->vnodeStatus == TSDB_VN_STATUS_DELETING) {
dTrace("vid:%d, status:%s, cannot enter close/delete operation", vnode, taosGetVnodeStatusStr(pVnode->vnodeStatus));
return TSDB_CODE_ACTION_IN_PROGRESS;
} else {
int32_t ret = vnodeCloseVnode(vnode);
if (ret != TSDB_CODE_SUCCESS) {
return ret;
}
dTrace("vid:%d, status:%s, do delete operation", vnode, taosGetVnodeStatusStr(pVnode->vnodeStatus));
vnodeRemoveDataFiles(vnode);
}
} else {
dPrint("vid:%d, max sessions:%d, this vnode already dropped!!!", vnode, vnodeList[vnode].cfg.maxSessions);
vnodeList[vnode].cfg.maxSessions = 0; //reset value
vnodeCalcOpenVnodes();
}
return TSDB_CODE_SUCCESS;
}
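/*
 * Allocate and zero the global vnodeList array, then initialize the on-disk
 * store of every vnode slot; any failure aborts the whole initialization.
 */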
int vnodeInitStore() {
int vnode;
int size;
size = sizeof(SVnodeObj) * TSDB_MAX_VNODES;
vnodeList = (SVnodeObj *)malloc(size);
if (vnodeList == NULL) return -1;
memset(vnodeList, 0, size);
if (vnodeInitInfo() < 0) return -1;
for (vnode = 0; vnode < TSDB_MAX_VNODES; ++vnode) {
int code = vnodeInitStoreVnode(vnode);
if (code != TSDB_CODE_SUCCESS) {
// recovery of this vnode from the commit log failed, abort the whole initialization
return -1;
}
}
return 0;
}
int vnodeInitVnodes() {
int vnode;
for (vnode = 0; vnode < TSDB_MAX_VNODES; ++vnode) {
if (vnodeOpenVnode(vnode) < 0) return -1;
}
return 0;
}
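/*
 * Shutdown path for a single vnode: mark it offline, disconnect its peers,
 * trigger a final commit and wait for the commit thread to finish. The static
 * flag ensures the cleanup runs only once per process.
 */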
void vnodeCleanUpOneVnode(int vnode) {
static int again = 0;
if (vnodeList == NULL) return;
pthread_mutex_lock(&dmutex);
if (again) {
pthread_mutex_unlock(&dmutex);
return;
}
again = 1;
if (vnodeList[vnode].pCachePool) {
vnodeList[vnode].vnodeStatus = TSDB_VN_STATUS_OFFLINE;
vnodeClosePeerVnode(vnode);
}
pthread_mutex_unlock(&dmutex);
if (vnodeList[vnode].pCachePool) {
vnodeProcessCommitTimer(vnodeList + vnode, NULL);
while (vnodeList[vnode].commitThread != 0) {
taosMsleep(10);
}
vnodeCleanUpCommit(vnode);
}
}
void vnodeCleanUpVnodes() {
static int again = 0;
if (vnodeList == NULL) return;
pthread_mutex_lock(&dmutex);
if (again) {
pthread_mutex_unlock(&dmutex);
return;
}
again = 1;
for (int vnode = 0; vnode < TSDB_MAX_VNODES; ++vnode) {
if (vnodeList[vnode].pCachePool) {
vnodeList[vnode].vnodeStatus = TSDB_VN_STATUS_OFFLINE;
vnodeClosePeerVnode(vnode);
}
}
pthread_mutex_unlock(&dmutex);
for (int vnode = 0; vnode < TSDB_MAX_VNODES; ++vnode) {
if (vnodeList[vnode].pCachePool) {
vnodeProcessCommitTimer(vnodeList + vnode, NULL);
while (vnodeList[vnode].commitThread != 0) {
taosMsleep(10);
}
vnodeCleanUpCommit(vnode);
}
}
}
void vnodeCalcOpenVnodes() {
int openVnodes = 0;
for (int vnode = 0; vnode <= tsMaxVnode; ++vnode) {
if (vnodeList[vnode].cfg.maxSessions <= 0) continue;
openVnodes++;
}
atomic_store_32(&tsOpenVnodes, openVnodes);
}
void vnodeUpdateHeadFile(int vnode, int oldTables, int newTables) {
//todo rewrite the head file with newTables
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include <stdbool.h>
int vnodeInitInfo() { return 0; }
bool vnodeRemoveDataFileFromLinkFile(char* linkFile, char* de_name) { return true; }
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "taosmsg.h"
#include "vnode.h"
#include "vnodeUtil.h"
#include "vnodeStatus.h"
/* static TAOS *dbConn = NULL; */
void vnodeCloseStreamCallback(void *param);
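/*
 * Callback invoked when a continuous query (stream) produces a result row:
 * the row is packed into a single-row SSubmitMsg (nchar columns are converted
 * from UTF-8 to UCS-4) and written back into the meter via vnodeInsertPoints().
 */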
void vnodeProcessStreamRes(void *param, TAOS_RES *tres, TAOS_ROW row) {
SMeterObj *pObj = (SMeterObj *)param;
dTrace("vid:%d sid:%d id:%s, stream result is ready", pObj->vnode, pObj->sid, pObj->meterId);
// construct data
int32_t contLen = pObj->bytesPerPoint;
char * pTemp = calloc(1, sizeof(SSubmitMsg) + pObj->bytesPerPoint + sizeof(SVMsgHeader));
SSubmitMsg *pMsg = (SSubmitMsg *)(pTemp + sizeof(SVMsgHeader));
pMsg->numOfRows = htons(1);
char ncharBuf[TSDB_MAX_BYTES_PER_ROW] = {0};
int32_t offset = 0;
for (int32_t i = 0; i < pObj->numOfColumns; ++i) {
char *dst = row[i];
if (dst == NULL) {
setNull(pMsg->payLoad + offset, pObj->schema[i].type, pObj->schema[i].bytes);
} else {
// the nchar value must be converted from UTF-8 to unicode (UCS-4) before storing
if (pObj->schema[i].type == TSDB_DATA_TYPE_NCHAR) {
taosMbsToUcs4(row[i], pObj->schema[i].bytes, ncharBuf, TSDB_MAX_BYTES_PER_ROW);
dst = ncharBuf;
}
memcpy(pMsg->payLoad + offset, dst, pObj->schema[i].bytes);
}
offset += pObj->schema[i].bytes;
}
contLen += sizeof(SSubmitMsg);
int32_t numOfPoints = 0;
int32_t code = vnodeInsertPoints(pObj, (char *)pMsg, contLen, TSDB_DATA_SOURCE_SHELL, NULL, pObj->sversion,
&numOfPoints, taosGetTimestamp(vnodeList[pObj->vnode].cfg.precision));
if (code != TSDB_CODE_SUCCESS) {
dError("vid:%d sid:%d id:%s, failed to insert continuous query results", pObj->vnode, pObj->sid, pObj->meterId);
}
assert(numOfPoints >= 0 && numOfPoints <= 1);
tfree(pTemp);
}
static void vnodeGetDBFromMeterId(SMeterObj *pObj, char *db) {
char *st = strstr(pObj->meterId, ".");
char *end = strstr(st + 1, ".");
memcpy(db, st + 1, end - (st + 1));
}
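/*
 * Timer callback that (re)opens all continuous queries of a vnode. A shared
 * client connection (dbConn) is established lazily using the internal user
 * "_<acct>"; if the connection cannot be established, the timer is re-armed
 * and the routine retries one second later.
 */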
void vnodeOpenStreams(void *param, void *tmrId) {
SVnodeObj *pVnode = (SVnodeObj *)param;
SMeterObj *pObj;
if (pVnode->streamRole == TSDB_VN_STREAM_STATUS_STOP) return;
if (pVnode->meterList == NULL) return;
taosTmrStopA(&pVnode->streamTimer);
pVnode->streamTimer = NULL;
for (int sid = 0; sid < pVnode->cfg.maxSessions; ++sid) {
pObj = pVnode->meterList[sid];
if (pObj == NULL || pObj->sqlLen == 0 || vnodeIsMeterState(pObj, TSDB_METER_STATE_DROPPING)) continue;
dTrace("vid:%d sid:%d id:%s, open stream:%s", pObj->vnode, sid, pObj->meterId, pObj->pSql);
if (pVnode->dbConn == NULL) {
char db[64] = {0};
char user[64] = {0};
vnodeGetDBFromMeterId(pObj, db);
sprintf(user, "_%s", pVnode->cfg.acct);
pVnode->dbConn = taos_connect(NULL, user, tsInternalPass, db, 0);
}
if (pVnode->dbConn == NULL) {
dError("vid:%d, failed to connect to mgmt node", pVnode->vnode);
taosTmrReset(vnodeOpenStreams, 1000, param, vnodeTmrCtrl, &pVnode->streamTimer);
return;
}
if (pObj->pStream == NULL) {
pObj->pStream = taos_open_stream(pVnode->dbConn, pObj->pSql, vnodeProcessStreamRes, pObj->lastKey, pObj,
vnodeCloseStreamCallback);
if (pObj->pStream) pVnode->numOfStreams++;
}
}
}
void vnodeCreateStream(SMeterObj *pObj) {
if (pObj->sqlLen <= 0) return;
SVnodeObj *pVnode = vnodeList + pObj->vnode;
if (pVnode->streamRole == TSDB_VN_STREAM_STATUS_STOP) return;
if (pObj->pStream) return;
dTrace("vid:%d sid:%d id:%s stream:%s is created", pObj->vnode, pObj->sid, pObj->meterId, pObj->pSql);
if (pVnode->dbConn == NULL) {
if (pVnode->streamTimer == NULL) taosTmrReset(vnodeOpenStreams, 1000, pVnode, vnodeTmrCtrl, &pVnode->streamTimer);
} else {
pObj->pStream = taos_open_stream(pVnode->dbConn, pObj->pSql, vnodeProcessStreamRes, pObj->lastKey, pObj,
vnodeCloseStreamCallback);
if (pObj->pStream) pVnode->numOfStreams++;
}
}
// Close only one stream
void vnodeRemoveStream(SMeterObj *pObj) {
SVnodeObj *pVnode = vnodeList + pObj->vnode;
if (pObj->sqlLen <= 0) return;
if (pObj->pStream) {
taos_close_stream(pObj->pStream);
pVnode->numOfStreams--;
}
pObj->pStream = NULL;
if (pVnode->numOfStreams == 0) {
taos_close(pVnode->dbConn);
pVnode->dbConn = NULL;
}
dTrace("vid:%d sid:%d id:%d stream is removed", pObj->vnode, pObj->sid, pObj->meterId);
}
// Close all streams in a vnode
void vnodeCloseStream(SVnodeObj *pVnode) {
SMeterObj *pObj;
dPrint("vid:%d, stream is closed, old role %s", pVnode->vnode, taosGetVnodeStreamStatusStr(pVnode->streamRole));
// stop stream computing
for (int sid = 0; sid < pVnode->cfg.maxSessions; ++sid) {
pObj = pVnode->meterList[sid];
if (pObj == NULL) continue;
if (pObj->sqlLen > 0 && pObj->pStream) {
taos_close_stream(pObj->pStream);
pVnode->numOfStreams--;
}
pObj->pStream = NULL;
}
}
void vnodeUpdateStreamRole(SVnodeObj *pVnode) {
/* SMeterObj *pObj; */
int newRole = (pVnode->vnodeStatus == TSDB_VN_STATUS_MASTER) ? TSDB_VN_STREAM_STATUS_START : TSDB_VN_STREAM_STATUS_STOP;
if (newRole != pVnode->streamRole) {
dPrint("vid:%d, stream role is changed from %s to %s",
pVnode->vnode, taosGetVnodeStreamStatusStr(pVnode->streamRole), taosGetVnodeStreamStatusStr(newRole));
pVnode->streamRole = newRole;
if (newRole == TSDB_VN_STREAM_STATUS_START) {
vnodeOpenStreams(pVnode, NULL);
} else {
vnodeCloseStream(pVnode);
}
} else {
dPrint("vid:%d, stream role is keep to %s", pVnode->vnode, taosGetVnodeStreamStatusStr(pVnode->streamRole));
}
}
// Callback function called from client
void vnodeCloseStreamCallback(void *param) {
SMeterObj *pTable = (SMeterObj *)param;
SVnodeObj *pVnode = NULL;
if (pTable == NULL || pTable->sqlLen == 0) return;
pVnode = vnodeList + pTable->vnode;
pTable->sqlLen = 0;
pTable->pSql = NULL;
pTable->pStream = NULL;
pVnode->numOfStreams--;
if (pVnode->numOfStreams == 0) {
taos_close(pVnode->dbConn);
pVnode->dbConn = NULL;
}
vnodeSaveMeterObjToFile(pTable);
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "mnode.h"
#include "os.h"
#include "qast.h"
#include "qextbuffer.h"
#include "tschemautil.h"
#include "tsqlfunction.h"
typedef struct SSyntaxTreeFilterSupporter {
SSchema* pTagSchema;
int32_t numOfTags;
int32_t optr;
} SSyntaxTreeFilterSupporter;
typedef struct SJoinSupporter {
void** val;
void** pTabObjs;
int32_t size;
int16_t type;
int16_t colIndex;
void** qualMeterObj;
int32_t qualSize;
} SJoinSupporter;
typedef struct SMeterNameFilterSupporter {
SPatternCompareInfo info;
char* pattern;
} SMeterNameFilterSupporter;
static void tansformQueryResult(tQueryResultset* pRes);
static bool tSkipListNodeFilterCallback(const void *pNode, void *param);
static int32_t tabObjVGIDComparator(const void* pLeft, const void* pRight) {
STabObj* p1 = *(STabObj**)pLeft;
STabObj* p2 = *(STabObj**)pRight;
int32_t ret = p1->gid.vgId - p2->gid.vgId;
if (ret == 0) {
return ret;
} else {
return ret > 0 ? 1 : -1;
}
}
// sort by monotonically increasing memory address
static int32_t tabObjPointerComparator(const void* pLeft, const void* pRight) {
int64_t ret = (*(STabObj**)(pLeft))->uid - (*(STabObj**)(pRight))->uid;
if (ret == 0) {
return 0;
} else {
return ret > 0 ? 1 : -1;
}
}
static int32_t tabObjResultComparator(const void* p1, const void* p2, void* param) {
tOrderDescriptor* pOrderDesc = (tOrderDescriptor*)param;
STabObj* pNode1 = (STabObj*)p1;
STabObj* pNode2 = (STabObj*)p2;
for (int32_t i = 0; i < pOrderDesc->orderIdx.numOfCols; ++i) {
int32_t colIdx = pOrderDesc->orderIdx.pData[i];
char* f1 = NULL;
char* f2 = NULL;
SSchema schema = {0};
if (colIdx == -1) {
f1 = pNode1->meterId;
f2 = pNode2->meterId;
schema.type = TSDB_DATA_TYPE_BINARY;
schema.bytes = TSDB_TABLE_ID_LEN;
} else {
f1 = mgmtMeterGetTag(pNode1, colIdx, NULL);
f2 = mgmtMeterGetTag(pNode2, colIdx, &schema);
SSchema* pSchema = getColumnModelSchema(pOrderDesc->pColumnModel, colIdx);
assert(schema.type == pSchema->type);
}
int32_t ret = doCompare(f1, f2, schema.type, schema.bytes);
if (ret == 0) {
continue;
} else {
return ret;
}
}
return 0;
}
/**
* update the tag order index according to the tags column index. The tags column index needs to be checked one-by-one,
* since the normal columns may be passed to server for handling the group by on status column.
*
* @param pSuperTableMetaMsg
* @param tableIndex
* @param pOrderIndexInfo
* @param numOfTags
*/
static void mgmtUpdateOrderTagColIndex(SSuperTableMetaMsg* pSuperTableMetaMsg, int32_t tableIndex, SColumnOrderInfo* pOrderIndexInfo,
int32_t numOfTags) {
SMetricMetaElemMsg* pElem = (SMetricMetaElemMsg*)((char*)pSuperTableMetaMsg + pSuperTableMetaMsg->metaElem[tableIndex]);
SColIndexEx* groupColumnList = (SColIndexEx*)((char*)pSuperTableMetaMsg + pElem->groupbyTagColumnList);
int32_t numOfGroupbyTags = 0;
for (int32_t i = 0; i < pElem->numOfGroupCols; ++i) {
if (groupColumnList[i].flag == TSDB_COL_TAG) { // keep only tag columns; normal columns are ignored
pOrderIndexInfo->pData[numOfGroupbyTags++] = groupColumnList[i].colIdx;
assert(groupColumnList[i].colIdx < numOfTags);
}
}
pOrderIndexInfo->numOfCols = numOfGroupbyTags;
}
// todo: merge this sort routine with the loser-tree based sort
void mgmtReorganizeMetersInMetricMeta(SSuperTableMetaMsg* pSuperTableMetaMsg, int32_t tableIndex, tQueryResultset* pRes) {
if (pRes->num <= 0) { // no result, no need for pagination
return;
}
SMetricMetaElemMsg* pElem = (SMetricMetaElemMsg*)((char*)pSuperTableMetaMsg + pSuperTableMetaMsg->metaElem[tableIndex]);
STabObj* pMetric = mgmtGetTable(pElem->meterId);
SSchema* pTagSchema = (SSchema*)(pMetric->schema + pMetric->numOfColumns * sizeof(SSchema));
/*
* To apply the group limitation and group offset, we should sort the result
* list according to the order condition
*/
tOrderDescriptor* descriptor =
(tOrderDescriptor*)calloc(1, sizeof(tOrderDescriptor) + sizeof(int32_t) * pElem->numOfGroupCols);
descriptor->pColumnModel = createColumnModel(pTagSchema, pMetric->numOfTags, 1);
descriptor->orderIdx.numOfCols = pElem->numOfGroupCols;
int32_t* startPos = NULL;
int32_t numOfSubset = 1;
mgmtUpdateOrderTagColIndex(pSuperTableMetaMsg, tableIndex, &descriptor->orderIdx, pMetric->numOfTags);
if (descriptor->orderIdx.numOfCols > 0) {
tQSortEx(pRes->pRes, POINTER_BYTES, 0, pRes->num - 1, descriptor, tabObjResultComparator);
startPos = calculateSubGroup(pRes->pRes, pRes->num, &numOfSubset, descriptor, tabObjResultComparator);
} else {
startPos = malloc(2 * sizeof(int32_t));
startPos[0] = 0;
startPos[1] = (int32_t)pRes->num;
}
/*
* sort the result according to vgid to ensure meters with the same vgid are
* contiguous in the result list
*/
qsort(pRes->pRes, (size_t)pRes->num, POINTER_BYTES, tabObjVGIDComparator);
free(descriptor->pColumnModel);
free(descriptor);
free(startPos);
}
static void mgmtRetrieveByMeterName(tQueryResultset* pRes, char* str, STabObj* pMetric) {
const char* sep = ",";
char* pToken = NULL;
int32_t s = 4; // initial size
pRes->pRes = malloc(sizeof(char*) * s);
pRes->num = 0;
for (pToken = strsep(&str, sep); pToken != NULL; pToken = strsep(&str, sep)) {
STabObj* pMeterObj = mgmtGetTable(pToken);
if (pMeterObj == NULL) {
mWarn("metric:%s error in metric query expression, invalid meter id:%s", pMetric->meterId, pToken);
continue;
}
if (pRes->num >= s) {
s += (s >> 1); // grow the buffer by 50%
pRes->pRes = realloc(pRes->pRes, sizeof(char*) * s);
}
/* not a table created from metric, ignore */
if (pMeterObj->tableType != TSDB_TABLE_TYPE_CHILD_TABLE) {
continue;
}
/*
* the queried meter does not belong to this metric, so ignore it; both the
* meterId and the uid of the parent metric are compared
*/
STabObj* parentMetric = mgmtGetTable(pMeterObj->pTagData);
if (strncasecmp(parentMetric->meterId, pMetric->meterId, TSDB_TABLE_ID_LEN) != 0 ||
(parentMetric->uid != pMetric->uid)) {
continue;
}
pRes->pRes[pRes->num++] = pMeterObj;
}
}
static bool mgmtTablenameFilterCallback(tSkipListNode* pNode, void* param) {
SMeterNameFilterSupporter* pSupporter = (SMeterNameFilterSupporter*)param;
char name[TSDB_TABLE_ID_LEN] = {0};
// pattern compare for meter name
STabObj* pMeterObj = (STabObj*)pNode->pData;
extractTableName(pMeterObj->meterId, name);
return patternMatch(pSupporter->pattern, name, TSDB_TABLE_ID_LEN, &pSupporter->info) == TSDB_PATTERN_MATCH;
}
static void mgmtRetrieveFromLikeOptr(tQueryResultset* pRes, const char* str, STabObj* pMetric) {
SPatternCompareInfo info = PATTERN_COMPARE_INFO_INITIALIZER;
SMeterNameFilterSupporter supporter = {info, (char*) str};
pRes->num =
tSkipListIterateList(pMetric->pSkipList, (tSkipListNode***)&pRes->pRes, mgmtTablenameFilterCallback, &supporter);
}
static void mgmtFilterByTableNameCond(tQueryResultset* pRes, char* condStr, int32_t len, STabObj* pMetric) {
pRes->num = 0;
if (len <= 0) {
return;
}
char* str = calloc(1, (size_t)len + 1);
memcpy(str, condStr, len);
if (strncasecmp(condStr, QUERY_COND_REL_PREFIX_IN, QUERY_COND_REL_PREFIX_IN_LEN) == 0) { // handle in expression
mgmtRetrieveByMeterName(pRes, str + QUERY_COND_REL_PREFIX_IN_LEN, pMetric);
} else { // handle like expression
assert(strncasecmp(str, QUERY_COND_REL_PREFIX_LIKE, QUERY_COND_REL_PREFIX_LIKE_LEN) == 0);
mgmtRetrieveFromLikeOptr(pRes, str + QUERY_COND_REL_PREFIX_LIKE_LEN, pMetric);
tansformQueryResult(pRes);
}
free(str);
}
UNUSED_FUNC static bool mgmtJoinFilterCallback(tSkipListNode* pNode, void* param) {
SJoinSupporter* pSupporter = (SJoinSupporter*)param;
SSchema s = {0};
char* v = mgmtTableGetTag((STabObj*)pNode->pData, pSupporter->colIndex, &s);
for (int32_t i = 0; i < pSupporter->size; ++i) {
int32_t ret = doCompare(v, pSupporter->val[i], pSupporter->type, s.bytes);
if (ret == 0) {
pSupporter->qualMeterObj[pSupporter->qualSize++] = pSupporter->pTabObjs[i];
/*
* Once a value is qualified according to the join condition, it is removed from the
* candidate list, as well as its corresponding meter object.
*
* The last element does not need to be moved forward.
*/
if (i < pSupporter->size - 1) {
memmove(pSupporter->val[i], pSupporter->val[i + 1], pSupporter->size - (i + 1));
}
pSupporter->size -= 1;
return true;
}
}
return false;
}
static void orderResult(SSuperTableMetaMsg* pSuperTableMetaMsg, tQueryResultset* pRes, int16_t colIndex, int32_t tableIndex) {
SMetricMetaElemMsg* pElem = (SMetricMetaElemMsg*)((char*)pSuperTableMetaMsg + pSuperTableMetaMsg->metaElem[tableIndex]);
tOrderDescriptor* descriptor =
(tOrderDescriptor*)calloc(1, sizeof(tOrderDescriptor) + sizeof(int32_t) * 1); // only one column for join
STabObj* pMetric = mgmtGetTable(pElem->meterId);
SSchema* pTagSchema = (SSchema*)(pMetric->schema + pMetric->numOfColumns * sizeof(SSchema));
descriptor->pColumnModel = createColumnModel(pTagSchema, pMetric->numOfTags, 1);
descriptor->orderIdx.pData[0] = colIndex;
descriptor->orderIdx.numOfCols = 1;
// sort results list
tQSortEx(pRes->pRes, POINTER_BYTES, 0, pRes->num - 1, descriptor, tabObjResultComparator);
free(descriptor->pColumnModel);
free(descriptor);
}
// check for duplicate join tags
static int32_t mgmtCheckForDuplicateTagValue(tQueryResultset* pRes, int32_t index, int32_t tagCol) {
SSchema s = {0};
for (int32_t k = 1; k < pRes[index].num; ++k) {
STabObj* pObj1 = pRes[index].pRes[k - 1];
STabObj* pObj2 = pRes[index].pRes[k];
char* val1 = mgmtTableGetTag(pObj1, tagCol, &s);
char* val2 = mgmtTableGetTag(pObj2, tagCol, NULL);
if (doCompare(val1, val2, s.type, s.bytes) == 0) {
return TSDB_CODE_DUPLICATE_TAGS;
}
}
return TSDB_CODE_SUCCESS;
}
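/*
 * Tag join between two super tables: both result sets are sorted on the join
 * tag column, duplicate tag values are rejected, and a sort-merge pass keeps
 * only the table pairs whose tag values match, so both result sets end up
 * with the same number of qualified tables.
 */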
int32_t mgmtDoJoin(SSuperTableMetaMsg* pSuperTableMetaMsg, tQueryResultset* pRes) {
if (pSuperTableMetaMsg->numOfMeters == 1) {
return TSDB_CODE_SUCCESS;
}
bool allEmpty = false;
for (int32_t i = 0; i < pSuperTableMetaMsg->numOfMeters; ++i) {
if (pRes[i].num == 0) { // all results are empty if one of them is empty
allEmpty = true;
break;
}
}
if (allEmpty) {
for (int32_t i = 0; i < pSuperTableMetaMsg->numOfMeters; ++i) {
pRes[i].num = 0;
tfree(pRes[i].pRes);
}
return TSDB_CODE_SUCCESS;
}
char* cond = (char*)pSuperTableMetaMsg + pSuperTableMetaMsg->join;
char left[TSDB_TABLE_ID_LEN + 1] = {0};
strcpy(left, cond);
int16_t leftTagColIndex = *(int16_t*)(cond + TSDB_TABLE_ID_LEN);
char right[TSDB_TABLE_ID_LEN + 1] = {0};
strcpy(right, cond + TSDB_TABLE_ID_LEN + sizeof(int16_t));
int16_t rightTagColIndex = *(int16_t*)(cond + TSDB_TABLE_ID_LEN * 2 + sizeof(int16_t));
STabObj* pLeftMetric = mgmtGetTable(left);
STabObj* pRightMetric = mgmtGetTable(right);
// decide which result set belongs to the left table and which to the right
int32_t leftIndex = 0;
int32_t rightIndex = 0;
for (int32_t i = 0; i < pSuperTableMetaMsg->numOfMeters; ++i) {
STabObj* pObj = (STabObj*)pRes[i].pRes[0];
STabObj* pMetric1 = mgmtGetTable(pObj->pTagData);
if (pMetric1 == pLeftMetric) {
leftIndex = i;
} else if (pMetric1 == pRightMetric) {
rightIndex = i;
}
}
orderResult(pSuperTableMetaMsg, &pRes[leftIndex], leftTagColIndex, leftIndex);
orderResult(pSuperTableMetaMsg, &pRes[rightIndex], rightTagColIndex, rightIndex);
int32_t i = 0;
int32_t j = 0;
SSchema s = {0};
int32_t res = 0;
// check for duplicated tag values
int32_t ret1 = mgmtCheckForDuplicateTagValue(pRes, leftIndex, leftTagColIndex);
int32_t ret2 = mgmtCheckForDuplicateTagValue(pRes, rightIndex, rightTagColIndex);
if (ret1 != TSDB_CODE_SUCCESS || ret2 != TSDB_CODE_SUCCESS) {
return (ret1 != TSDB_CODE_SUCCESS) ? ret1 : ret2;
}
while (i < pRes[leftIndex].num && j < pRes[rightIndex].num) {
STabObj* pLeftObj = pRes[leftIndex].pRes[i];
STabObj* pRightObj = pRes[rightIndex].pRes[j];
char* v1 = mgmtTableGetTag(pLeftObj, leftTagColIndex, &s);
char* v2 = mgmtTableGetTag(pRightObj, rightTagColIndex, NULL);
int32_t ret = doCompare(v1, v2, s.type, s.bytes);
if (ret == 0) { // qualified
pRes[leftIndex].pRes[res] = pRes[leftIndex].pRes[i++];
pRes[rightIndex].pRes[res] = pRes[rightIndex].pRes[j++];
res++;
} else if (ret < 0) {
i++;
} else {
j++;
}
}
pRes[leftIndex].num = res;
pRes[rightIndex].num = res;
return TSDB_CODE_SUCCESS;
}
/**
* convert the result pointer to STabObj instead of tSkipListNode
* @param pRes
*/
static void tansformQueryResult(tQueryResultset* pRes) {
if (pRes == NULL || pRes->num == 0) {
return;
}
for (int32_t i = 0; i < pRes->num; ++i) {
pRes->pRes[i] = ((tSkipListNode*)(pRes->pRes[i]))->pData;
}
}
static tQueryResultset* doNestedLoopIntersect(tQueryResultset* pRes1, tQueryResultset* pRes2) {
int32_t num = 0;
void** pResult = pRes1->pRes;
for (int32_t i = 0; i < pRes1->num; ++i) {
for (int32_t j = 0; j < pRes2->num; ++j) {
if (pRes1->pRes[i] == pRes2->pRes[j]) {
pResult[num++] = pRes1->pRes[i];
break;
}
}
}
tQueryResultClean(pRes2);
memset(pRes1->pRes + num, 0, sizeof(void*) * (pRes1->num - num));
pRes1->num = num;
return pRes1;
}
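/*
 * Intersect two result sets by sorting both on the table object pointer and
 * merging them; the intersection is kept in pRes1 and pRes2 is cleaned up.
 */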
static tQueryResultset* doSortIntersect(tQueryResultset* pRes1, tQueryResultset* pRes2) {
size_t sizePtr = sizeof(void *);
qsort(pRes1->pRes, pRes1->num, sizePtr, tabObjPointerComparator);
qsort(pRes2->pRes, pRes2->num, sizePtr, tabObjPointerComparator);
int32_t i = 0;
int32_t j = 0;
int32_t num = 0;
while (i < pRes1->num && j < pRes2->num) {
if (pRes1->pRes[i] == pRes2->pRes[j]) {
j++;
pRes1->pRes[num++] = pRes1->pRes[i++];
} else if (pRes1->pRes[i] < pRes2->pRes[j]) {
i++;
} else {
j++;
}
}
tQueryResultClean(pRes2);
memset(pRes1->pRes + num, 0, sizeof(void*) * (pRes1->num - num));
pRes1->num = num;
return pRes1;
}
static void queryResultIntersect(tQueryResultset* pFinalRes, tQueryResultset* pRes) {
const int32_t NUM_OF_RES_THRESHOLD = 20;
// for small result, use nested loop join
if (pFinalRes->num <= NUM_OF_RES_THRESHOLD && pRes->num <= NUM_OF_RES_THRESHOLD) {
doNestedLoopIntersect(pFinalRes, pRes);
} else { // for larger result, sort merge is employed
doSortIntersect(pFinalRes, pRes);
}
}
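/*
 * Union of two result sets: the lists are concatenated, sorted by pointer and
 * de-duplicated in place; pRes is cleaned up after being merged into
 * pFinalRes.
 */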
static void queryResultUnion(tQueryResultset* pFinalRes, tQueryResultset* pRes) {
if (pRes->num == 0) {
tQueryResultClean(pRes);
return;
}
int32_t total = pFinalRes->num + pRes->num;
void* tmp = realloc(pFinalRes->pRes, total * POINTER_BYTES);
if (tmp == NULL) {
return;
}
pFinalRes->pRes = tmp;
memcpy(&pFinalRes->pRes[pFinalRes->num], pRes->pRes, POINTER_BYTES * pRes->num);
qsort(pFinalRes->pRes, total, POINTER_BYTES, tabObjPointerComparator);
int32_t num = 1;
for (int32_t i = 1; i < total; ++i) {
if (pFinalRes->pRes[i] != pFinalRes->pRes[i - 1]) {
pFinalRes->pRes[num++] = pFinalRes->pRes[i];
}
}
if (num < total) {
memset(&pFinalRes->pRes[num], 0, POINTER_BYTES * (total - num));
}
pFinalRes->num = num;
tQueryResultClean(pRes);
}
static int32_t compareIntVal(const void* pLeft, const void* pRight) {
DEFAULT_COMP(GET_INT64_VAL(pLeft), GET_INT64_VAL(pRight));
}
static int32_t compareIntDoubleVal(const void* pLeft, const void* pRight) {
DEFAULT_COMP(GET_INT64_VAL(pLeft), GET_DOUBLE_VAL(pRight));
}
static int32_t compareDoubleVal(const void* pLeft, const void* pRight) {
DEFAULT_COMP(GET_DOUBLE_VAL(pLeft), GET_DOUBLE_VAL(pRight));
}
static int32_t compareDoubleIntVal(const void* pLeft, const void* pRight) {
double ret = (*(double*)pLeft) - (*(int64_t*)pRight);
if (fabs(ret) < DBL_EPSILON) {
return 0;
} else {
return ret > 0 ? 1 : -1;
}
}
static int32_t compareStrVal(const void* pLeft, const void* pRight) {
int32_t ret = strcmp(pLeft, pRight);
if (ret == 0) {
return 0;
} else {
return ret > 0 ? 1 : -1;
}
}
static int32_t compareWStrVal(const void* pLeft, const void* pRight) {
int32_t ret = wcscmp(pLeft, pRight);
if (ret == 0) {
return 0;
} else {
return ret > 0 ? 1 : -1;
}
}
static int32_t compareStrPatternComp(const void* pLeft, const void* pRight) {
SPatternCompareInfo pInfo = {'%', '_'};
const char* pattern = pRight;
const char* str = pLeft;
int32_t ret = patternMatch(pattern, str, strlen(str), &pInfo);
return (ret == TSDB_PATTERN_MATCH) ? 0 : 1;
}
static int32_t compareWStrPatternComp(const void* pLeft, const void* pRight) {
SPatternCompareInfo pInfo = {'%', '_'};
const wchar_t* pattern = pRight;
const wchar_t* str = pLeft;
int32_t ret = WCSPatternMatch(pattern, str, wcslen(str), &pInfo);
return (ret == TSDB_PATTERN_MATCH) ? 0 : 1;
}
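/*
 * Select the comparator for tag filtering according to the column type, the
 * type of the filter value and the operator; LIKE uses the pattern-matching
 * comparators for binary/nchar columns.
 */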
static __compar_fn_t getFilterComparator(int32_t type, int32_t filterType, int32_t optr) {
__compar_fn_t comparator = NULL;
switch (type) {
case TSDB_DATA_TYPE_TINYINT:
case TSDB_DATA_TYPE_SMALLINT:
case TSDB_DATA_TYPE_INT:
case TSDB_DATA_TYPE_BIGINT:
case TSDB_DATA_TYPE_BOOL: {
if (filterType >= TSDB_DATA_TYPE_BOOL && filterType <= TSDB_DATA_TYPE_BIGINT) {
comparator = compareIntVal;
} else if (filterType >= TSDB_DATA_TYPE_FLOAT && filterType <= TSDB_DATA_TYPE_DOUBLE) {
comparator = compareIntDoubleVal;
}
break;
}
case TSDB_DATA_TYPE_FLOAT:
case TSDB_DATA_TYPE_DOUBLE: {
if (filterType >= TSDB_DATA_TYPE_BOOL && filterType <= TSDB_DATA_TYPE_BIGINT) {
comparator = compareDoubleIntVal;
} else if (filterType >= TSDB_DATA_TYPE_FLOAT && filterType <= TSDB_DATA_TYPE_DOUBLE) {
comparator = compareDoubleVal;
}
break;
}
case TSDB_DATA_TYPE_BINARY: {
assert(filterType == TSDB_DATA_TYPE_BINARY);
if (optr == TSDB_RELATION_LIKE) { /* wildcard query using like operator */
comparator = compareStrPatternComp;
} else { /* normal relational comparator */
comparator = compareStrVal;
}
break;
}
case TSDB_DATA_TYPE_NCHAR: {
assert(filterType == TSDB_DATA_TYPE_NCHAR);
if (optr == TSDB_RELATION_LIKE) {
comparator = compareWStrPatternComp;
} else {
comparator = compareWStrVal;
}
break;
}
default:
comparator = compareIntVal;
break;
}
return comparator;
}
static void getTagColumnInfo(SSyntaxTreeFilterSupporter* pSupporter, SSchema* pSchema, int32_t* index,
int32_t* offset) {
*index = 0;
*offset = 0;
// filter on table name(TBNAME)
if (strcasecmp(pSchema->name, TSQL_TBNAME_L) == 0) {
*index = TSDB_TBNAME_COLUMN_INDEX;
*offset = TSDB_TBNAME_COLUMN_INDEX;
return;
}
while ((*index) < pSupporter->numOfTags) {
if (pSupporter->pTagSchema[*index].bytes == pSchema->bytes &&
pSupporter->pTagSchema[*index].type == pSchema->type &&
strcmp(pSupporter->pTagSchema[*index].name, pSchema->name) == 0) {
break;
} else {
(*offset) += pSupporter->pTagSchema[(*index)++].bytes;
}
}
}
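/*
 * Attach a tQueryInfo to a binary-expression node: locate the referenced tag
 * column in the tag schema, select the comparator and convert the constant
 * operand to the column type.
 */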
void filterPrepare(void* expr, void* param) {
tSQLBinaryExpr *pExpr = (tSQLBinaryExpr*) expr;
if (pExpr->info != NULL) {
return;
}
int32_t i = 0, offset = 0;
pExpr->info = calloc(1, sizeof(tQueryInfo));
tQueryInfo* pInfo = pExpr->info;
SSyntaxTreeFilterSupporter* pSupporter = (SSyntaxTreeFilterSupporter*)param;
tVariant* pCond = pExpr->pRight->pVal;
SSchema* pSchema = pExpr->pLeft->pSchema;
getTagColumnInfo(pSupporter, pSchema, &i, &offset);
assert((i >= 0 && i < TSDB_MAX_TAGS) || (i == TSDB_TBNAME_COLUMN_INDEX));
assert((offset >= 0 && offset < TSDB_MAX_TAGS_LEN) || (offset == TSDB_TBNAME_COLUMN_INDEX));
pInfo->sch = *pSchema;
pInfo->colIdx = i;
pInfo->optr = pExpr->nSQLBinaryOptr;
pInfo->offset = offset;
pInfo->compare = getFilterComparator(pSchema->type, pCond->nType, pInfo->optr);
tVariantAssign(&pInfo->q, pCond);
tVariantTypeSetType(&pInfo->q, pInfo->sch.type);
}
void tSQLListTraverseDestroyInfo(void* param) {
if (param == NULL) {
return;
}
tQueryInfo* pInfo = (tQueryInfo*)param;
tVariantDestroy(&(pInfo->q));
free(param);
}
static int32_t mgmtFilterMeterByIndex(STabObj* pMetric, tQueryResultset* pRes, char* pCond, int32_t condLen) {
SSchema* pTagSchema = (SSchema*)(pMetric->schema + pMetric->numOfColumns * sizeof(SSchema));
tSQLBinaryExpr* pExpr = NULL;
tSQLBinaryExprFromString(&pExpr, pTagSchema, pMetric->numOfTags, pCond, condLen);
// failed to build expression, no result, return immediately
if (pExpr == NULL) {
mError("metric:%s, no result returned, error in super table query expression:%s", pMetric->meterId, pCond);
tfree(pCond);
return TSDB_CODE_OPS_NOT_SUPPORT;
} else { // query according to the binary expression
SSyntaxTreeFilterSupporter s = {.pTagSchema = pTagSchema, .numOfTags = pMetric->numOfTags};
SBinaryFilterSupp supp = {.fp = (__result_filter_fn_t)tSkipListNodeFilterCallback,
.setupInfoFn = (__do_filter_suppl_fn_t)filterPrepare,
.pExtInfo = &s};
// tSQLBinaryExprTraverse(pExpr, pMetric->pSkipList, pRes, &supp);
tSQLBinaryExprDestroy(&pExpr, tSQLListTraverseDestroyInfo);
}
tansformQueryResult(pRes);
return TSDB_CODE_SUCCESS;
}
int32_t mgmtRetrieveMetersFromSuperTable(SSuperTableMetaMsg* pMsg, int32_t tableIndex, tQueryResultset* pRes) {
SMetricMetaElemMsg* pElem = (SMetricMetaElemMsg*)((char*)pMsg + pMsg->metaElem[tableIndex]);
STabObj* pMetric = mgmtGetTable(pElem->meterId);
char* pCond = NULL;
char* tmpTableNameCond = NULL;
// no table has been created under this metric.
if (pMetric->pSkipList == NULL || pMetric->pSkipList->nSize == 0) {
assert(pMetric->numOfMeters == 0);
return TSDB_CODE_SUCCESS;
}
char* pQueryCond = (char*)pMsg + pElem->cond;
int32_t condLen = pElem->condLen;
// convert the unicode (UCS-4) condition string to an mbs binary expression
if (condLen > 0) {
pCond = calloc(1, (condLen + 1) * TSDB_NCHAR_SIZE);
taosUcs4ToMbs(pQueryCond, condLen * TSDB_NCHAR_SIZE, pCond);
condLen = strlen(pCond) + 1;
mTrace("metric:%s len:%d, metric query condition:%s", pMetric->meterId, condLen, pCond);
}
char* tablenameCond = (char*)pMsg + pElem->tableCond;
if (pElem->tableCondLen > 0) {
tmpTableNameCond = calloc(1, pElem->tableCondLen + 1);
strncpy(tmpTableNameCond, tablenameCond, pElem->tableCondLen);
mTrace("metric:%s rel:%d, len:%d, table name cond:%s", pMetric->meterId, pElem->rel, pElem->tableCondLen,
tmpTableNameCond);
}
if (pElem->tableCondLen > 0 || condLen > 0) {
mgmtFilterByTableNameCond(pRes, tmpTableNameCond, pElem->tableCondLen, pMetric);
bool noNextCal = (pRes->num == 0 && pElem->rel == TSDB_RELATION_AND); // no need to calculate next result
if (!noNextCal && condLen > 0) {
tQueryResultset filterRes = {0};
int32_t ret = mgmtFilterMeterByIndex(pMetric, &filterRes, pCond, condLen);
if (ret != TSDB_CODE_SUCCESS) {
tfree(pCond);
tfree(tmpTableNameCond);
return ret;
}
// union or intersect of two results
assert(pElem->rel == TSDB_RELATION_AND || pElem->rel == TSDB_RELATION_OR);
if (pElem->rel == TSDB_RELATION_AND) {
if (filterRes.num == 0 || pRes->num == 0) { // intersect two sets
tQueryResultClean(pRes);
} else {
queryResultIntersect(pRes, &filterRes);
}
} else { // union two sets
queryResultUnion(pRes, &filterRes);
}
tQueryResultClean(&filterRes);
}
} else {
mTrace("metric:%s retrieve all meter, no query condition", pMetric->meterId);
pRes->num = tSkipListIterateList(pMetric->pSkipList, (tSkipListNode***)&pRes->pRes, NULL, NULL);
tansformQueryResult(pRes);
}
tfree(pCond);
tfree(tmpTableNameCond);
mTrace("metric:%s numOfRes:%d", pMetric->meterId, pRes->num);
return TSDB_CODE_SUCCESS;
}
// todo refactor!!!!!
static char* getTagValueFromMeter(STabObj* pTable, int32_t offset, int32_t len, char* param) {
if (offset == TSDB_TBNAME_COLUMN_INDEX) {
extractTableName(pTable->meterId, param);
} else {
char* tags = pTable->pTagData + offset + TSDB_TABLE_ID_LEN; // tag start position
memcpy(param, tags, len); // the caller's buffer is zero-initialized, so the value stays null-terminated
}
return param;
}
bool tSkipListNodeFilterCallback(const void* pNode, void* param) {
tQueryInfo* pInfo = (tQueryInfo*)param;
STabObj* pTable = (STabObj*)(((tSkipListNode*)pNode)->pData);
char buf[TSDB_MAX_TAGS_LEN] = {0};
char* val = getTagValueFromMeter(pTable, pInfo->offset, pInfo->sch.bytes, buf);
int8_t type = pInfo->sch.type;
int32_t ret = 0;
if (pInfo->q.nType == TSDB_DATA_TYPE_BINARY || pInfo->q.nType == TSDB_DATA_TYPE_NCHAR) {
ret = pInfo->compare(val, pInfo->q.pz);
} else {
tVariant t = {0};
tVariantCreateFromBinary(&t, val, (uint32_t) pInfo->sch.bytes, type);
ret = pInfo->compare(&t.i64Key, &pInfo->q.i64Key);
}
switch (pInfo->optr) {
case TSDB_RELATION_EQUAL: {
return ret == 0;
}
case TSDB_RELATION_NOT_EQUAL: {
return ret != 0;
}
case TSDB_RELATION_LARGE_EQUAL: {
return ret >= 0;
}
case TSDB_RELATION_LARGE: {
return ret > 0;
}
case TSDB_RELATION_LESS_EQUAL: {
return ret <= 0;
}
case TSDB_RELATION_LESS: {
return ret < 0;
}
case TSDB_RELATION_LIKE: {
return ret == 0;
}
default:
assert(false);
}
return true;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "qast.h"
#include "qextbuffer.h"
#include "taosdef.h"
#include "taosmsg.h"
#include "tlog.h"
#include "tutil.h"
#include "vnodeTagMgmt.h"
#define GET_TAG_VAL_POINTER(s, col, sc, t) ((t *)(&((s)->tags[getColumnModelOffset(sc, col)])))
#define GET_TAG_VAL(s, col, sc, t) (*GET_TAG_VAL_POINTER(s, col, sc, t))
static void tTagsPrints(SMeterSidExtInfo *pMeterInfo, SColumnModel *pSchema, SColumnOrderInfo *pOrder);
static void tSidSetDisplay(tSidSet *pSets);
//todo merge with losertree_compar/ext_comp
int32_t doCompare(char* f1, char* f2, int32_t type, int32_t size) {
switch (type) {
case TSDB_DATA_TYPE_INT: DEFAULT_COMP(GET_INT32_VAL(f1), GET_INT32_VAL(f2));
case TSDB_DATA_TYPE_DOUBLE: DEFAULT_COMP(GET_DOUBLE_VAL(f1), GET_DOUBLE_VAL(f2));
case TSDB_DATA_TYPE_FLOAT: DEFAULT_COMP(GET_FLOAT_VAL(f1), GET_FLOAT_VAL(f2));
case TSDB_DATA_TYPE_BIGINT: DEFAULT_COMP(GET_INT64_VAL(f1), GET_INT64_VAL(f2));
case TSDB_DATA_TYPE_SMALLINT: DEFAULT_COMP(GET_INT16_VAL(f1), GET_INT16_VAL(f2));
case TSDB_DATA_TYPE_TINYINT:
case TSDB_DATA_TYPE_BOOL: DEFAULT_COMP(GET_INT8_VAL(f1), GET_INT8_VAL(f2));
case TSDB_DATA_TYPE_NCHAR: {
int32_t ret = wcsncmp((wchar_t*) f1, (wchar_t*) f2, size/TSDB_NCHAR_SIZE);
if (ret == 0) {
return ret;
}
return (ret < 0) ? -1 : 1;
}
default: {
int32_t ret = strncmp(f1, f2, (size_t)size);
if (ret == 0) {
return ret;
}
return (ret < 0) ? -1 : 1;
}
}
}
int32_t meterSidComparator(const void *p1, const void *p2, void *param) {
tOrderDescriptor *pOrderDesc = (tOrderDescriptor *)param;
SMeterSidExtInfo *s1 = (SMeterSidExtInfo *)p1;
SMeterSidExtInfo *s2 = (SMeterSidExtInfo *)p2;
for (int32_t i = 0; i < pOrderDesc->orderIdx.numOfCols; ++i) {
int32_t colIdx = pOrderDesc->orderIdx.pData[i];
char * f1 = NULL;
char * f2 = NULL;
int32_t type = 0;
int32_t bytes = 0;
if (colIdx == -1) {
f1 = s1->tags;
f2 = s2->tags;
type = TSDB_DATA_TYPE_BINARY;
bytes = TSDB_METER_NAME_LEN;
} else {
f1 = GET_TAG_VAL_POINTER(s1, colIdx, pOrderDesc->pColumnModel, char);
f2 = GET_TAG_VAL_POINTER(s2, colIdx, pOrderDesc->pColumnModel, char);
SSchema *pSchema = getColumnModelSchema(pOrderDesc->pColumnModel, colIdx);
type = pSchema->type;
bytes = pSchema->bytes;
}
int32_t ret = doCompare(f1, f2, type, bytes);
if (ret == 0) {
continue;
} else {
return ret;
}
}
return 0;
}
static void median(void **pMeterSids, size_t size, int32_t s1, int32_t s2, tOrderDescriptor *pOrderDesc,
__ext_compar_fn_t compareFn) {
int32_t midIdx = ((s2 - s1) >> 1) + s1;
if (compareFn(pMeterSids[midIdx], pMeterSids[s1], pOrderDesc) == 1) {
tsDataSwap(&pMeterSids[midIdx], &pMeterSids[s1], TSDB_DATA_TYPE_BINARY, size);
}
if (compareFn(pMeterSids[midIdx], pMeterSids[s2], pOrderDesc) == 1) {
tsDataSwap(&pMeterSids[midIdx], &pMeterSids[s1], TSDB_DATA_TYPE_BINARY, size);
tsDataSwap(&pMeterSids[midIdx], &pMeterSids[s2], TSDB_DATA_TYPE_BINARY, size);
} else if (compareFn(pMeterSids[s1], pMeterSids[s2], pOrderDesc) == 1) {
tsDataSwap(&pMeterSids[s1], &pMeterSids[s2], TSDB_DATA_TYPE_BINARY, size);
}
assert(compareFn(pMeterSids[midIdx], pMeterSids[s1], pOrderDesc) <= 0 &&
compareFn(pMeterSids[s1], pMeterSids[s2], pOrderDesc) <= 0);
#ifdef _DEBUG_VIEW
tTagsPrints(pMeterSids[s1], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx);
tTagsPrints(pMeterSids[midIdx], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx);
tTagsPrints(pMeterSids[s2], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx);
#endif
}
static void tInsertSort(void **pMeterSids, size_t size, int32_t startPos, int32_t endPos, void *param,
__ext_compar_fn_t compareFn) {
for (int32_t i = startPos + 1; i <= endPos; ++i) {
for (int32_t j = i; j > startPos; --j) {
if (compareFn(pMeterSids[j], pMeterSids[j - 1], param) == -1) {
tsDataSwap(&pMeterSids[j], &pMeterSids[j - 1], TSDB_DATA_TYPE_BINARY, size);
} else {
break;
}
}
}
}
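/*
 * Three-way quicksort over the meter sid list: the pivot is chosen by
 * median-of-three, elements equal to the pivot are parked at both ends during
 * partitioning and moved next to the split point afterwards, and ranges of at
 * most eight elements fall back to insertion sort.
 */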
void tQSortEx(void **pMeterSids, size_t size, int32_t start, int32_t end, void *param, __ext_compar_fn_t compareFn) {
tOrderDescriptor *pOrderDesc = (tOrderDescriptor *)param;
// for short ranges, use insertion sort instead of the quicksort partitioning
if (end - start + 1 <= 8) {
tInsertSort(pMeterSids, size, start, end, pOrderDesc, compareFn);
return;
}
median(pMeterSids, size, start, end, pOrderDesc, compareFn);
int32_t s = start, e = end;
int32_t endRightS = end, startLeftS = start;
while (s < e) {
while (e > s) {
int32_t ret = compareFn(pMeterSids[e], pMeterSids[s], pOrderDesc);
if (ret < 0) {
break;
}
/*
* move the entries that are equal to the pivot value to the right end of the list
*/
if (ret == 0 && e != endRightS) {
tsDataSwap(&pMeterSids[e], &pMeterSids[endRightS--], TSDB_DATA_TYPE_BINARY, size);
}
e--;
}
if (e != s) {
tsDataSwap(&pMeterSids[e], &pMeterSids[s], TSDB_DATA_TYPE_BINARY, size);
}
while (s < e) {
int32_t ret = compareFn(pMeterSids[s], pMeterSids[e], pOrderDesc);
if (ret > 0) {
break;
}
if (ret == 0 && s != startLeftS) {
tsDataSwap(&pMeterSids[s], &pMeterSids[startLeftS++], TSDB_DATA_TYPE_BINARY, size);
}
s++;
}
if (e != s) {
tsDataSwap(&pMeterSids[s], &pMeterSids[e], TSDB_DATA_TYPE_BINARY, size);
}
}
int32_t rightPartStart = e + 1;
if (endRightS != end && e < end) {
int32_t left = rightPartStart;
int32_t right = end;
while (right > endRightS && left <= endRightS) {
tsDataSwap(&pMeterSids[left++], &pMeterSids[right--], TSDB_DATA_TYPE_BINARY, size);
}
rightPartStart += (end - endRightS);
}
int32_t leftPartEnd = e - 1;
if (startLeftS != end && s > start) {
int32_t left = start;
int32_t right = leftPartEnd;
while (left < startLeftS && right >= startLeftS) {
tsDataSwap(&pMeterSids[left++], &pMeterSids[right--], TSDB_DATA_TYPE_BINARY, size);
}
leftPartEnd -= (startLeftS - start);
}
if (leftPartEnd > start) {
tQSortEx(pMeterSids, size, start, leftPartEnd, pOrderDesc, compareFn);
}
if (rightPartStart < end) {
tQSortEx(pMeterSids, size, rightPartStart, end, pOrderDesc, compareFn);
}
}
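/*
 * Given a list already sorted by the group-by tag columns, record the start
 * offset of every distinct subgroup; the returned array holds numOfSubset+1
 * entries, the last one being numOfMeters.
 */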
int32_t *calculateSubGroup(void **pSids, int32_t numOfMeters, int32_t *numOfSubset, tOrderDescriptor *pOrderDesc,
__ext_compar_fn_t compareFn) {
int32_t *starterPos = (int32_t *)malloc((numOfMeters + 1) * sizeof(int32_t)); // add additional buffer
starterPos[0] = 0;
*numOfSubset = 1;
for (int32_t i = 1; i < numOfMeters; ++i) {
int32_t ret = compareFn(pSids[i - 1], pSids[i], pOrderDesc);
if (ret != 0) {
assert(ret == -1);
starterPos[(*numOfSubset)++] = i;
}
}
starterPos[*numOfSubset] = numOfMeters;
assert(*numOfSubset <= numOfMeters);
return starterPos;
}
tSidSet *tSidSetCreate(struct SMeterSidExtInfo **pMeterSidExtInfo, int32_t numOfMeters, SSchema *pSchema,
int32_t numOfTags, SColIndexEx *colList, int32_t numOfCols) {
tSidSet *pSidSet = (tSidSet *)calloc(1, sizeof(tSidSet) + numOfCols * sizeof(int16_t));
if (pSidSet == NULL) {
return NULL;
}
pSidSet->numOfSids = numOfMeters;
pSidSet->pSids = pMeterSidExtInfo;
pSidSet->pColumnModel = createColumnModel(pSchema, numOfTags, 1);
pSidSet->orderIdx.numOfCols = numOfCols;
/*
* in case of "group by tbname,normal_col", the normal_col is ignored
*/
int32_t numOfTagCols = 0;
for(int32_t i = 0; i < numOfCols; ++i) {
if (colList[i].flag == TSDB_COL_TAG) {
pSidSet->orderIdx.pData[numOfTagCols++] = colList[i].colIdx;
}
}
pSidSet->orderIdx.numOfCols = numOfTagCols;
pSidSet->starterPos = NULL;
return pSidSet;
}
void tSidSetDestroy(tSidSet **pSets) {
if ((*pSets) != NULL) {
tfree((*pSets)->starterPos);
tfree((*pSets)->pColumnModel);
(*pSets)->pSids = NULL;
tfree(*pSets);
}
}
void tTagsPrints(SMeterSidExtInfo *pMeterInfo, SColumnModel *pSchema, SColumnOrderInfo *pOrder) {
if (pSchema == NULL) {
return;
}
printf("sid: %-5d tags(", pMeterInfo->sid);
for (int32_t i = 0; i < pOrder->numOfCols; ++i) {
int32_t colIndex = pOrder->pData[i];
// it is the tbname column
if (colIndex == -1) {
printf("%s, ", pMeterInfo->tags);
continue;
}
SSchema* s = getColumnModelSchema(pSchema, colIndex);
switch (s->type) {
case TSDB_DATA_TYPE_INT:
printf("%d, ", GET_TAG_VAL(pMeterInfo, colIndex, pSchema, int32_t));
break;
case TSDB_DATA_TYPE_DOUBLE:
printf("%lf, ", GET_TAG_VAL(pMeterInfo, colIndex, pSchema, double));
break;
case TSDB_DATA_TYPE_FLOAT:
printf("%f, ", GET_TAG_VAL(pMeterInfo, colIndex, pSchema, float));
break;
case TSDB_DATA_TYPE_BIGINT:
printf("%" PRId64 ", ", GET_TAG_VAL(pMeterInfo, colIndex, pSchema, int64_t));
break;
case TSDB_DATA_TYPE_SMALLINT:
printf("%d, ", GET_TAG_VAL(pMeterInfo, colIndex, pSchema, int16_t));
break;
case TSDB_DATA_TYPE_TINYINT:
printf("%d, ", GET_TAG_VAL(pMeterInfo, colIndex, pSchema, int8_t));
break;
case TSDB_DATA_TYPE_BINARY:
printf("%s, ", GET_TAG_VAL_POINTER(pMeterInfo, colIndex, pSchema, char));
break;
case TSDB_DATA_TYPE_NCHAR: {
char *data = GET_TAG_VAL_POINTER(pMeterInfo, colIndex, pSchema, char);
char buffer[512] = {0};
taosUcs4ToMbs(data, s->bytes, buffer);
printf("%s, ", buffer);
break;
}
case TSDB_DATA_TYPE_BOOL:
printf("%d, ", GET_TAG_VAL(pMeterInfo, colIndex, pSchema, int8_t));
break;
default:
assert(false);
}
}
printf(")\n");
}
/*
* display all the subset groups for debug purpose only
*/
static void UNUSED_FUNC tSidSetDisplay(tSidSet *pSets) {
printf("%d meters.\n", pSets->numOfSids);
for (int32_t i = 0; i < pSets->numOfSids; ++i) {
printf("%d\t", pSets->pSids[i]->sid);
}
printf("\n");
printf("total number of subset group is: %d\n", pSets->numOfSubSet);
for (int32_t i = 0; i < pSets->numOfSubSet; ++i) {
int32_t s = pSets->starterPos[i];
int32_t e = pSets->starterPos[i + 1];
printf("the %d-th subgroup: \n", i + 1);
for (int32_t j = s; j < e; ++j) {
tTagsPrints(pSets->pSids[j], pSets->pColumnModel, &pSets->orderIdx);
}
}
}
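/*
 * Sort the meter sid list by the group-by tag columns and split it into
 * subgroups; without group-by columns, all meters fall into a single
 * subgroup.
 */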
void tSidSetSort(tSidSet *pSets) {
pTrace("number of meters in sort: %d", pSets->numOfSids);
SColumnOrderInfo *pOrderIdx = &pSets->orderIdx;
if (pOrderIdx->numOfCols == 0 || pSets->numOfSids <= 1 || pSets->pColumnModel == NULL) { // no group by tags clause
pSets->numOfSubSet = 1;
pSets->starterPos = (int32_t *)malloc(sizeof(int32_t) * (pSets->numOfSubSet + 1));
pSets->starterPos[0] = 0;
pSets->starterPos[1] = pSets->numOfSids;
pTrace("all meters belong to one subgroup, no need to subgrouping ops");
#ifdef _DEBUG_VIEW
tSidSetDisplay(pSets);
#endif
} else {
tOrderDescriptor *descriptor =
(tOrderDescriptor *)calloc(1, sizeof(tOrderDescriptor) + sizeof(int16_t) * pSets->orderIdx.numOfCols);
descriptor->pColumnModel = pSets->pColumnModel;
descriptor->orderIdx = pSets->orderIdx;
memcpy(descriptor->orderIdx.pData, pOrderIdx->pData, sizeof(int16_t) * pSets->orderIdx.numOfCols);
tQSortEx((void **)pSets->pSids, POINTER_BYTES, 0, pSets->numOfSids - 1, descriptor, meterSidComparator);
pSets->starterPos =
calculateSubGroup((void **)pSets->pSids, pSets->numOfSids, &pSets->numOfSubSet, descriptor, meterSidComparator);
#ifdef _DEBUG_VIEW
tSidSetDisplay(pSets);
#endif
tfree(descriptor);
}
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _DEFAULT_SOURCE
#include "os.h"
#include "qast.h"
#include "tscUtil.h"
#include "tschemautil.h"
#include "vnode.h"
#include "vnodeDataFilterFunc.h"
#include "vnodeStatus.h"
#include "vnodeUtil.h"
int vnodeCheckFileIntegrity(FILE* fp) {
/*
int savedSessions, savedMeterSize;
fseek(fp, TSDB_FILE_HEADER_LEN/3, SEEK_SET);
fscanf(fp, "%d %d", &savedSessions, &savedMeterSize);
if ( (savedSessions != tsSessionsPerVnode) || (savedMeterSize != tsMeterSizeOnFile) ) {
dError("file structure is changed");
return -1;
}
uint64_t checkSum = 0, savedCheckSum=0;
checkSum = taosGetCheckSum(fp, TSDB_FILE_HEADER_LEN);
fseek(fp, TSDB_FILE_HEADER_LEN - cksumsize, SEEK_SET);
fread(&savedCheckSum, cksumsize, 1, fp);
if ( savedCheckSum != checkSum ) {
dError("check sum is not matched:0x%x 0x%x", checkSum, savedCheckSum);
return -1;
}
*/
return 0;
}
void vnodeCreateFileHeaderFd(int fd) {
char temp[TSDB_FILE_HEADER_LEN / 4];
int lineLen;
lineLen = sizeof(temp);
// write the first line
memset(temp, 0, lineLen);
*(int16_t*)temp = vnodeFileVersion;
sprintf(temp + sizeof(int16_t), "tsdb version: %s\n", version);
/* *((int16_t *)(temp + TSDB_FILE_HEADER_LEN/8)) = vnodeFileVersion; */
lseek(fd, 0, SEEK_SET);
twrite(fd, temp, lineLen);
// second line
memset(temp, 0, lineLen);
twrite(fd, temp, lineLen);
// the third/fourth lines hold the dynamic info
memset(temp, 0, lineLen);
twrite(fd, temp, lineLen);
twrite(fd, temp, lineLen);
}
void vnodeGetHeadFileHeaderInfo(int fd, SVnodeHeadInfo* pHeadInfo) {
lseek(fd, TSDB_FILE_HEADER_LEN / 4, SEEK_SET);
read(fd, pHeadInfo, sizeof(SVnodeHeadInfo));
}
void vnodeUpdateHeadFileHeader(int fd, SVnodeHeadInfo* pHeadInfo) {
lseek(fd, TSDB_FILE_HEADER_LEN / 4, SEEK_SET);
twrite(fd, pHeadInfo, sizeof(SVnodeHeadInfo));
}
void vnodeCreateFileHeader(FILE* fp) {
char temp[TSDB_FILE_HEADER_LEN / 4];
int lineLen;
lineLen = sizeof(temp);
// write the first line
memset(temp, 0, lineLen);
*(int16_t*)temp = vnodeFileVersion;
sprintf(temp + sizeof(int16_t), "tsdb version: %s\n", version);
/* *((int16_t *)(temp + TSDB_FILE_HEADER_LEN/8)) = vnodeFileVersion; */
fseek(fp, 0, SEEK_SET);
fwrite(temp, lineLen, 1, fp);
// second line
memset(temp, 0, lineLen);
fwrite(temp, lineLen, 1, fp);
// the third/fourth lines hold the dynamic info
memset(temp, 0, lineLen);
fwrite(temp, lineLen, 1, fp);
fwrite(temp, lineLen, 1, fp);
}
SSqlGroupbyExpr* vnodeCreateGroupbyExpr(SQueryMeterMsg* pQueryMsg, int32_t* code) {
if (pQueryMsg->numOfGroupCols == 0) {
return NULL;
}
// using group by tag columns
SSqlGroupbyExpr* pGroupbyExpr =
(SSqlGroupbyExpr*)malloc(sizeof(SSqlGroupbyExpr) + pQueryMsg->numOfGroupCols * sizeof(SColIndexEx));
if (pGroupbyExpr == NULL) {
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
return NULL;
}
SColIndexEx* pGroupbyColInfo = (SColIndexEx*)pQueryMsg->groupbyTagIds;
pGroupbyExpr->numOfGroupCols = pQueryMsg->numOfGroupCols;
pGroupbyExpr->orderType = pQueryMsg->orderType;
pGroupbyExpr->orderIndex = pQueryMsg->orderByIdx;
memcpy(pGroupbyExpr->columnInfo, pGroupbyColInfo, sizeof(SColIndexEx) * pGroupbyExpr->numOfGroupCols);
// TODO: update the colIndexInBuf for each column in group by clause
return pGroupbyExpr;
}
static SSchema* toSchema(SQueryMeterMsg* pQuery, SColumnInfo* pCols, int32_t numOfCols) {
char* start = (char*)pQuery->colNameList;
char* end = start;
SSchema* pSchema = calloc(1, sizeof(SSchema) * numOfCols);
for (int32_t i = 0; i < numOfCols; ++i) {
pSchema[i].type = pCols[i].type;
pSchema[i].bytes = pCols[i].bytes;
pSchema[i].colId = pCols[i].colId;
end = strstr(start, ",");
memcpy(pSchema[i].name, start, end - start);
start = end + 1;
}
return pSchema;
}
static int32_t id_compar(const void* left, const void* right) {
DEFAULT_COMP(GET_INT16_VAL(left), GET_INT16_VAL(right));
}
static int32_t vnodeBuildExprFromArithmeticStr(SSqlFunctionExpr* pExpr, SQueryMeterMsg* pQueryMsg) {
SSqlBinaryExprInfo* pBinaryExprInfo = &pExpr->pBinExprInfo;
SColumnInfo* pColMsg = pQueryMsg->colList;
tSQLBinaryExpr* pBinExpr = NULL;
SSchema* pSchema = toSchema(pQueryMsg, pColMsg, pQueryMsg->numOfCols);
dTrace("qmsg:%p create binary expr from string:%s", pQueryMsg, pExpr->pBase.arg[0].argValue.pz);
tSQLBinaryExprFromString(&pBinExpr, pSchema, pQueryMsg->numOfCols, pExpr->pBase.arg[0].argValue.pz,
pExpr->pBase.arg[0].argBytes);
if (pBinExpr == NULL) {
dError("qmsg:%p failed to create arithmetic expression string from:%s", pQueryMsg, pExpr->pBase.arg[0].argValue.pz);
return TSDB_CODE_APP_ERROR;
}
pBinaryExprInfo->pBinExpr = pBinExpr;
int32_t num = 0;
int16_t ids[TSDB_MAX_COLUMNS] = {0};
tSQLBinaryExprTrv(pBinExpr, &num, ids);
qsort(ids, num, sizeof(int16_t), id_compar);
int32_t i = 0, j = 0;
while (i < num && j < num) {
if (ids[i] == ids[j]) {
j++;
} else {
ids[++i] = ids[j++];
}
}
assert(i <= num);
// there may be duplicated referenced columns.
num = i + 1;
pBinaryExprInfo->pReqColumns = malloc(sizeof(SColIndexEx) * num);
for (int32_t k = 0; k < num; ++k) {
SColIndexEx* pColIndex = &pBinaryExprInfo->pReqColumns[k];
pColIndex->colId = ids[k];
}
pBinaryExprInfo->numOfCols = num;
free(pSchema);
return TSDB_CODE_SUCCESS;
}
static int32_t getColumnIndexInSource(SQueryMeterMsg* pQueryMsg, SSqlFuncExprMsg* pExprMsg) {
int32_t j = 0;
while(j < pQueryMsg->numOfCols) {
if (pExprMsg->colInfo.colId == pQueryMsg->colList[j].colId) {
break;
}
j += 1;
}
return j;
}
bool vnodeValidateExprColumnInfo(SQueryMeterMsg* pQueryMsg, SSqlFuncExprMsg* pExprMsg) {
int32_t j = getColumnIndexInSource(pQueryMsg, pExprMsg);
return j < pQueryMsg->numOfCols;
}
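/*
 * Build the SSqlFunctionExpr array from the query message: tag columns take
 * their type/bytes from the tag schema, arithmetic expressions are parsed
 * from their string form, and normal columns are resolved against colList.
 * A second pass fixes the result size of top/bottom functions once the total
 * tag length of the selection clause is known.
 */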
SSqlFunctionExpr* vnodeCreateSqlFunctionExpr(SQueryMeterMsg* pQueryMsg, int32_t* code) {
SSqlFunctionExpr* pExprs = (SSqlFunctionExpr*)calloc(1, sizeof(SSqlFunctionExpr) * pQueryMsg->numOfOutputCols);
if (pExprs == NULL) {
tfree(pQueryMsg->pSqlFuncExprs);
*code = TSDB_CODE_SERV_OUT_OF_MEMORY;
return NULL;
}
bool isSuperTable = QUERY_IS_STABLE_QUERY(pQueryMsg->queryType);
int16_t tagLen = 0;
SSchema* pTagSchema = (SSchema*)pQueryMsg->pTagSchema;
for (int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
pExprs[i].pBase = *((SSqlFuncExprMsg**)pQueryMsg->pSqlFuncExprs)[i];
pExprs[i].resBytes = 0;
int16_t type = 0;
int16_t bytes = 0;
SColIndexEx* pColumnIndexExInfo = &pExprs[i].pBase.colInfo;
// tag column schema is kept in pQueryMsg->pColumnModel
if (TSDB_COL_IS_TAG(pColumnIndexExInfo->flag)) {
if (pColumnIndexExInfo->colIdx >= pQueryMsg->numOfTagsCols) {
*code = TSDB_CODE_INVALID_QUERY_MSG;
tfree(pExprs);
return NULL;
}
type = pTagSchema[pColumnIndexExInfo->colIdx].type;
bytes = pTagSchema[pColumnIndexExInfo->colIdx].bytes;
} else { // parse the arithmetic expression
if (pExprs[i].pBase.functionId == TSDB_FUNC_ARITHM) {
*code = vnodeBuildExprFromArithmeticStr(&pExprs[i], pQueryMsg);
if (*code != TSDB_CODE_SUCCESS) {
tfree(pExprs);
return NULL;
}
type = TSDB_DATA_TYPE_DOUBLE;
bytes = tDataTypeDesc[type].nSize;
} else { // parse the normal column
int32_t j = getColumnIndexInSource(pQueryMsg, &pExprs[i].pBase);
assert(j < pQueryMsg->numOfCols);
SColumnInfo* pCol = &pQueryMsg->colList[j];
type = pCol->type;
bytes = pCol->bytes;
}
}
int32_t param = pExprs[i].pBase.arg[0].argValue.i64;
if (getResultDataInfo(type, bytes, pExprs[i].pBase.functionId, param, &pExprs[i].resType, &pExprs[i].resBytes,
&pExprs[i].interResBytes, 0, isSuperTable) != TSDB_CODE_SUCCESS) {
*code = TSDB_CODE_INVALID_QUERY_MSG;
return NULL;
}
if (pExprs[i].pBase.functionId == TSDB_FUNC_TAG_DUMMY || pExprs[i].pBase.functionId == TSDB_FUNC_TS_DUMMY) {
tagLen += pExprs[i].resBytes;
}
assert(isValidDataType(pExprs[i].resType, pExprs[i].resBytes));
}
//get the correct result size for top/bottom query, according to the number of tag columns in the selection clause
// TODO refactor
for(int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
pExprs[i].pBase = *((SSqlFuncExprMsg**)pQueryMsg->pSqlFuncExprs)[i];
int16_t functId = pExprs[i].pBase.functionId;
if (functId == TSDB_FUNC_TOP || functId == TSDB_FUNC_BOTTOM) {
int32_t j = getColumnIndexInSource(pQueryMsg, &pExprs[i].pBase);
assert(j < pQueryMsg->numOfCols);
SColumnInfo* pCol = &pQueryMsg->colList[j];
int16_t type = pCol->type;
int16_t bytes = pCol->bytes;
int32_t ret = getResultDataInfo(type, bytes, pExprs[i].pBase.functionId, pExprs[i].pBase.arg[0].argValue.i64,
&pExprs[i].resType, &pExprs[i].resBytes, &pExprs[i].interResBytes, tagLen, isSuperTable);
assert(ret == TSDB_CODE_SUCCESS);
}
}
tfree(pQueryMsg->pSqlFuncExprs);
return pExprs;
}
bool vnodeIsValidVnodeCfg(SVnodeCfg* pCfg) {
if (pCfg == NULL) return false;
if (pCfg->maxSessions <= 0 || pCfg->cacheBlockSize <= 0 || pCfg->replications <= 0 || pCfg->replications > 20 ||
pCfg->daysPerFile <= 0 || pCfg->daysToKeep <= 0) {
return false;
}
return true;
}
/**
* compare if schema of two tables are identical.
* when multi-table query is issued, the schemas of all requested tables
* should be identical. Otherwise,query process will abort.
*/
bool vnodeMeterSchemaIdentical(SColumn* pSchema1, int32_t numOfCols1, SColumn* pSchema2, int32_t numOfCols2) {
if (!VALIDNUMOFCOLS(numOfCols1) || !VALIDNUMOFCOLS(numOfCols2) || numOfCols1 != numOfCols2) {
return false;
}
return memcmp((char*)pSchema1, (char*)pSchema2, sizeof(SColumn) * numOfCols1) == 0;
}
void vnodeFreeFields(SQuery* pQuery) {
if (pQuery == NULL || pQuery->pFields == NULL) {
return;
}
for (int32_t i = 0; i < pQuery->numOfBlocks; ++i) {
tfree(pQuery->pFields[i]);
}
/*
* pQuery->pFields does not need to be released, it is allocated at the last part of pBlock
* so free(pBlock) can release this memory at the same time.
*/
pQuery->pFields = NULL;
pQuery->numOfBlocks = 0;
}
void vnodeUpdateFilterColumnIndex(SQuery* pQuery) {
for (int32_t i = 0; i < pQuery->numOfFilterCols; ++i) {
for (int16_t j = 0; j < pQuery->numOfCols; ++j) {
if (pQuery->pFilterInfo[i].info.data.colId == pQuery->colList[j].data.colId) {
pQuery->pFilterInfo[i].info.colIdx = pQuery->colList[j].colIdx;
pQuery->pFilterInfo[i].info.colIdxInBuf = pQuery->colList[j].colIdxInBuf;
// the supplementary scan also requires this column
pQuery->colList[j].req[1] = 1;
break;
}
}
}
// set the column index in buffer for arithmetic operation
if (pQuery->pSelectExpr == NULL) {
return;
}
for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
SSqlBinaryExprInfo* pBinExprInfo = &pQuery->pSelectExpr[i].pBinExprInfo;
if (pBinExprInfo->pBinExpr == NULL) {
continue;
}
for (int16_t j = 0; j < pBinExprInfo->numOfCols; ++j) {
for (int32_t k = 0; k < pQuery->numOfCols; ++k) {
if (pBinExprInfo->pReqColumns[j].colId == pQuery->colList[k].data.colId) {
pBinExprInfo->pReqColumns[j].colIdxInBuf = pQuery->colList[k].colIdxInBuf;
assert(pQuery->colList[k].colIdxInBuf == k);
break;
}
}
}
}
}
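/*
* build the per-column filter information for a query: for every column that carries filter conditions,
* copy the filter elements and bind the proper range/value filter function according to the column type
* and the lower/upper relation operators
*/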
// TODO support k<12 and k<>9
int32_t vnodeCreateFilterInfo(void* pQInfo, SQuery* pQuery) {
for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
if (pQuery->colList[i].data.numOfFilters > 0) {
pQuery->numOfFilterCols++;
}
}
if (pQuery->numOfFilterCols == 0) {
return TSDB_CODE_SUCCESS;
}
pQuery->pFilterInfo = calloc(1, sizeof(SSingleColumnFilterInfo) * pQuery->numOfFilterCols);
for (int32_t i = 0, j = 0; i < pQuery->numOfCols; ++i) {
if (pQuery->colList[i].data.numOfFilters > 0) {
SSingleColumnFilterInfo* pFilterInfo = &pQuery->pFilterInfo[j];
memcpy(&pFilterInfo->info, &pQuery->colList[i], sizeof(SColumnInfoEx));
pFilterInfo->info.data.filters = NULL;
pFilterInfo->numOfFilters = pQuery->colList[i].data.numOfFilters;
pFilterInfo->pFilters = calloc(pFilterInfo->numOfFilters, sizeof(SColumnFilterElem));
for(int32_t f = 0; f < pFilterInfo->numOfFilters; ++f) {
SColumnFilterElem *pSingleColFilter = &pFilterInfo->pFilters[f];
pSingleColFilter->filterInfo = pQuery->colList[i].data.filters[f];
int32_t lower = pSingleColFilter->filterInfo.lowerRelOptr;
int32_t upper = pSingleColFilter->filterInfo.upperRelOptr;
if (lower == TSDB_RELATION_INVALID && upper == TSDB_RELATION_INVALID) {
dError("QInfo:%p invalid filter info", pQInfo);
return TSDB_CODE_INVALID_QUERY_MSG;
}
int16_t type = pQuery->colList[i].data.type;
int16_t bytes = pQuery->colList[i].data.bytes;
__filter_func_t *rangeFilterArray = vnodeGetRangeFilterFuncArray(type);
__filter_func_t *filterArray = vnodeGetValueFilterFuncArray(type);
if (rangeFilterArray == NULL && filterArray == NULL) {
dError("QInfo:%p failed to get filter function, invalid data type:%d", pQInfo, type);
return TSDB_CODE_INVALID_QUERY_MSG;
}
if ((lower == TSDB_RELATION_LARGE_EQUAL || lower == TSDB_RELATION_LARGE) &&
(upper == TSDB_RELATION_LESS_EQUAL || upper == TSDB_RELATION_LESS)) {
if (lower == TSDB_RELATION_LARGE_EQUAL) {
if (upper == TSDB_RELATION_LESS_EQUAL) {
pSingleColFilter->fp = rangeFilterArray[4];
} else {
pSingleColFilter->fp = rangeFilterArray[2];
}
} else {
if (upper == TSDB_RELATION_LESS_EQUAL) {
pSingleColFilter->fp = rangeFilterArray[3];
} else {
pSingleColFilter->fp = rangeFilterArray[1];
}
}
} else { // set callback filter function
if (lower != TSDB_RELATION_INVALID) {
pSingleColFilter->fp = filterArray[lower];
if (upper != TSDB_RELATION_INVALID) {
dError("pQInfo:%p failed to get filter function, invalid filter condition", pQInfo, type);
return TSDB_CODE_INVALID_QUERY_MSG;
}
} else {
pSingleColFilter->fp = filterArray[upper];
}
}
assert(pSingleColFilter->fp != NULL);
pSingleColFilter->bytes = bytes;
}
j++;
}
}
return TSDB_CODE_SUCCESS;
}
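// check whether the row at elemPos passes all column filters; NULL values never qualify, and within one
// column the filter elements are combined with OR semantics while different columns are combined with AND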
bool vnodeDoFilterData(SQuery* pQuery, int32_t elemPos) {
for (int32_t k = 0; k < pQuery->numOfFilterCols; ++k) {
SSingleColumnFilterInfo *pFilterInfo = &pQuery->pFilterInfo[k];
char* pElem = pFilterInfo->pData + pFilterInfo->info.data.bytes * elemPos;
if(isNull(pElem, pFilterInfo->info.data.type)) {
return false;
}
int32_t num = pFilterInfo->numOfFilters;
bool qualified = false;
for(int32_t j = 0; j < num; ++j) {
SColumnFilterElem* pFilterElem = &pFilterInfo->pFilters[j];
if (pFilterElem->fp(pFilterElem, pElem, pElem)) {
qualified = true;
break;
}
}
if (!qualified) {
return false;
}
}
return true;
}
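// apply the column filters to one row and consume the limit offset; counts every inspected row and returns
// true only for rows that qualify after the offset has been exhausted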
bool vnodeFilterData(SQuery* pQuery, int32_t* numOfActualRead, int32_t index) {
(*numOfActualRead)++;
if (!vnodeDoFilterData(pQuery, index)) {
return false;
}
if (pQuery->limit.offset > 0) {
pQuery->limit.offset--; // ignore this qualified row
return false;
}
return true;
}
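// a query is a projection query only if every output column uses the TSDB_FUNC_PRJ function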
bool vnodeIsProjectionQuery(SSqlFunctionExpr* pExpr, int32_t numOfOutput) {
for (int32_t i = 0; i < numOfOutput; ++i) {
if (pExpr[i].pBase.functionId != TSDB_FUNC_PRJ) {
return false;
}
}
return true;
}
/*
* pTable->state may be changed by vnodeIsSafeToDeleteMeter and the import/update processors, so the
* check of the state will not always be correct.
*
* Import/update/delete operations are actually blocked by the current query processing once the meter
* state check passes, but later queries are denied.
*
* 1. vnodeIsSafeToDeleteMeter waits for this function to complete, since it also uses the vmutex to check numOfQueries.
* 2. import checks numOfQueries again after setting the state to TSDB_METER_STATE_IMPORTING, also under the vmutex.
* 3. insert has nothing to do with query processing.
*/
int32_t vnodeIncQueryRefCount(SQueryMeterMsg* pQueryMsg, SMeterSidExtInfo** pSids, SMeterObj** pMeterObjList,
int32_t* numOfIncTables) {
SVnodeObj* pVnode = &vnodeList[pQueryMsg->vnode];
int32_t num = 0;
int32_t index = 0;
int32_t code = TSDB_CODE_SUCCESS;
for (int32_t i = 0; i < pQueryMsg->numOfSids; ++i) {
SMeterObj* pTable = pVnode->meterList[pSids[i]->sid];
/*
* If the table is missing or is being dropped, request its config from the management node and ignore it
* during query processing. The error code TSDB_CODE_NOT_ACTIVE_TABLE is never returned to the client.
* The missing table needs to be removed from the pSids list.
*/
if (pTable == NULL || vnodeIsMeterState(pTable, TSDB_METER_STATE_DROPPING)) {
dWarn("qmsg:%p, vid:%d sid:%d, not there or will be dropped, ignore this table in query", pQueryMsg,
pQueryMsg->vnode, pSids[i]->sid);
vnodeSendMeterCfgMsg(pQueryMsg->vnode, pSids[i]->sid);
continue;
} else if (pTable->uid != pSids[i]->uid || pTable->sid != pSids[i]->sid) {
code = TSDB_CODE_TABLE_ID_MISMATCH;
dError("qmsg:%p, vid:%d sid:%d id:%s uid:%" PRIu64 ", id mismatch. sid:%d uid:%" PRId64 " in msg", pQueryMsg,
pQueryMsg->vnode, pTable->sid, pTable->meterId, pTable->uid, pSids[i]->sid, pSids[i]->uid);
vnodeSendMeterCfgMsg(pQueryMsg->vnode, pSids[i]->sid);
continue;
} else if (pTable->state > TSDB_METER_STATE_INSERTING) { //update or import
code = TSDB_CODE_ACTION_IN_PROGRESS;
dTrace("qmsg:%p, vid:%d sid:%d id:%s, it is in state:%s, wait!", pQueryMsg, pQueryMsg->vnode, pSids[i]->sid,
pTable->meterId, taosGetTableStatusStr(pTable->state));
continue;
}
/*
* vnodeIsSafeToDeleteMeter waits for this function to complete, and only then
* checks whether numOfQueries is 0.
*/
pMeterObjList[(*numOfIncTables)++] = pTable;
atomic_fetch_add_32(&pTable->numOfQueries, 1);
pSids[index++] = pSids[i];
// log meters that have more than one query executing on them
if (pTable->numOfQueries > 1) {
dTrace("qmsg:%p, vid:%d sid:%d id:%s, inc query ref, numOfQueries:%d", pQueryMsg, pTable->vnode, pTable->sid,
pTable->meterId, pTable->numOfQueries);
num++;
}
}
dTrace("qmsg:%p, query meters: %d, inc query ref %d, numOfQueries on %d meters are 1, queried meters:%d after "
"filter missing meters", pQueryMsg, pQueryMsg->numOfSids, *numOfIncTables, (*numOfIncTables) - num, index);
assert(pQueryMsg->numOfSids >= (*numOfIncTables) && pQueryMsg->numOfSids >= index);
pQueryMsg->numOfSids = index;
return code;
}
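// undo vnodeIncQueryRefCount: decrease the query reference count of every meter collected in pMeterObjList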
void vnodeDecQueryRefCount(SQueryMeterMsg* pQueryMsg, SMeterObj** pMeterObjList, int32_t numOfIncTables) {
int32_t num = 0;
for (int32_t i = 0; i < numOfIncTables; ++i) {
SMeterObj* pTable = pMeterObjList[i];
if (pTable != NULL) { // no lock is needed here to perform this operation
atomic_fetch_sub_32(&pTable->numOfQueries, 1);
if (pTable->numOfQueries > 0) {
dTrace("qmsg:%p, vid:%d sid:%d id:%s dec query ref, numOfQueries:%d", pQueryMsg, pTable->vnode, pTable->sid,
pTable->meterId, pTable->numOfQueries);
num++;
}
}
}
dTrace("qmsg:%p, dec query ref for %d meters, numOfQueries on %d meters are 0", pQueryMsg, numOfIncTables, numOfIncTables - num);
}
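/*
* align the query column list with the schema of the current meter: a single merge pass over the two lists
* (both expected to be ordered by colId) assigns colIdx, and columns absent from the meter schema get
* colIdx = -1; the column indexes referenced by the select expressions are refreshed accordingly
*/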
void vnodeUpdateQueryColumnIndex(SQuery* pQuery, SMeterObj* pMeterObj) {
if (pQuery == NULL || pMeterObj == NULL) {
return;
}
int32_t i = 0, j = 0;
while (i < pQuery->numOfCols && j < pMeterObj->numOfColumns) {
if (pQuery->colList[i].data.colId == pMeterObj->schema[j].colId) {
pQuery->colList[i++].colIdx = (int16_t)j++;
} else if (pQuery->colList[i].data.colId < pMeterObj->schema[j].colId) {
pQuery->colList[i++].colIdx = -1;
} else if (pQuery->colList[i].data.colId > pMeterObj->schema[j].colId) {
j++;
}
}
while (i < pQuery->numOfCols) {
pQuery->colList[i++].colIdx = -1; // no such column in the current meter
}
// sql expression has not been created yet
if (pQuery->pSelectExpr == NULL) {
return;
}
for(int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
SSqlFuncExprMsg* pSqlExprMsg = &pQuery->pSelectExpr[k].pBase;
if (pSqlExprMsg->functionId == TSDB_FUNC_ARITHM || pSqlExprMsg->colInfo.flag == TSDB_COL_TAG) {
continue;
}
SColIndexEx* pColIndexEx = &pSqlExprMsg->colInfo;
for(int32_t f = 0; f < pQuery->numOfCols; ++f) {
if (pColIndexEx->colId == pQuery->colList[f].data.colId) {
pColIndexEx->colIdx = pQuery->colList[f].colIdx;
break;
}
}
}
}
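// atomically switch the meter from TSDB_METER_STATE_READY to the given state; the return value is the
// previous state, so any value other than READY means the transition did not happen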
int32_t vnodeSetMeterState(SMeterObj* pMeterObj, int32_t state) {
return atomic_val_compare_exchange_32(&pMeterObj->state, TSDB_METER_STATE_READY, state);
}
void vnodeClearMeterState(SMeterObj* pMeterObj, int32_t state) {
pMeterObj->state &= (~state);
}
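// test the meter state: READY requires an exact match, DROPPING matches any state at or beyond it,
// and other states are treated as bit flags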
bool vnodeIsMeterState(SMeterObj* pMeterObj, int32_t state) {
if (state == TSDB_METER_STATE_READY) {
return pMeterObj->state == TSDB_METER_STATE_READY;
} else if (state == TSDB_METER_STATE_DROPPING) {
return pMeterObj->state >= state;
} else {
return (((pMeterObj->state) & state) == state);
}
}
void vnodeSetMeterDeleting(SMeterObj* pMeterObj) {
if (pMeterObj == NULL) {
return;
}
pMeterObj->state |= TSDB_METER_STATE_DROPPING;
}
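/*
* try to move the meter into the given insert/import state; if the meter is being dropped, return
* TSDB_CODE_NOT_ACTIVE_TABLE, otherwise ask the caller to retry with TSDB_CODE_ACTION_IN_PROGRESS
*/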
int32_t vnodeSetMeterInsertImportStateEx(SMeterObj* pObj, int32_t st) {
int32_t code = TSDB_CODE_SUCCESS;
int32_t state = vnodeSetMeterState(pObj, st);
if (state != TSDB_METER_STATE_READY) {  // return an error to denote that the insert/import is not performed
if (vnodeIsMeterState(pObj, TSDB_METER_STATE_DROPPING)) {
dTrace("vid:%d sid:%d id:%s, meter is deleted, state:%d", pObj->vnode, pObj->sid, pObj->meterId,
pObj->state);
code = TSDB_CODE_NOT_ACTIVE_TABLE;
} else {  // the caller should wait (300ms by default) and try again
dTrace("vid:%d sid:%d id:%s, try submit again since in state:%d", pObj->vnode, pObj->sid,
pObj->meterId, pObj->state);
code = TSDB_CODE_ACTION_IN_PROGRESS;
}
}
return code;
}
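/*
* decide whether a meter can be deleted right now: mark it as dropping, refuse while an
* insert/import/update is in progress, and refuse while queries still hold references to it
*/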
bool vnodeIsSafeToDeleteMeter(SVnodeObj* pVnode, int32_t sid) {
SMeterObj* pObj = pVnode->meterList[sid];
if (pObj == NULL || vnodeIsMeterState(pObj, TSDB_METER_STATE_DROPPED)) {
return true;
}
int32_t prev = vnodeSetMeterState(pObj, TSDB_METER_STATE_DROPPING);
/*
* if the meter is not in the ready/dropping state, it must be handling an insert/import/update;
* set the deleting flag and wait for that procedure to complete
*/
if (prev != TSDB_METER_STATE_READY && prev < TSDB_METER_STATE_DROPPING) {
vnodeSetMeterDeleting(pObj);
dWarn("vid:%d sid:%d id:%s, can not be deleted, state:%d, wait", pObj->vnode, pObj->sid, pObj->meterId, prev);
return false;
}
bool ready = true;
/*
* running queries will be stopped as soon as possible, since the meter state is set to
* TSDB_METER_STATE_DROPPING, and new queries will abort because the meter is being deleted.
*/
pthread_mutex_lock(&pVnode->vmutex);
if (pObj->numOfQueries > 0) {
dWarn("vid:%d sid:%d id:%s %d queries executing on it, wait query to be finished",
pObj->vnode, pObj->sid, pObj->meterId, pObj->numOfQueries);
ready = false;
}
pthread_mutex_unlock(&pVnode->vmutex);
return ready;
}
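// release the filter conditions attached to a column; for binary columns the variable-length
// filter payloads (pz) are freed as well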
void vnodeFreeColumnInfo(SColumnInfo* pColumnInfo) {
if (pColumnInfo == NULL) {
return;
}
if (pColumnInfo->numOfFilters > 0) {
if (pColumnInfo->type == TSDB_DATA_TYPE_BINARY) {
for (int32_t i = 0; i < pColumnInfo->numOfFilters; ++i) {
tfree(pColumnInfo->filters[i].pz);
pColumnInfo->filters[i].len = 0;
}
}
tfree(pColumnInfo->filters);
}
}