tsdbRead.c 182.3 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Haojun Liao 已提交
16
#include "osDef.h"
H
Hongze Cheng 已提交
17
#include "tsdb.h"
18
#include "tsimplehash.h"
19

H
Hongze Cheng 已提交
20
// True when the scan order `o` is ascending. The argument is parenthesized so that
// compound expressions (e.g. `a | b`) are compared as a whole.
#define ASCENDING_TRAVERSE(o) ((o) == TSDB_ORDER_ASC)
// Reads the `currentKey` member of the object `_r` points to.
#define getCurrentKeyInLastBlock(_r) ((_r)->currentKey)
H
Hongze Cheng 已提交
22

H
Haojun Liao 已提交
23
/* Reader status values (type of STsdbReader::flag); each value uses its own bit. */
typedef enum {
  READER_STATUS_SUSPEND = 1,
  READER_STATUS_NORMAL = 2,
} EReaderStatus;
H
Haojun Liao 已提交
27

28 29 30 31 32 33
/* Identifies which rows are being produced (type of STsdbReader::step): the rows
 * before the main range, the main range itself, or the rows after it. */
typedef enum {
  EXTERNAL_ROWS_PREV = 1,
  EXTERNAL_ROWS_MAIN = 2,
  EXTERNAL_ROWS_NEXT = 3,
} EContentData;

D
dapan1121 已提交
34 35 36 37 38
/* Read mode of a reader (type of STsdbReader::readMode). */
typedef enum {
  READ_MODE_COUNT_ONLY = 1,
  READ_MODE_ALL = 2,
} EReadMode;

39
typedef struct {
dengyihao's avatar
dengyihao 已提交
40
  STbDataIter* iter;
41 42 43 44
  int32_t      index;
  bool         hasVal;
} SIterInfo;

45 46
/* Pair of counters: number of blocks and number of last files. */
typedef struct {
  int32_t numOfBlocks;
  int32_t numOfLastFiles;
} SBlockNumber;

50
typedef struct SBlockIndex {
51 52
  int32_t     ordinalIndex;
  int64_t     inFileOffset;
H
Haojun Liao 已提交
53
  STimeWindow window;  // todo replace it with overlap flag.
54 55
} SBlockIndex;

H
Haojun Liao 已提交
56
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
57 58
  uint64_t  uid;
  TSKEY     lastKey;
59
  TSKEY     lastKeyInStt;       // last accessed key in stt
H
Hongze Cheng 已提交
60
  SMapData  mapData;            // block info (compressed)
61
  SArray*   pBlockList;         // block data index list, SArray<SBlockIndex>
H
Hongze Cheng 已提交
62 63 64 65 66 67
  SIterInfo iter;               // mem buffer skip list iterator
  SIterInfo iiter;              // imem buffer skip list iterator
  SArray*   delSkyline;         // delete info for this table
  int32_t   fileDelIndex;       // file block delete index
  int32_t   lastBlockDelIndex;  // delete index for last block
  bool      iterInit;           // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
68 69 70
} STableBlockScanInfo;

/* (uid, offset) pair; SBlockOrderSupporter holds arrays of these. */
typedef struct SBlockOrderWrapper {
  int64_t uid;
  int64_t offset;
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
74 75

typedef struct SBlockOrderSupporter {
76 77 78 79
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
80 81 82
} SBlockOrderSupporter;

/* Counters and elapsed-time accumulators describing the I/O cost of one reader. */
typedef struct SIOCostSummary {
  int64_t numOfBlocks;
  double  blockLoadTime;
  double  buildmemBlock;
  int64_t headFileLoad;
  double  headFileLoadTime;
  int64_t smaDataLoad;
  double  smaLoadTime;
  int64_t lastBlockLoad;
  double  lastBlockLoadTime;
  int64_t composedBlocks;
  double  buildComposedBlockTime;
  /* time spent creating the scan-info list, in milliseconds */
  double  createScanInfoList;
  double  initDelSkylineIterTime;
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
101 102 103 104 105 106 107
  SArray*        pColAgg;
  SColumnDataAgg tsColAgg;
  int16_t*       colId;
  int16_t*       slotId;
  int32_t        numOfCols;
  char**         buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
  bool           smaValid;  // the sma on all queried columns are activated
H
Hongze Cheng 已提交
108 109
} SBlockLoadSuppInfo;

110
typedef struct SLastBlockReader {
H
Hongze Cheng 已提交
111 112 113 114 115
  STimeWindow        window;
  SVersionRange      verRange;
  int32_t            order;
  uint64_t           uid;
  SMergeTree         mergeTree;
116
  SSttBlockLoadInfo* pInfo;
117
  int64_t            currentKey;
118 119
} SLastBlockReader;

120
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
121 122 123
  int32_t           numOfFiles;  // number of total files
  int32_t           index;       // current accessed index in the list
  SArray*           pFileList;   // data file list
124
  int32_t           order;
H
Hongze Cheng 已提交
125
  SLastBlockReader* pLastBlockReader;  // last file block reader
126
} SFilesetIter;
H
Haojun Liao 已提交
127 128

/* Locates one file data block: the owning table and an index position in that
 * table's STableBlockScanInfo, kept in order to check whether a neighbor block
 * overlaps with it. */
typedef struct SFileDataBlockInfo {
  uint64_t uid;
  int32_t  tbBlockIdx;
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
H
Haojun Liao 已提交
135 136 137 138 139 140
  int32_t    numOfBlocks;
  int32_t    index;
  SArray*    blockList;  // SArray<SFileDataBlockInfo>
  int32_t    order;
  SDataBlk   block;  // current SDataBlk data
  SSHashObj* pTableMap;
H
Haojun Liao 已提交
141 142 143
} SDataBlockIter;

/* Progress information for dumping one file block. */
typedef struct SFileBlockDumpInfo {
  int32_t totalRows;
  int32_t rowIndex;
  int64_t lastKey;
  bool    allDumped;
} SFileBlockDumpInfo;

150
/* Table uids that are visited in ascending uid order, plus the current position. */
typedef struct STableUidList {
  /* uid list, accessed in ascending uid order */
  uint64_t* tableUidList;
  /* index into tableUidList */
  int32_t   currentIndex;
} STableUidList;
154

H
Haojun Liao 已提交
155
typedef struct SReaderStatus {
H
Hongze Cheng 已提交
156 157
  bool                  loadFromFile;       // check file stage
  bool                  composedDataBlock;  // the returned data block is a composed block or not
158
  bool                  mapDataCleaned;     // mapData has been cleaned up alreay or not
H
Haojun Liao 已提交
159
  SSHashObj*            pTableMap;          // SHash<STableBlockScanInfo>
160
  STableBlockScanInfo** pTableIter;         // table iterator used in building in-memory buffer data blocks.
161
  STableUidList         uidList;            // check tables in uid order, to avoid the repeatly load of blocks in STT.
H
Hongze Cheng 已提交
162 163 164 165 166
  SFileBlockDumpInfo    fBlockDumpInfo;
  SDFileSet*            pCurrentFileset;  // current opened file set
  SBlockData            fileBlockData;
  SFilesetIter          fileIter;
  SDataBlockIter        blockIter;
167
  SLDataIter*           pLDataIter;
H
Haojun Liao 已提交
168
  SRowMerger            merger;
169
  SColumnInfoData*      pPrimaryTsCol;      // primary time stamp output col info data
H
Haojun Liao 已提交
170 171
} SReaderStatus;

172
typedef struct SBlockInfoBuf {
H
Hongze Cheng 已提交
173 174 175
  int32_t currentIndex;
  SArray* pData;
  int32_t numPerBucket;
D
dapan1121 已提交
176
  int32_t numOfTables;
177 178
} SBlockInfoBuf;

H
Haojun Liao 已提交
179 180 181 182 183 184 185
/* Attribute bundle for a reader: schema, read mode, time window, version range and order. */
typedef struct STsdbReaderAttr {
  STSchema*     pSchema;
  EReadMode     readMode;
  uint64_t      rowsNum;
  STimeWindow   window;
  bool          freeBlock;
  SVersionRange verRange;
  int16_t       order;
} STsdbReaderAttr;

189 190 191 192 193 194
/* Result block of a reader together with its ownership flag and row capacity. */
typedef struct SResultBlockInfo {
  SSDataBlock* pResBlock;
  /* set to true by initResBlockInfo() when it created pResBlock itself */
  bool         freeBlock;
  int64_t      capacity;
} SResultBlockInfo;

H
Hongze Cheng 已提交
195
struct STsdbReader {
H
Haojun Liao 已提交
196
  STsdb*             pTsdb;
197 198
  SVersionRange      verRange;
  TdThreadMutex      readerMutex;
H
Haojun Liao 已提交
199 200
  EReaderStatus      flag;
  int32_t            code;
H
Haojun Liao 已提交
201 202
  uint64_t           suid;
  int16_t            order;
D
dapan1121 已提交
203 204
  EReadMode          readMode;
  uint64_t           rowsNum;
H
Haojun Liao 已提交
205
  STimeWindow        window;  // the primary query time window that applies to all queries
206
  SResultBlockInfo   resBlockInfo;
H
Haojun Liao 已提交
207
  SReaderStatus      status;
208 209
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
210
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
211
  STsdbReadSnap*     pReadSnap;
212
  SIOCostSummary     cost;
213
  SHashObj**         pIgnoreTables;
H
Haojun Liao 已提交
214 215 216 217 218 219
  STSchema*          pSchema;      // the newest version schema
  SSHashObj*         pSchemaMap;   // keep the retrieved schema info, to avoid the overhead by repeatly load schema
  SDataFReader*      pFileReader;  // the file reader
  SDelFReader*       pDelFReader;  // the del file reader
  SArray*            pDelIdx;      // del file block index;
  SBlockInfoBuf      blockInfoBuf;
220
  EContentData       step;
H
Haojun Liao 已提交
221
  STsdbReader*       innerReader[2];
H
Hongze Cheng 已提交
222
};
H
Hongze Cheng 已提交
223

H
Haojun Liao 已提交
224
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
225 226
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
227
static TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
H
Haojun Liao 已提交
228
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
H
Hongze Cheng 已提交
229
static int32_t  doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
230
                                       SRowMerger* pMerger, SVersionRange* pVerRange, const char* id);
H
Haojun Liao 已提交
231
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, STsdbReader* pReader);
H
Hongze Cheng 已提交
232
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow,
H
Haojun Liao 已提交
233
                                     STableBlockScanInfo* pScanInfo);
234
static int32_t  doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
H
Hongze Cheng 已提交
235
                                         int32_t rowIndex);
236
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
237
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
H
Hongze Cheng 已提交
238
                               SVersionRange* pVerRange);
239

H
Hongze Cheng 已提交
240
static int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
H
Haojun Liao 已提交
241
                                        TSDBROW* pResRow, STsdbReader* pReader, bool* freeTSRow);
H
Hongze Cheng 已提交
242
static int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo,
H
Hongze Cheng 已提交
243
                                  STsdbReader* pReader, SRow** pTSRow);
244 245
static int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                                     STsdbReader* pReader);
246

dengyihao's avatar
dengyihao 已提交
247 248 249 250
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
251
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Hongze Cheng 已提交
252 253
static bool          hasDataInLastBlock(SLastBlockReader* pLastBlockReader);
static int32_t       doBuildDataBlock(STsdbReader* pReader);
C
Cary Xu 已提交
254
static TSDBKEY       getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
255
static bool          hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo);
256
static void          initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter);
257
static int32_t       getInitialDelIndex(const SArray* pDelSkyline, int32_t order);
C
Cary Xu 已提交
258

H
Haojun Liao 已提交
259
static STableBlockScanInfo* getTableBlockScanInfo(SSHashObj* pTableMap, uint64_t uid, const char* id);
H
Haojun Liao 已提交
260

C
Cary Xu 已提交
261
// Returns true when ts does not fall inside the closed range [pWindow->skey, pWindow->ekey].
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) {
  return !((ts <= pWindow->ekey) && (ts >= pWindow->skey));
}
H
Haojun Liao 已提交
262

263 264
// Fills pSupInfo with the column ids, slot ids and string build buffers of the queried columns.
//
// A single allocation is carved into three consecutive arrays:
//   colId[numOfCols] | slotId[numOfCols] | buildBuf[numOfCols]
// so only pSupInfo->colId owns the array memory; each non-NULL buildBuf[i] is a separate
// allocation of pCols[i].bytes bytes, made only for variable-length column types.
//
// Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when any allocation fails. On a
// buildBuf failure the arrays are still fully initialized (unallocated entries are NULL),
// so cleanup code can walk all numOfCols entries safely.
//
// NOTE(review): buildBuf starts 2 * sizeof(int16_t) * numOfCols bytes after colId, which is
// not pointer-aligned for every numOfCols if POINTER_BYTES is 8 - confirm this is acceptable
// on all supported targets.
static int32_t setColumnIdSlotList(SBlockLoadSuppInfo* pSupInfo, SColumnInfo* pCols, const int32_t* pSlotIdList,
                                   int32_t numOfCols) {
  pSupInfo->smaValid = true;
  pSupInfo->numOfCols = numOfCols;
  pSupInfo->colId = taosMemoryMalloc(numOfCols * (sizeof(int16_t) * 2 + POINTER_BYTES));
  if (pSupInfo->colId == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  pSupInfo->slotId = (int16_t*)((char*)pSupInfo->colId + (sizeof(int16_t) * numOfCols));
  pSupInfo->buildBuf = (char**)((char*)pSupInfo->slotId + (sizeof(int16_t) * numOfCols));

  // first pass: initialize every entry so that no slot is left holding garbage
  for (int32_t i = 0; i < numOfCols; ++i) {
    pSupInfo->colId[i] = pCols[i].colId;
    pSupInfo->slotId[i] = pSlotIdList[i];
    pSupInfo->buildBuf[i] = NULL;
  }

  // second pass: variable-length columns need a temporary build buffer
  for (int32_t i = 0; i < numOfCols; ++i) {
    if (!IS_VAR_DATA_TYPE(pCols[i].type)) {
      continue;
    }

    pSupInfo->buildBuf[i] = taosMemoryMalloc(pCols[i].bytes);
    if (pSupInfo->buildBuf[i] == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
288

H
Haojun Liao 已提交
289
// Walks the schema columns and the queried column ids in parallel and clears
// pSupInfo->smaValid as soon as a queried column does not have IS_BSMA_ON set.
//
// Returns TSDB_CODE_INVALID_PARA when a queried column id is smaller than the schema column
// it is compared against (i.e. it cannot be matched while advancing through the schema),
// otherwise TSDB_CODE_SUCCESS.
static int32_t updateBlockSMAInfo(STSchema* pSchema, SBlockLoadSuppInfo* pSupInfo) {
  int32_t schemaIdx = 0;
  int32_t queryIdx = 0;

  while (schemaIdx < pSchema->numOfCols && queryIdx < pSupInfo->numOfCols) {
    STColumn* pSchemaCol = &pSchema->columns[schemaIdx];

    if (pSchemaCol->colId > pSupInfo->colId[queryIdx]) {
      return TSDB_CODE_INVALID_PARA;
    }

    if (pSchemaCol->colId < pSupInfo->colId[queryIdx]) {
      // schema column is not queried, skip it
      schemaIdx += 1;
      continue;
    }

    // the ids match: this queried column exists in the schema
    if (!IS_BSMA_ON(pSchemaCol)) {
      pSupInfo->smaValid = false;
      return TSDB_CODE_SUCCESS;
    }

    schemaIdx += 1;
    queryIdx += 1;
  }

  return TSDB_CODE_SUCCESS;
}

313
// Allocates zeroed STableBlockScanInfo slots for numOfTables tables. The slots are grouped in
// buckets of pBuf->numPerBucket entries; only the last bucket may be smaller. The bucket
// array pBuf->pData is created on demand.
//
// Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when the bucket array or a bucket
// cannot be allocated. Buckets pushed before a failure stay in pBuf->pData.
static int32_t initBlockScanInfoBuf(SBlockInfoBuf* pBuf, int32_t numOfTables) {
  int32_t num = numOfTables / pBuf->numPerBucket;
  int32_t remainder = numOfTables % pBuf->numPerBucket;
  if (pBuf->pData == NULL) {
    pBuf->pData = taosArrayInit(num + 1, POINTER_BYTES);
    if (pBuf->pData == NULL) {
      // without the bucket array there is nowhere to store the buckets
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  // full buckets
  for (int32_t i = 0; i < num; ++i) {
    char* p = taosMemoryCalloc(pBuf->numPerBucket, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    taosArrayPush(pBuf->pData, &p);
  }

  // trailing, partially filled bucket
  if (remainder > 0) {
    char* p = taosMemoryCalloc(remainder, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
    taosArrayPush(pBuf->pData, &p);
  }

  pBuf->numOfTables = numOfTables;

  return TSDB_CODE_SUCCESS;
}

// Grows the bucketed scan-info buffer so that it provides at least numOfTables slots.
//
// The last existing bucket is released first, because it may be only partially sized and
// every bucket except the final one must hold exactly numPerBucket slots for
// getPosInBlockInfoBuf() to address them. Slots stored in that released bucket are lost.
// New buckets are then appended to cover the difference.
//
// Returns TSDB_CODE_SUCCESS (also when the buffer is already large enough), or
// TSDB_CODE_OUT_OF_MEMORY when an allocation fails.
static int32_t ensureBlockScanInfoBuf(SBlockInfoBuf* pBuf, int32_t numOfTables) {
  if (numOfTables <= pBuf->numOfTables) {
    return TSDB_CODE_SUCCESS;
  }

  if (pBuf->numOfTables > 0) {
    STableBlockScanInfo** p = (STableBlockScanInfo**)taosArrayPop(pBuf->pData);
    taosMemoryFree(*p);

    // The released bucket held `tail` slots when it was partial, or a whole bucket when the
    // table count was an exact multiple of the bucket size. What remains is a number of
    // tables (not a number of buckets), which the size computation below relies on.
    int32_t tail = pBuf->numOfTables % pBuf->numPerBucket;
    pBuf->numOfTables -= (tail > 0) ? tail : pBuf->numPerBucket;
  }

  int32_t num = (numOfTables - pBuf->numOfTables) / pBuf->numPerBucket;
  int32_t remainder = (numOfTables - pBuf->numOfTables) % pBuf->numPerBucket;
  if (pBuf->pData == NULL) {
    pBuf->pData = taosArrayInit(num + 1, POINTER_BYTES);
    if (pBuf->pData == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  // full buckets
  for (int32_t i = 0; i < num; ++i) {
    char* p = taosMemoryCalloc(pBuf->numPerBucket, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    taosArrayPush(pBuf->pData, &p);
  }

  // trailing, partially filled bucket
  if (remainder > 0) {
    char* p = taosMemoryCalloc(remainder, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
    taosArrayPush(pBuf->pData, &p);
  }

  pBuf->numOfTables = numOfTables;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
380

381 382
// Releases every bucket stored in pBuf->pData and then the bucket array itself.
static void clearBlockScanInfoBuf(SBlockInfoBuf* pBuf) {
  size_t numOfBuckets = taosArrayGetSize(pBuf->pData);

  for (int32_t bucketIdx = 0; bucketIdx < numOfBuckets; ++bucketIdx) {
    char** ppBucket = taosArrayGet(pBuf->pData, bucketIdx);
    taosMemoryFree(*ppBucket);
  }

  taosArrayDestroy(pBuf->pData);
}

// Returns the address of slot `index` in the bucketed buffer: bucket index / numPerBucket,
// entry index % numPerBucket. No bounds checking is performed.
static void* getPosInBlockInfoBuf(SBlockInfoBuf* pBuf, int32_t index) {
  int32_t bucketIndex = index / pBuf->numPerBucket;
  int32_t slotInBucket = index % pBuf->numPerBucket;

  char** ppBucket = taosArrayGet(pBuf->pData, bucketIndex);
  return (*ppBucket) + slotInBucket * sizeof(STableBlockScanInfo);
}

H
Haojun Liao 已提交
397 398 399 400 401 402 403 404 405 406
// Three-way comparison of two uint64_t values referenced by p1 and p2:
// returns -1, 0 or 1 when *p1 is less than, equal to or greater than *p2.
static int32_t uidComparFunc(const void* p1, const void* p2) {
  const uint64_t lhs = *(const uint64_t*)p1;
  const uint64_t rhs = *(const uint64_t*)p2;

  if (lhs < rhs) {
    return -1;
  }

  return (lhs > rhs) ? 1 : 0;
}

407
// NOTE: speedup the whole processing by preparing the buffer for STableBlockScanInfo in batch model
H
Haojun Liao 已提交
408
static SSHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, SBlockInfoBuf* pBuf, const STableKeyInfo* idList,
X
Xiaoyu Wang 已提交
409
                                         STableUidList* pUidList, int32_t numOfTables) {
H
Haojun Liao 已提交
410
  // allocate buffer in order to load data blocks from file
411
  // todo use simple hash instead, optimize the memory consumption
H
Haojun Liao 已提交
412
  SSHashObj* pTableMap = tSimpleHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT));
413
  if (pTableMap == NULL) {
H
Haojun Liao 已提交
414 415 416
    return NULL;
  }

H
Haojun Liao 已提交
417
  int64_t st = taosGetTimestampUs();
H
Haojun Liao 已提交
418
  initBlockScanInfoBuf(pBuf, numOfTables);
H
Haojun Liao 已提交
419

H
Haojun Liao 已提交
420 421
  pUidList->tableUidList = taosMemoryMalloc(numOfTables * sizeof(uint64_t));
  if (pUidList->tableUidList == NULL) {
H
Haojun Liao 已提交
422
    tSimpleHashCleanup(pTableMap);
H
Haojun Liao 已提交
423 424
    return NULL;
  }
H
Haojun Liao 已提交
425

H
Haojun Liao 已提交
426
  pUidList->currentIndex = 0;
H
Haojun Liao 已提交
427

428
  for (int32_t j = 0; j < numOfTables; ++j) {
H
Haojun Liao 已提交
429
    STableBlockScanInfo* pScanInfo = getPosInBlockInfoBuf(pBuf, j);
H
Haojun Liao 已提交
430

431
    pScanInfo->uid = idList[j].uid;
H
Haojun Liao 已提交
432
    pUidList->tableUidList[j] = idList[j].uid;
H
Haojun Liao 已提交
433

434
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
H
Haojun Liao 已提交
435
      int64_t skey = pTsdbReader->window.skey;
436
      pScanInfo->lastKey = (skey > INT64_MIN) ? (skey - 1) : skey;
H
Haojun Liao 已提交
437
      pScanInfo->lastKeyInStt = skey;
wmmhello's avatar
wmmhello 已提交
438
    } else {
H
Haojun Liao 已提交
439
      int64_t ekey = pTsdbReader->window.ekey;
440
      pScanInfo->lastKey = (ekey < INT64_MAX) ? (ekey + 1) : ekey;
H
Haojun Liao 已提交
441
      pScanInfo->lastKeyInStt = ekey;
H
Haojun Liao 已提交
442
    }
wmmhello's avatar
wmmhello 已提交
443

H
Haojun Liao 已提交
444
    tSimpleHashPut(pTableMap, &pScanInfo->uid, sizeof(uint64_t), &pScanInfo, POINTER_BYTES);
H
Hongze Cheng 已提交
445 446
    tsdbTrace("%p check table uid:%" PRId64 " from lastKey:%" PRId64 " %s", pTsdbReader, pScanInfo->uid,
              pScanInfo->lastKey, pTsdbReader->idStr);
H
Haojun Liao 已提交
447 448
  }

H
Haojun Liao 已提交
449
  taosSort(pUidList->tableUidList, numOfTables, sizeof(uint64_t), uidComparFunc);
H
Haojun Liao 已提交
450

H
Haojun Liao 已提交
451 452 453 454
  pTsdbReader->cost.createScanInfoList = (taosGetTimestampUs() - st) / 1000.0;
  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, elapsed time:%.2f ms, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->cost.createScanInfoList,
            pTsdbReader->idStr);
455

456
  return pTableMap;
H
Hongze Cheng 已提交
457
}
H
Hongze Cheng 已提交
458

H
Haojun Liao 已提交
459 460 461 462 463
// Resets the scan state of every table in pTableMap: both in-memory iterators and the delete
// skyline are released, lastKey becomes ts and lastKeyInStt becomes ts + step.
static void resetAllDataBlockScanInfo(SSHashObj* pTableMap, int64_t ts, int32_t step) {
  int32_t iter = 0;

  for (void* pEntry = tSimpleHashIterate(pTableMap, NULL, &iter); pEntry != NULL;
       pEntry = tSimpleHashIterate(pTableMap, pEntry, &iter)) {
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pEntry;

    pScanInfo->iterInit = false;

    // mem buffer iterator
    pScanInfo->iter.hasVal = false;
    // imem buffer iterator
    pScanInfo->iiter.hasVal = false;

    if (pScanInfo->iter.iter != NULL) {
      pScanInfo->iter.iter = tsdbTbDataIterDestroy(pScanInfo->iter.iter);
    }

    if (pScanInfo->iiter.iter != NULL) {
      pScanInfo->iiter.iter = tsdbTbDataIterDestroy(pScanInfo->iiter.iter);
    }

    pScanInfo->delSkyline = taosArrayDestroy(pScanInfo->delSkyline);

    pScanInfo->lastKey = ts;
    pScanInfo->lastKeyInStt = ts + step;
  }
}

484 485
// Releases everything owned by one table scan info: both in-memory iterators, the delete
// skyline, the block index list and the block map data. The object itself is not freed.
static void clearBlockScanInfo(STableBlockScanInfo* p) {
  p->iterInit = false;
  p->iter.hasVal = false;
  p->iiter.hasVal = false;

  // mem buffer iterator
  if (p->iter.iter != NULL) {
    p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
  }

  // imem buffer iterator
  if (p->iiter.iter != NULL) {
    p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
  }

  p->delSkyline = taosArrayDestroy(p->delSkyline);
  p->pBlockList = taosArrayDestroy(p->pBlockList);
  tMapDataClear(&p->mapData);
}
502

H
Haojun Liao 已提交
503
// Clears every scan info referenced by pTableMap and then destroys the map itself.
static void destroyAllBlockScanInfo(SSHashObj* pTableMap) {
  int32_t iter = 0;

  for (void* pEntry = tSimpleHashIterate(pTableMap, NULL, &iter); pEntry != NULL;
       pEntry = tSimpleHashIterate(pTableMap, pEntry, &iter)) {
    clearBlockScanInfo(*(STableBlockScanInfo**)pEntry);
  }

  tSimpleHashCleanup(pTableMap);
}

514
// A window is empty when its end key lies before its start key.
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) {
  return pWindow->ekey < pWindow->skey;
}
H
Hongze Cheng 已提交
515

516 517 518
// Clamps the start of the query time window to the earliest timestamp that is still kept,
// derived from the current time and the keep2 setting of pTsdb->keepCfg, so that expired
// data is not returned to the client. The input window is not modified; the adjusted copy
// is returned.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
  STsdbKeepCfg* pKeepCfg = &pTsdb->keepCfg;

  int64_t now = taosGetTimestamp(pKeepCfg->precision);
  // one tick is added to the computed boundary
  int64_t earliestTs = now - (tsTickPerMin[pKeepCfg->precision] * pKeepCfg->keep2) + 1;

  STimeWindow win = *pWindow;
  win.skey = (win.skey < earliestTs) ? earliestTs : win.skey;
  return win;
}
H
Hongze Cheng 已提交
531

H
Haojun Liao 已提交
532
// init file iterator
533
// Prepares pIter for walking the file sets in aDFileSet in the reader's scan order.
//
// The cursor starts one position before the first file set to visit (-1 for ascending,
// the list size for descending). The last block reader and its load info are created on
// first use and reconfigured from the reader on every call.
//
// Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY when the last block reader cannot be
// allocated, or terrno when the last block load info cannot be created.
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet, STsdbReader* pReader) {
  size_t numOfFileset = taosArrayGetSize(aDFileSet);

  pIter->order = pReader->order;
  pIter->pFileList = aDFileSet;
  pIter->numOfFiles = numOfFileset;
  if (ASCENDING_TRAVERSE(pReader->order)) {
    pIter->index = -1;
  } else {
    pIter->index = numOfFileset;
  }

  if (pIter->pLastBlockReader == NULL) {
    pIter->pLastBlockReader = taosMemoryCalloc(1, sizeof(struct SLastBlockReader));
    if (pIter->pLastBlockReader == NULL) {
      int32_t code = TSDB_CODE_OUT_OF_MEMORY;
      tsdbError("failed to prepare the last block iterator, since:%s %s", tstrerror(code), pReader->idStr);
      return code;
    }
  }

  SLastBlockReader* pLReader = pIter->pLastBlockReader;
  pLReader->uid = 0;
  pLReader->order = pReader->order;
  pLReader->window = pReader->window;
  pLReader->verRange = pReader->verRange;
  tMergeTreeClose(&pLReader->mergeTree);

  if (pLReader->pInfo != NULL) {
    tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }

  // the first column is skipped here, since it is always the primary timestamp column
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  int32_t             numOfStt = pReader->pTsdb->pVnode->config.sttTrigger;

  pLReader->pInfo = tCreateLastBlockLoadInfo(pReader->pSchema, &pSupInfo->colId[1], pSupInfo->numOfCols - 1, numOfStt);
  if (pLReader->pInfo == NULL) {
    tsdbDebug("init fileset iterator failed, code:%s %s", tstrerror(terrno), pReader->idStr);
    return terrno;
  }

  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
574
// Advances pIter to the next file set whose key range overlaps the query window and opens
// it through pReader->pFileReader.
//
// *hasNext is set to true only when such a file set has been opened. It is false when the
// list is exhausted, when the remaining file sets lie beyond the query window, or when
// opening a file set fails.
//
// Returns TSDB_CODE_SUCCESS, or the error code reported by tsdbDataFReaderOpen().
static int32_t filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader, bool* hasNext) {
  const bool    asc = ASCENDING_TRAVERSE(pIter->order);
  const int32_t step = asc ? 1 : -1;

  *hasNext = false;

  pIter->index += step;
  if (asc ? (pIter->index >= pIter->numOfFiles) : (pIter->index < 0)) {
    return TSDB_CODE_SUCCESS;
  }

  // collect the last block load statistics before the last block reader is reset
  getLastBlockLoadInfo(pIter->pLastBlockReader->pInfo, &pReader->cost.lastBlockLoad,
                       &pReader->cost.lastBlockLoadTime);

  pIter->pLastBlockReader->uid = 0;
  tMergeTreeClose(&pIter->pLastBlockReader->mergeTree);
  resetLastBlockLoadInfo(pIter->pLastBlockReader->pInfo);

  // key range covered by the file set under inspection
  STimeWindow win = {0};

  for (;;) {
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }

    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);

    int32_t code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    pReader->cost.headFileLoad += 1;

    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);

    // this file set already lies past the query window in scan direction, so do the remaining ones
    bool pastWindow = asc ? (win.skey > pReader->window.ekey) : (win.ekey < pReader->window.skey);
    if (pastWindow) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return TSDB_CODE_SUCCESS;
    }

    // this file set has not reached the query window yet
    bool aheadOfWindow = asc ? (win.ekey < pReader->window.skey) : (win.skey > pReader->window.ekey);
    if (!aheadOfWindow) {
      tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
                pReader->window.ekey, pReader->idStr);
      *hasNext = true;
      return TSDB_CODE_SUCCESS;
    }

    pIter->index += step;
    if (asc ? (pIter->index >= pIter->numOfFiles) : (pIter->index < 0)) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

640
// Puts the block iterator back to its initial state for the given order: no blocks, cursor
// at -1, and an empty block list (created on first use, otherwise cleared).
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
  pIter->numOfBlocks = 0;
  pIter->index = -1;
  pIter->order = order;

  if (pIter->blockList != NULL) {
    taosArrayClear(pIter->blockList);
    return;
  }

  pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
}

L
Liu Jicong 已提交
651
// Destroys the block list held by the iterator.
static void cleanupDataBlockIterator(SDataBlockIter* pIter) {
  taosArrayDestroy(pIter->blockList);
}
H
Haojun Liao 已提交
652

H
Haojun Liao 已提交
653
// Initial reader status: loadFromFile is set and there is no current table iterator.
static void initReaderStatus(SReaderStatus* pStatus) {
  pStatus->loadFromFile = true;
  pStatus->pTableIter = NULL;
}

658 659 660 661 662 663 664 665
// Creates a result block with one column per entry of pCond->colList and room for
// `capacity` rows.
//
// Returns the new block, or NULL with terrno set when the block cannot be created or its
// capacity cannot be reserved.
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pBlock = createDataBlock();
  if (pBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  int32_t colIdx = 0;
  while (colIdx < pCond->numOfCols) {
    SColumnInfoData colData = {0};
    colData.info = pCond->colList[colIdx];
    blockDataAppendColInfo(pBlock, &colData);
    colIdx += 1;
  }

  int32_t code = blockDataEnsureCapacity(pBlock, capacity);
  if (code == TSDB_CODE_SUCCESS) {
    return pBlock;
  }

  terrno = code;
  // NOTE(review): only the block object itself is freed here; presumably the column info
  // appended above needs the block's dedicated destroy routine - confirm and switch if so.
  taosMemoryFree(pBlock);
  return NULL;
}

680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
// Initializes the reader mutex and traces the call; returns the result of
// taosThreadMutexInit().
static int32_t tsdbInitReaderLock(STsdbReader* pReader) {
  int32_t rc = -1;  // placeholder value reported by the trace before the call

  qTrace("tsdb/read: %p, pre-init read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);
  rc = taosThreadMutexInit(&pReader->readerMutex, NULL);
  qTrace("tsdb/read: %p, post-init read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Destroys the reader mutex and traces the call; returns the result of
// taosThreadMutexDestroy().
static int32_t tsdbUninitReaderLock(STsdbReader* pReader) {
  int32_t rc = -1;  // placeholder value reported by the trace before the call

  qTrace("tsdb/read: %p, pre-uninit read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);
  rc = taosThreadMutexDestroy(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-uninit read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Locks the reader mutex and traces the call; returns the result of taosThreadMutexLock().
static int32_t tsdbAcquireReader(STsdbReader* pReader) {
  int32_t rc = -1;  // placeholder value reported by the trace before the call

  qTrace("tsdb/read: %p, pre-take read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);
  rc = taosThreadMutexLock(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-take read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Tries to lock the reader mutex and traces the call; returns the result of
// taosThreadMutexTryLock().
static int32_t tsdbTryAcquireReader(STsdbReader* pReader) {
  int32_t rc = -1;  // placeholder value reported by the trace before the call

  qTrace("tsdb/read: %p, pre-trytake read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);
  rc = taosThreadMutexTryLock(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-trytake read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Unlocks the reader mutex and traces the call; returns the result of
// taosThreadMutexUnlock().
static int32_t tsdbReleaseReader(STsdbReader* pReader) {
  int32_t rc = -1;  // placeholder value reported by the trace before the call

  qTrace("tsdb/read: %p, pre-untake read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);
  rc = taosThreadMutexUnlock(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-untake read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

735 736 737 738 739 740
// Release the reader mutex unless status.composedDataBlock is set, in which case nothing is done.
// The result of tsdbReleaseReader() is not propagated (this function returns void).
void tsdbReleaseDataBlock(STsdbReader* pReader) {
  if (pReader->status.composedDataBlock) {
    return;
  }

  tsdbReleaseReader(pReader);
}
741

742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
// Bind the output block used by the reader.
//
// pResBlockInfo: destination descriptor; capacity and pResBlock are always overwritten.
// capacity:      row capacity recorded in the descriptor and passed to createResBlock().
// pResBlock:     caller-owned block, or NULL to have one created here.
// pCond:         query condition forwarded to createResBlock().
//
// freeBlock records whether the block was created here (true) or supplied by the caller (false).
// terrno is reset to 0 on entry. Returns 0 on success; on failure returns terrno, or
// TSDB_CODE_OUT_OF_MEMORY when block creation failed without setting terrno, so that a NULL block
// is never reported as success.
static int32_t initResBlockInfo(SResultBlockInfo* pResBlockInfo, int64_t capacity, SSDataBlock* pResBlock, SQueryTableDataCond* pCond) {
  pResBlockInfo->capacity = capacity;
  pResBlockInfo->pResBlock = pResBlock;
  terrno = 0;

  if (pResBlockInfo->pResBlock == NULL) {
    pResBlockInfo->freeBlock = true;
    pResBlockInfo->pResBlock = createResBlock(pCond, pResBlockInfo->capacity);
    if (pResBlockInfo->pResBlock == NULL) {
      return (terrno != 0) ? terrno : TSDB_CODE_OUT_OF_MEMORY;
    }
  } else {
    pResBlockInfo->freeBlock = false;
  }

  return terrno;
}

757
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, STsdbReader** ppReader, int32_t capacity,
H
Haojun Liao 已提交
758
                                SSDataBlock* pResBlock, const char* idstr) {
H
Haojun Liao 已提交
759
  int32_t      code = 0;
760
  int8_t       level = 0;
H
Haojun Liao 已提交
761
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
H
Hongze Cheng 已提交
762 763
  if (pReader == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
764
    goto _end;
H
Hongze Cheng 已提交
765 766
  }

C
Cary Xu 已提交
767
  if (VND_IS_TSMA(pVnode)) {
K
kailixu 已提交
768
    tsdbDebug("vgId:%d, tsma is selected to query, %s", TD_VID(pVnode), idstr);
C
Cary Xu 已提交
769 770
  }

H
Haojun Liao 已提交
771
  initReaderStatus(&pReader->status);
772

L
Liu Jicong 已提交
773
  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
dengyihao's avatar
dengyihao 已提交
774 775
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
776

777
  pReader->idStr = (idstr != NULL) ? taosStrdup(idstr) : NULL;
dengyihao's avatar
dengyihao 已提交
778
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
779
  pReader->type = pCond->type;
780
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
H
Hongze Cheng 已提交
781
  pReader->blockInfoBuf.numPerBucket = 1000;  // 1000 tables per bucket
H
Hongze Cheng 已提交
782

783 784 785
  code = initResBlockInfo(&pReader->resBlockInfo, capacity, pResBlock, pCond);
  if (code != TSDB_CODE_SUCCESS) {
    goto _end;
H
Haojun Liao 已提交
786
  }
787

H
Haojun Liao 已提交
788 789 790 791 792
  if (pCond->numOfCols <= 0) {
    tsdbError("vgId:%d, invalid column number %d in query cond, %s", TD_VID(pVnode), pCond->numOfCols, idstr);
    code = TSDB_CODE_INVALID_PARA;
    goto _end;
  }
H
Hongze Cheng 已提交
793

794 795
  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
796
  pSup->pColAgg = taosArrayInit(pCond->numOfCols, sizeof(SColumnDataAgg));
H
Haojun Liao 已提交
797
  if (pSup->pColAgg == NULL) {
798 799 800
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }
H
Haojun Liao 已提交
801

802
  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
803 804
  setColumnIdSlotList(pSup, pCond->colList, pCond->pSlotList, pCond->numOfCols);

H
Hongze Cheng 已提交
805
  code = tBlockDataCreate(&pReader->status.fileBlockData);
H
Haojun Liao 已提交
806 807 808 809 810
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

811
  if (pReader->suppInfo.colId[0] != PRIMARYKEY_TIMESTAMP_COL_ID) {
812
    tsdbError("the first column isn't primary timestamp, %d, %s", pReader->suppInfo.colId[0], pReader->idStr);
K
kailixu 已提交
813
    code = TSDB_CODE_INVALID_PARA;
814 815 816
    goto _end;
  }

817
  pReader->status.pPrimaryTsCol = taosArrayGet(pReader->resBlockInfo.pResBlock->pDataBlock, pSup->slotId[0]);
818 819 820 821
  int32_t type = pReader->status.pPrimaryTsCol->info.type;
  if (type != TSDB_DATA_TYPE_TIMESTAMP) {
    tsdbError("the first column isn't primary timestamp in result block, actual: %s, %s", tDataTypes[type].name,
              pReader->idStr);
K
kailixu 已提交
822
    code = TSDB_CODE_INVALID_PARA;
823 824
    goto _end;
  }
825

826
  tsdbInitReaderLock(pReader);
827

H
Hongze Cheng 已提交
828 829
  *ppReader = pReader;
  return code;
H
Hongze Cheng 已提交
830

H
Haojun Liao 已提交
831 832
_end:
  tsdbReaderClose(pReader);
H
Hongze Cheng 已提交
833 834 835
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
836

H
Haojun Liao 已提交
837
// Collect the block-index entries of the current file that belong to the tables being scanned.
//
// pReader:     reader whose status.pTableMap / status.uidList describe the scanned tables.
// pFileReader: file reader whose block-index array is fetched through the biCache.
// pIndexList:  output; receives a copy of every matching SBlockIdx.
//
// The block-index array and pReader->status.uidList are walked with two cursors; this only finds
// all matches if both are ordered by uid (presumably guaranteed by their producers -- confirm).
// Returns TSDB_CODE_SUCCESS (also when the cache yields no handle or an empty array), the cache
// error code, terrno when a scan-info lookup fails, or TSDB_CODE_OUT_OF_MEMORY.
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
  int64_t    st = taosGetTimestampUs();
  LRUHandle* handle = NULL;
  int32_t    code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
  if (code != TSDB_CODE_SUCCESS || handle == NULL) {
    goto _end;
  }

  int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);

  SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
  size_t  num = taosArrayGetSize(aBlockIdx);
  if (num == 0) {
    tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
    return TSDB_CODE_SUCCESS;
  }

  // todo binary search to the start position
  int64_t et1 = taosGetTimestampUs();

  SBlockIdx*     pBlockIdx = NULL;
  STableUidList* pList = &pReader->status.uidList;

  int32_t i = 0, j = 0;
  while (i < num && j < numOfTables) {
    pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);

    // entry of another super table, or of a table ordered before the current queried one: skip it
    if (pBlockIdx->suid != pReader->suid || pBlockIdx->uid < pList->tableUidList[j]) {
      i += 1;
      continue;
    }

    // the current queried table has no entry in this file: move to the next queried table
    if (pBlockIdx->uid > pList->tableUidList[j]) {
      j += 1;
      continue;
    }

    // uid matches: this entry belongs to a queried table
    STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockIdx->uid, pReader->idStr);
    if (pScanInfo == NULL) {
      code = terrno;
      goto _end;
    }

    if (pScanInfo->pBlockList == NULL) {
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(SBlockIndex));
      if (pScanInfo->pBlockList == NULL) {
        code = TSDB_CODE_OUT_OF_MEMORY;
        goto _end;
      }
    }

    if (taosArrayPush(pIndexList, pBlockIdx) == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _end;
    }

    i += 1;
    j += 1;
  }

  int64_t et2 = taosGetTimestampUs();
  tsdbDebug("load block index for %d/%d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
            numOfTables, (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0,
            pReader->idStr);

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

_end:
  // every exit that reaches this label gives the cache handle back (handle may still be NULL here)
  tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
  return code;
}
H
Hongze Cheng 已提交
908

909 910 911 912 913 914 915 916 917 918 919 920
// Discard the per-file state of one table: empty its block list and clear its block map data,
// so that nothing from the previous file is carried over when a new file is handled.
static void doCleanupTableScanInfo(STableBlockScanInfo* pScanInfo) {
  taosArrayClear(pScanInfo->pBlockList);
  tMapDataClear(&pScanInfo->mapData);
}

// Run doCleanupTableScanInfo() on every entry of pStatus->pTableMap, then set mapDataCleaned.
// Does nothing when mapDataCleaned is already set.
static void cleanupTableScanInfo(SReaderStatus* pStatus) {
  if (pStatus->mapDataCleaned) {
    return;
  }

  int32_t               iter = 0;
  STableBlockScanInfo** ppScanInfo = NULL;

  // tSimpleHashIterate() yields NULL once the map is exhausted
  while ((ppScanInfo = tSimpleHashIterate(pStatus->pTableMap, ppScanInfo, &iter)) != NULL) {
    doCleanupTableScanInfo(*ppScanInfo);
  }

  pStatus->mapDataCleaned = true;
}

936
// Load the data-block descriptors of every table in pIndexList and keep those that can contain
// rows for this query.
//
// pReader:            reader; its window, verRange, order and pFileReader are used.
// pIndexList:         SBlockIdx entries selected for the current file.
// pBlockNum:          numOfBlocks is incremented per accepted block; numOfLastFiles is set from the
//                     file set's nSttF.
// pTableScanInfoList: receives a pointer to each table scan info that ended up with >= 1 block.
//
// A block is accepted when it overlaps the remaining time window (the query window clipped at the
// table's lastKey in scan direction) and the reader's version range.
// Returns TSDB_CODE_SUCCESS, terrno when a scan-info lookup fails, the error from
// tsdbReadDataBlk(), or TSDB_CODE_OUT_OF_MEMORY.
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, SBlockNumber* pBlockNum, SArray* pTableScanInfoList) {
  size_t  sizeInDisk = 0;
  size_t  numOfTables = taosArrayGetSize(pIndexList);

  int64_t st = taosGetTimestampUs();
  cleanupTableScanInfo(&pReader->status);

  // set the flag for the new file
  pReader->status.mapDataCleaned = false;
  for (int32_t i = 0; i < numOfTables; ++i) {
    SBlockIdx*           pBlockIdx = taosArrayGet(pIndexList, i);
    STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockIdx->uid, pReader->idStr);
    if (pScanInfo == NULL) {
      return terrno;
    }

    tMapDataReset(&pScanInfo->mapData);

    // a failed read must not be treated as an (empty or partial) block map
    int32_t code = tsdbReadDataBlk(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("failed to load data block info, code:%s, %s", tstrerror(code), pReader->idStr);
      return code;
    }

    taosArrayEnsureCap(pScanInfo->pBlockList, pScanInfo->mapData.nItem);

    sizeInDisk += pScanInfo->mapData.nData;

    // only the part of the query window beyond the table's lastKey still has to be scanned
    int32_t     step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
    STimeWindow w = pReader->window;
    if (ASCENDING_TRAVERSE(pReader->order)) {
      w.skey = pScanInfo->lastKey + step;
    } else {
      w.ekey = pScanInfo->lastKey + step;
    }

    if (isEmptyQueryTimeWindow(&w)) {
      continue;
    }

    SDataBlk block = {0};
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      tGetDataBlk(pScanInfo->mapData.pData + pScanInfo->mapData.aOffset[j], &block);

      // 1. time range check
      if (block.minKey.ts > w.ekey || block.maxKey.ts < w.skey) {
        continue;
      }

      // 2. version range check
      if (block.minVer > pReader->verRange.maxVer || block.maxVer < pReader->verRange.minVer) {
        continue;
      }

      SBlockIndex bIndex = {.ordinalIndex = j, .inFileOffset = block.aSubBlock->offset};
      bIndex.window = (STimeWindow){.skey = block.minKey.ts, .ekey = block.maxKey.ts};

      void* p1 = taosArrayPush(pScanInfo->pBlockList, &bIndex);
      if (p1 == NULL) {
        tMapDataClear(&pScanInfo->mapData);
        return TSDB_CODE_OUT_OF_MEMORY;
      }

      pBlockNum->numOfBlocks += 1;
    }

    if (taosArrayGetSize(pScanInfo->pBlockList) > 0) {
      if (taosArrayPush(pTableScanInfoList, &pScanInfo) == NULL) {
        tMapDataClear(&pScanInfo->mapData);
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }
  }

  pBlockNum->numOfLastFiles = pReader->pFileReader->pSet->nSttF;
  int32_t total = pBlockNum->numOfLastFiles + pBlockNum->numOfBlocks;

  double el = (taosGetTimestampUs() - st) / 1000.0;
  // numOfTables is a size_t; cast it explicitly so the argument matches the conversion specifier
  tsdbDebug(
      "load block of %d tables completed, blocks:%d in %d tables, last-files:%d, block-info-size:%.2f Kb, elapsed "
      "time:%.2f ms %s",
      (int32_t)numOfTables, pBlockNum->numOfBlocks, (int32_t)taosArrayGetSize(pTableScanInfoList),
      pBlockNum->numOfLastFiles, sizeInDisk / 1000.0, el, pReader->idStr);

  pReader->cost.numOfBlocks += total;
  pReader->cost.headFileLoadTime += el;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1017

1018
// Flag the file block as completely dumped and store lastKey one step beyond maxKey:
// maxKey + 1 for an ascending scan, maxKey - 1 otherwise.
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, int64_t maxKey, int32_t order) {
  pDumpInfo->allDumped = true;
  pDumpInfo->lastKey = ASCENDING_TRAVERSE(order) ? (maxKey + 1) : (maxKey - 1);
}

D
dapan1121 已提交
1024
static int32_t doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
1025
                            SBlockLoadSuppInfo* pSup) {
H
Haojun Liao 已提交
1026
  if (IS_VAR_DATA_TYPE(pColVal->type)) {
H
Hongze Cheng 已提交
1027
    if (!COL_VAL_IS_VALUE(pColVal)) {
1028
      colDataSetNULL(pColInfoData, rowIndex);
H
Haojun Liao 已提交
1029 1030
    } else {
      varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
D
dapan1121 已提交
1031
      if (pColVal->value.nData > pColInfoData->info.bytes) {
1032 1033
        tsdbWarn("column cid:%d actual data len %d is bigger than schema len %d", pColVal->cid, pColVal->value.nData,
                 pColInfoData->info.bytes);
D
dapan1121 已提交
1034 1035
        return TSDB_CODE_TDB_INVALID_TABLE_SCHEMA_VER;
      }
1036 1037 1038 1039
      if (pColVal->value.nData > 0) {  // pData may be null, if nData is 0
        memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
      }

1040
      colDataSetVal(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
H
Haojun Liao 已提交
1041 1042
    }
  } else {
1043
    colDataSetVal(pColInfoData, rowIndex, (const char*)&pColVal->value, !COL_VAL_IS_VALUE(pColVal));
H
Haojun Liao 已提交
1044
  }
D
dapan1121 已提交
1045 1046

  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
1047 1048
}

1049
// Return the block-info entry at pBlockIter->index, or NULL when the iterator's block list is empty.
// NOTE(review): index is not range-checked here; callers appear to rely on it being valid.
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
  size_t num = taosArrayGetSize(pBlockIter->blockList);
  if (num != 0) {
    return (SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, pBlockIter->index);
  }

  // an empty list must agree with the iterator's own block counter
  ASSERT(pBlockIter->numOfBlocks == num);
  return NULL;
}

H
Hongze Cheng 已提交
1060
// Return the address of the SDataBlk stored inside the block iterator.
static SDataBlk* getCurrentBlock(SDataBlockIter* pBlockIter) {
  return &pBlockIter->block;
}
1061

C
Cary Xu 已提交
1062 1063 1064 1065 1066 1067
// Binary search for key in keyList, starting from position pos.
//
// order == TSDB_ORDER_ASC: searches [pos, num - 1]. Returns -1 when key < keyList[pos]; otherwise
//   an index whose element is <= key and whose successor, if any, is > key (or an index whose
//   element equals key).
// otherwise (DESC): searches [0, pos]. Returns -1 when key > keyList[pos]; otherwise an index
//   whose element is >= key and whose predecessor, if any, is < key (or an index whose element
//   equals key).
//
// Assumes keyList is sorted ascending (not verified here) and requires 0 <= pos < num.
static int doBinarySearchKey(TSKEY* keyList, int num, int pos, TSKEY key, int order) {
  // "from" is the cursor that starts at pos, "to" is the far end of the searched range
  int from = pos;
  int to;

  // check
  ASSERT(pos >= 0 && pos < num && num > 0);

  if (order == TSDB_ORDER_ASC) {
    to = num - 1;
    if (key < keyList[pos]) {
      return -1;
    }

    for (;;) {
      // termination checks
      if (key >= keyList[to]) {
        return to;
      }
      if (key <= keyList[from]) {
        return from;
      }
      if (to - from <= 1) {
        return from;
      }

      // narrow the range around the midpoint (rounded towards "to")
      int mid = from + (to - from + 1) / 2;
      if (keyList[mid] > key) {
        to = mid;
      } else if (keyList[mid] < key) {
        from = mid;
      } else {
        return mid;
      }
    }
  } else {  // DESC
    to = 0;
    if (key > keyList[pos]) {
      return -1;
    }

    for (;;) {
      // termination checks
      if (key <= keyList[to]) {
        return to;
      }
      if (key >= keyList[from]) {
        return from;
      }
      if (from - to <= 1) {
        return from;
      }

      // narrow the range around the midpoint (rounded towards "to")
      int mid = from - (from - to + 1) / 2;
      if (keyList[mid] < key) {
        to = mid;
      } else if (keyList[mid] > key) {
        from = mid;
      } else {
        return mid;
      }
    }
  }
}

H
Haojun Liao 已提交
1110
// Find the row index at which dumping of the current block has to stop.
//
// Ascending scan:  the last row (nRow - 1) when the window's ekey reaches the block's maxKey,
//                  otherwise the result of searching for ekey from pos.
// Descending scan: row 0 when the window's skey reaches the block's minKey, otherwise the result
//                  of searching for skey from pos.
// May return -1 when doBinarySearchKey() finds no position.
static int32_t getEndPosInDataBlock(STsdbReader* pReader, SBlockData* pBlockData, SDataBlk* pBlock, int32_t pos) {
  if (ASCENDING_TRAVERSE(pReader->order)) {
    if (pReader->window.ekey >= pBlock->maxKey.ts) {
      return pBlock->nRow - 1;
    }
    return doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, pReader->window.ekey, pReader->order);
  }

  if (pReader->window.skey <= pBlock->minKey.ts) {
    return 0;
  }
  return doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, pReader->window.skey, pReader->order);
}

H
Haojun Liao 已提交
1127
// Copy dumpedRows timestamps from the block's aTSKEY array into pColData->pData.
// Ascending: rows [rowIndex, rowIndex + dumpedRows) are copied as they are.
// Descending: rows (rowIndex - dumpedRows, rowIndex] are copied and then reversed in place, so the
// output starts with the row at rowIndex.
static void copyPrimaryTsCol(const SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
                             int32_t dumpedRows, bool asc) {
  int32_t firstRow = asc ? pDumpInfo->rowIndex : (pDumpInfo->rowIndex - dumpedRows + 1);
  memcpy(pColData->pData, &pBlockData->aTSKEY[firstRow], dumpedRows * sizeof(int64_t));

  if (asc) {
    return;
  }

  // todo: opt perf by extract the loop
  // reverse the copied range with two cursors moving towards each other
  int64_t* pts = (int64_t*)pColData->pData;
  for (int32_t lo = 0, hi = dumpedRows - 1; lo < hi; ++lo, --hi) {
    int64_t t = pts[lo];
    pts[lo] = pts[hi];
    pts[hi] = t;
  }
}

H
Haojun Liao 已提交
1147 1148
// Batch copy of a fixed-width column: a faster alternative to copying value by value.
//
// 1. memcpy dumpedRows elements of the source column into pColData->pData,
// 2. for a descending scan, reverse the copied elements in place (per element width),
// 3. when the source column is not all-values, walk the rows and mark the null ones in the bitmap.
//
// NOTE(review): the typed swaps assume pColData->pData is suitably aligned for the element type.
static void copyNumericCols(const SColData* pData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
                            int32_t dumpedRows, bool asc) {
  // descending scans end at rowIndex, so the copied range starts dumpedRows - 1 rows before it
  int32_t        firstRow = asc ? pDumpInfo->rowIndex : (pDumpInfo->rowIndex - dumpedRows + 1);
  const uint8_t* src = pData->pData + tDataTypes[pData->type].bytes * firstRow;

  // 1. copy data in a batch model
  memcpy(pColData->pData, src, dumpedRows * tDataTypes[pData->type].bytes);

  // 2. reverse the array list in case of descending order scan data block
  if (!asc) {
    int32_t lo = 0;
    int32_t hi = dumpedRows - 1;

    switch (pColData->info.type) {
      case TSDB_DATA_TYPE_TIMESTAMP:
      case TSDB_DATA_TYPE_DOUBLE:
      case TSDB_DATA_TYPE_BIGINT:
      case TSDB_DATA_TYPE_UBIGINT: {
        int64_t* pv = (int64_t*)pColData->pData;
        for (; lo < hi; ++lo, --hi) {
          int64_t t = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = t;
        }
        break;
      }

      case TSDB_DATA_TYPE_BOOL:
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_UTINYINT: {
        int8_t* pv = (int8_t*)pColData->pData;
        for (; lo < hi; ++lo, --hi) {
          int8_t t = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = t;
        }
        break;
      }

      case TSDB_DATA_TYPE_SMALLINT:
      case TSDB_DATA_TYPE_USMALLINT: {
        int16_t* pv = (int16_t*)pColData->pData;
        for (; lo < hi; ++lo, --hi) {
          int16_t t = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = t;
        }
        break;
      }

      case TSDB_DATA_TYPE_FLOAT:
      case TSDB_DATA_TYPE_INT:
      case TSDB_DATA_TYPE_UINT: {
        int32_t* pv = (int32_t*)pColData->pData;
        for (; lo < hi; ++lo, --hi) {
          int32_t t = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = t;
        }
        break;
      }

      default:
        // any other type is left in copied order
        break;
    }
  }

  // 3. if the  null value exists, check items one-by-one
  if (pData->flag != HAS_VALUE) {
    int32_t step = asc ? 1 : -1;
    int32_t srcRow = pDumpInfo->rowIndex;

    for (int32_t dstRow = 0; dstRow < dumpedRows; ++dstRow, srcRow += step) {
      // bit values 0 and 1 are the non-value states (presumably NONE and NULL -- confirm)
      uint8_t v = tColDataGetBitValue(pData, srcRow);
      if (v == 0 || v == 1) {
        colDataSetNull_f(pColData->nullbitmap, dstRow);
        pColData->hasNull = true;
      }
    }
  }
}

1237
// Dump rows of the currently loaded file block into the reader's result block.
//
// Steps: locate the first row inside the query window (only when positioned at the block's first
// row in scan direction), locate the last row, cap the row count at the result-block capacity,
// copy the primary timestamp column and every requested column (columns missing from the file
// block are filled with NULL), then advance rowIndex and mark the block as fully dumped when
// nothing usable remains.
//
// Returns TSDB_CODE_SUCCESS, TSDB_CODE_INVALID_PARA when no start row can be located, or the error
// from doCopyColVal().
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  SBlockData*         pBlockData = &pStatus->fileBlockData;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SSDataBlock*        pResBlock = pReader->resBlockInfo.pResBlock;
  int32_t             numOfOutputCols = pSupInfo->numOfCols;
  int32_t             code = TSDB_CODE_SUCCESS;

  SColVal cv = {0};
  int64_t st = taosGetTimestampUs();
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;

  // no data exists, return directly.
  if (pBlockData->nRow == 0 || pBlockData->aTSKEY == NULL) {
    tsdbWarn("%p no need to copy since no data in blockData, table uid:%" PRIu64 " has been dropped, %s", pReader,
             pBlockInfo->uid, pReader->idStr);
    pResBlock->info.rows = 0;
    return 0;
  }

  // positioned on the block's first row in scan direction: the start row may need adjusting
  bool atScanStart = asc ? (pDumpInfo->rowIndex == 0) : (pDumpInfo->rowIndex == pBlock->nRow - 1);
  if (atScanStart) {
    // when the window already covers that first row, rowIndex is kept as it is
    bool windowCoversStart =
        asc ? (pReader->window.skey <= pBlock->minKey.ts) : (pReader->window.ekey >= pBlock->maxKey.ts);

    if (!windowCoversStart) {
      // find the appropriate the start position in current block, and set it to be the current rowIndex;
      // the search runs in the opposite order, starting from the far end of the block
      int32_t pos = asc ? pBlock->nRow - 1 : 0;
      int32_t order = asc ? TSDB_ORDER_DESC : TSDB_ORDER_ASC;
      int64_t key = asc ? pReader->window.skey : pReader->window.ekey;
      pDumpInfo->rowIndex = doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, key, order);

      if (pDumpInfo->rowIndex < 0) {
        tsdbError(
            "%p failed to locate the start position in current block, global index:%d, table index:%d, brange:%" PRId64
            "-%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 " %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->minVer,
            pBlock->maxVer, pReader->idStr);
        return TSDB_CODE_INVALID_PARA;
      }
    }
  }

  // time window check
  int32_t endIndex = getEndPosInDataBlock(pReader, pBlockData, pBlock, pDumpInfo->rowIndex);
  if (endIndex == -1) {
    setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
    return TSDB_CODE_SUCCESS;
  }

  // make endIndex exclusive in scan direction, then derive the number of rows to dump
  endIndex += step;
  int32_t dumpedRows = asc ? (endIndex - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex - endIndex);
  if (dumpedRows > pReader->resBlockInfo.capacity) {  // output buffer check
    dumpedRows = pReader->resBlockInfo.capacity;
  }

  // i walks the requested columns, colIndex walks the columns present in the file block
  int32_t i = 0;

  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
  if (pSupInfo->colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
    copyPrimaryTsCol(pBlockData, pDumpInfo, pColData, dumpedRows, asc);
    i += 1;
  }

  int32_t colIndex = 0;
  int32_t num = pBlockData->nColData;
  while (i < numOfOutputCols && colIndex < num) {
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);

    // file column that was not requested: skip it
    if (pData->cid < pSupInfo->colId[i]) {
      colIndex += 1;
      continue;
    }

    pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);

    // the specified column does not exist in file block, fill with null data
    if (pData->cid != pSupInfo->colId[i]) {
      colDataSetNNULL(pColData, 0, dumpedRows);
      i += 1;
      continue;
    }

    // matching column
    if (pData->flag == HAS_NONE || pData->flag == HAS_NULL || pData->flag == (HAS_NULL | HAS_NONE)) {
      colDataSetNNULL(pColData, 0, dumpedRows);
    } else if (IS_MATHABLE_TYPE(pColData->info.type)) {
      copyNumericCols(pData, pDumpInfo, pColData, dumpedRows, asc);
    } else {  // varchar/nchar type
      int32_t srcRow = pDumpInfo->rowIndex;
      for (int32_t dstRow = 0; dstRow < dumpedRows; ++dstRow, srcRow += step) {
        tColDataGetValue(pData, srcRow, &cv);
        code = doCopyColVal(pColData, dstRow, i, &cv, pSupInfo);
        if (code) {
          return code;
        }
      }
    }

    colIndex += 1;
    i += 1;
  }

  // fill the mis-matched columns with null value
  for (; i < numOfOutputCols; ++i) {
    pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
    colDataSetNNULL(pColData, 0, dumpedRows);
  }

  pResBlock->info.dataLoad = 1;
  pResBlock->info.rows = dumpedRows;
  pDumpInfo->rowIndex += step * dumpedRows;

  // check if current block are all handled
  if (pDumpInfo->rowIndex >= 0 && pDumpInfo->rowIndex < pBlock->nRow) {
    int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
    if (outOfTimeWindow(ts, &pReader->window)) {  // the remain data has out of query time window, ignore current block
      setBlockAllDumped(pDumpInfo, ts, pReader->order);
    }
  } else {
    // rowIndex left the block: every row has been consumed
    int64_t ts = asc ? pBlock->maxKey.ts : pBlock->minKey.ts;
    setBlockAllDumped(pDumpInfo, ts, pReader->order);
  }

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.blockLoadTime += elapsedTime;

  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
  tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", uid:%" PRIu64 " elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, dumpedRows,
            unDumpedRows, pBlock->minVer, pBlock->maxVer, pBlockInfo->uid, elapsedTime, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}

1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396
// Fetch the schema of table uid (version -1) into pReader->pSchema and initialize the reader's row
// merger with it. Must only be called while pReader->pSchema is still NULL.
// Returns the schema, or NULL with terrno set to the failing call's code.
static FORCE_INLINE STSchema* getTableSchemaImpl(STsdbReader* pReader, uint64_t uid) {
  ASSERT(pReader->pSchema == NULL);

  int32_t code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, -1, &pReader->pSchema);
  if (code == TSDB_CODE_SUCCESS && pReader->pSchema != NULL) {
    code = tsdbRowMergerInit(&pReader->status.merger, pReader->pSchema);
    if (code == TSDB_CODE_SUCCESS) {
      return pReader->pSchema;
    }

    terrno = code;
    tsdbError("failed to init merger, code:%s, %s", tstrerror(code), pReader->idStr);
    return NULL;
  }

  // schema lookup failed, or succeeded without producing a schema
  terrno = code;
  tsdbError("failed to get table schema, uid:%" PRIu64 ", it may have been dropped, ver:-1, %s", uid, pReader->idStr);
  return NULL;
}

1397 1398
// Load the data of the iterator's current file block into pBlockData.
//
// pBlockData is reset first. When the reader has no schema yet it is fetched for table uid; if that
// fails the function logs and returns 0 with pBlockData left reset (the table is treated as
// dropped). On a successful read, blockLoadTime is accumulated and the dump info's allDumped flag
// is cleared.
// Returns TSDB_CODE_SUCCESS, or the error from tBlockDataInit() / tsdbReadDataBlock().
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter, SBlockData* pBlockData,
                                   uint64_t uid) {
  int32_t code = 0;
  int64_t st = taosGetTimestampUs();

  tBlockDataReset(pBlockData);

  STSchema* pSchema = pReader->pSchema;
  if (pSchema == NULL) {
    pSchema = getTableSchemaImpl(pReader, uid);
    if (pSchema == NULL) {
      tsdbDebug("%p table uid:%" PRIu64 " has been dropped, no data existed, %s", pReader, uid, pReader->idStr);
      return code;
    }
  }

  // colId[0] is skipped here, so only the remaining numOfCols - 1 columns are registered
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
  TABLEID             tid = {.suid = pReader->suid, .uid = uid};
  code = tBlockDataInit(pBlockData, &tid, pSchema, &pSup->colId[1], pSup->numOfCols - 1);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);

  code = tsdbReadDataBlock(pReader->pFileReader, pBlock, pBlockData);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
              ", rows:%d, code:%s %s",
              pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
              tstrerror(code), pReader->idStr);
    return code;
  }

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;

  tsdbDebug("%p load file block into buffer, global index:%d, index in table block list:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
            pBlock->minVer, pBlock->maxVer, elapsedTime, pReader->idStr);

  pReader->cost.blockLoadTime += elapsedTime;
  pReader->status.fBlockDumpInfo.allDumped = false;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1445

H
Haojun Liao 已提交
1446 1447 1448
/**
 * Release every buffer owned by the block order supporter.
 *
 * Frees the per-table counters, the per-table cursors, each per-table wrapper
 * array and finally the array of wrapper pointers. All freed members are reset
 * to NULL by taosMemoryFreeClear.
 *
 * @param pSup supporter to clean up; only the first pSup->numOfTables entries
 *             of pDataBlockInfo are released.
 */
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);

  // The outer array may be missing when this is reached from a failed
  // initBlockOrderSupporter(); never index into it in that case.
  if (pSup->pDataBlockInfo != NULL) {
    for (int32_t i = 0; i < pSup->numOfTables; ++i) {
      taosMemoryFreeClear(pSup->pDataBlockInfo[i]);
    }
  }

  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
1457

H
Haojun Liao 已提交
1458 1459
/**
 * Allocate the three zero-filled, per-table arrays of the block order supporter.
 *
 * @param pSup        supporter to initialize.
 * @param numOfTables number of slots to allocate in every array.
 * @return TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY after releasing whatever
 *         had been allocated.
 */
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);

  bool allocated =
      (pSup->numOfBlocksPerTable != NULL) && (pSup->indexPerTable != NULL) && (pSup->pDataBlockInfo != NULL);
  if (allocated) {
    return TSDB_CODE_SUCCESS;
  }

  // at least one allocation failed: roll back the others
  cleanupBlockOrderSupporter(pSup);
  return TSDB_CODE_OUT_OF_MEMORY;
}
H
Hongze Cheng 已提交
1470

H
Haojun Liao 已提交
1471
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
1472
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
1473
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
1474

H
Haojun Liao 已提交
1475
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
1476

H
Haojun Liao 已提交
1477 1478
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
1479

H
Haojun Liao 已提交
1480 1481 1482 1483 1484 1485 1486
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
1487

1488
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
1489
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
1490

1491 1492 1493
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

H
Haojun Liao 已提交
1494
/**
 * Decode the SDataBlk that the iterator currently points at into pBlockIter->block.
 *
 * @param pBlockIter block iterator; when it has no current block info nothing is done.
 * @param idStr      reader id string, forwarded to getTableBlockScanInfo.
 * @return TSDB_CODE_SUCCESS on success or when there is no current block,
 *         terrno when the table scan info cannot be found,
 *         TSDB_CODE_INVALID_PARA when the block index is not in the table's block list.
 */
static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter, const char* idStr) {
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  if (pBlockInfo == NULL) {
    return TSDB_CODE_SUCCESS;
  }

  STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pBlockIter->pTableMap, pBlockInfo->uid, idStr);
  if (pScanInfo == NULL) {
    return terrno;
  }

  SBlockIndex* pIndex = taosArrayGet(pScanInfo->pBlockList, pBlockInfo->tbBlockIdx);
  if (pIndex == NULL) {
    // the global block list and the per-table block list disagree; do not dereference
    return TSDB_CODE_INVALID_PARA;
  }

  tMapDataGetItemByIdx(&pScanInfo->mapData, pIndex->ordinalIndex, &pBlockIter->block, tGetDataBlk);
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1512

1513
/**
 * Rebuild the global block access list of pBlockIter from the per-table block
 * lists of the tables in pTableList.
 *
 * With a single table the blocks are pushed in their list order. With several
 * tables a multiway merge tree orders all blocks by their offset in the file.
 * The iterator is then positioned on the first block (ascending scan) or the
 * last block (descending scan) and that block is decoded.
 *
 * @param pReader     reader providing the scan order, table map and id string.
 * @param pBlockIter  iterator to (re)initialize.
 * @param numOfBlocks expected total number of blocks over all tables.
 * @param pTableList  array of STableBlockScanInfo pointers, each with a non-empty block list.
 * @return TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY, TSDB_CODE_INVALID_PARA when
 *         the collected blocks do not match numOfBlocks, or the error reported
 *         while decoding the first block.
 */
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks, SArray* pTableList) {
  bool asc = ASCENDING_TRAVERSE(pReader->order);

  SBlockOrderSupporter sup = {0};
  pBlockIter->numOfBlocks = numOfBlocks;
  taosArrayClear(pBlockIter->blockList);
  pBlockIter->pTableMap = pReader->status.pTableMap;

  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = taosArrayGetSize(pTableList);

  int64_t st = taosGetTimestampUs();
  int32_t code = initBlockOrderSupporter(&sup, numOfTables);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  int32_t cnt = 0;

  for (int32_t i = 0; i < numOfTables; ++i) {
    STableBlockScanInfo* pTableScanInfo = taosArrayGetP(pTableList, i);
    ASSERT(pTableScanInfo->pBlockList != NULL && taosArrayGetSize(pTableScanInfo->pBlockList) > 0);

    size_t num = taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = num;

    SBlockOrderWrapper* pWrappers = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * num);
    if (pWrappers == NULL) {
      cleanupBlockOrderSupporter(&sup);
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    sup.pDataBlockInfo[sup.numOfTables] = pWrappers;

    for (size_t k = 0; k < num; ++k) {
      SBlockIndex* pIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
      pWrappers[k] = (SBlockOrderWrapper){.uid = pTableScanInfo->uid, .offset = pIndex->inFileOffset};
      cnt++;
    }

    sup.numOfTables += 1;
  }

  // Either mismatch means the caller's bookkeeping is inconsistent. The loops below
  // index the wrappers with numOfBlocks/cnt, so a mismatch must not be let through.
  if (numOfBlocks != cnt || sup.numOfTables != numOfTables) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_INVALID_PARA;
  }

  // since there is only one table qualified, blocks are not sorted
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      taosArrayPush(pBlockIter->blockList, &blockInfo);
    }

    int64_t et = taosGetTimestampUs();
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
              pReader, numOfBlocks, (et - st) / 1000.0, pReader->idStr);

    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
    cleanupBlockOrderSupporter(&sup);
    return doSetCurrentBlock(pBlockIter, pReader->idStr);
  }

  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);

  SMultiwayMergeTreeInfo* pTree = NULL;

  // keep the full-width result: a narrower type could truncate an error code to 0
  code = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
  if (code != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;

    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    taosArrayPush(pBlockIter->blockList, &blockInfo);

    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }

    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
  }

  int64_t et = taosGetTimestampUs();
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, numOfBlocks,
            (et - st) / 1000.0, pReader->idStr);
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);

  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
  return doSetCurrentBlock(pBlockIter, pReader->idStr);
}
H
Hongze Cheng 已提交
1618

H
Haojun Liao 已提交
1619
/**
 * Advance the block iterator by one block in its scan direction and decode the
 * block it lands on.
 *
 * @param pBlockIter iterator to advance.
 * @param idStr      reader id string, forwarded to doSetCurrentBlock.
 * @return false when the iterator already sits on the last block of its
 *         direction (nothing is changed), true otherwise.
 */
static bool blockIteratorNext(SDataBlockIter* pBlockIter, const char* idStr) {
  if (ASCENDING_TRAVERSE(pBlockIter->order)) {
    if (pBlockIter->index >= pBlockIter->numOfBlocks - 1) {
      return false;
    }
    pBlockIter->index += 1;
  } else {
    if (pBlockIter->index <= 0) {
      return false;
    }
    pBlockIter->index += -1;
  }

  doSetCurrentBlock(pBlockIter, idStr);
  return true;
}

1633 1634 1635
/**
 * This is an two rectangles overlap cases.
 */
H
Hongze Cheng 已提交
1636
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SDataBlk* pBlock) {
1637 1638
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
H
Hongze Cheng 已提交
1639 1640
         (pVerRange->minVer > pBlock->minVer && pVerRange->minVer <= pBlock->maxVer) ||
         (pVerRange->maxVer < pBlock->maxVer && pVerRange->maxVer >= pBlock->minVer);
H
Haojun Liao 已提交
1641
}
H
Hongze Cheng 已提交
1642

1643
/**
 * Look up the block that follows the given block in scan direction within the
 * same table.
 *
 * @param pBlockInfo          current block; its tbBlockIdx is the position in the table's block list.
 * @param pTableBlockScanInfo scan info owning the table's block list.
 * @param nextIndex           out: position of the neighbor block in the block list.
 * @param order               scan order; ascending looks at idx + 1, descending at idx - 1.
 * @param pBlockIndex         out: copy of the neighbor's block index entry.
 * @return true when a neighbor exists and both outputs were written; false
 *         otherwise (outputs untouched).
 */
static bool getNeighborBlockOfSameTable(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                        int32_t* nextIndex, int32_t order, SBlockIndex* pBlockIndex) {
  // Compare as signed values: "size - 1" on the size_t result would wrap around for an empty list.
  int32_t numOfBlocks = (int32_t)taosArrayGetSize(pTableBlockScanInfo->pBlockList);
  int32_t current = pBlockInfo->tbBlockIdx;
  if (current < 0 || current >= numOfBlocks) {
    return false;
  }

  int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;
  int32_t neighbor = current + step;
  if (neighbor < 0 || neighbor >= numOfBlocks) {
    return false;  // current block is the last one in scan direction
  }

  *nextIndex = neighbor;
  *pBlockIndex = *(SBlockIndex*)taosArrayGet(pTableBlockScanInfo->pBlockList, neighbor);
  return true;
}

/**
 * Search the global block list, starting at the iterator's current position and
 * walking in scan direction, for the entry that matches pFBlockInfo.
 *
 * @param pBlockIter  iterator providing the list, current position and order.
 * @param pFBlockInfo block to look for (matched on uid and tbBlockIdx).
 * @return position of the matching entry, or -1 when none is found.
 */
static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
  int32_t delta = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;

  for (int32_t pos = pBlockIter->index; pos >= 0 && pos < pBlockIter->numOfBlocks; pos += delta) {
    SFileDataBlockInfo* pCandidate = taosArrayGet(pBlockIter->blockList, pos);

    bool sameTable = (pCandidate->uid == pFBlockInfo->uid);
    if (sameTable && pCandidate->tbBlockIdx == pFBlockInfo->tbBlockIdx) {
      return pos;
    }
  }

  return -1;
}

1677
/**
 * Move the iterator by `step` and make the block stored at `index` the block at
 * the new iterator position, relocating the list entry if necessary.
 *
 * @param pBlockIter iterator whose list and position are updated.
 * @param index      current position of the block to activate in the global list.
 * @param step       amount added to the iterator position.
 * @return -1 when index is out of range (nothing is changed), otherwise TSDB_CODE_SUCCESS.
 */
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
  bool outOfRange = (index < 0) || (index >= pBlockIter->numOfBlocks);
  if (outOfRange) {
    return -1;
  }

  // take a copy first, the entry is removed from the list below
  SFileDataBlockInfo* pSrc = (SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);
  SFileDataBlockInfo  moved = *pSrc;

  pBlockIter->index += step;
  int32_t target = pBlockIter->index;

  if (target != index) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, target, &moved);

    SFileDataBlockInfo* pPlaced = taosArrayGet(pBlockIter->blockList, target);
    ASSERT(pPlaced->uid == moved.uid && pPlaced->tbBlockIdx == moved.tbBlockIdx);
  }

  doSetCurrentBlock(pBlockIter, "");
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
1697
// todo: this attribute could be acquired while extracting the globally ordered block list.
1698
/**
 * Tell whether the block touches its neighbor block on the shared boundary key.
 *
 * @param pBlock              current block.
 * @param pNeighborBlockIndex index entry of the neighbor block in scan direction.
 * @param order               scan order.
 * @return ascending: block max key equals the neighbor's start key;
 *         descending: block min key equals the neighbor's end key.
 */
static bool overlapWithNeighborBlock(SDataBlk* pBlock, SBlockIndex* pNeighborBlockIndex, int32_t order) {
  STimeWindow* pNeighborWin = &pNeighborBlockIndex->window;

  return ASCENDING_TRAVERSE(order) ? (pBlock->maxKey.ts == pNeighborWin->skey)
                                   : (pBlock->minKey.ts == pNeighborWin->ekey);
}
H
Hongze Cheng 已提交
1706

H
Hongze Cheng 已提交
1707
/**
 * Check whether the given buffer key lies at or before the start of the block
 * in scan direction.
 *
 * @param order  scan order.
 * @param key    key from the buffer; a ts of TSKEY_INITIAL_VAL never matches.
 * @param pBlock file block to compare against.
 * @return ascending: key.ts <= block min key; descending: key.ts >= block max key.
 */
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SDataBlk* pBlock) {
  if (key.ts == TSKEY_INITIAL_VAL) {
    return false;
  }

  if (ASCENDING_TRAVERSE(order)) {
    return key.ts <= pBlock->minKey.ts;
  }

  return key.ts >= pBlock->maxKey.ts;
}
H
Hongze Cheng 已提交
1713

H
Hongze Cheng 已提交
1714
/**
 * Check whether the key's timestamp falls inside the block's key range while the
 * block's version range intersects the queried version range.
 */
static bool keyOverlapFileBlock(TSDBKEY key, SDataBlk* pBlock, SVersionRange* pVerRange) {
  bool tsInside = (key.ts >= pBlock->minKey.ts) && (key.ts <= pBlock->maxKey.ts);
  if (!tsInside) {
    return false;
  }

  if (pBlock->maxVer < pVerRange->minVer) {
    return false;
  }

  return pBlock->minVer <= pVerRange->maxVer;
}

H
Hongze Cheng 已提交
1719 1720
/**
 * Scan the delete skyline from startIndex onwards and report whether any delete
 * point affects the given block.
 *
 * A point counts when its version is not older than the block's minimum version
 * and either its timestamp lies inside the block's key range, or it lies before
 * the block and the following skyline point is at or after the block's min key.
 * The scan stops at the first point located after the block.
 *
 * @param pBlockScanInfo scan info owning the delete skyline.
 * @param pBlock         block to test.
 * @param startIndex     first skyline position to inspect.
 */
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock,
                                       int32_t startIndex) {
  size_t num = taosArrayGetSize(pBlockScanInfo->delSkyline);

  for (int32_t i = startIndex; i < num; i += 1) {
    TSDBKEY* pPoint = taosArrayGet(pBlockScanInfo->delSkyline, i);

    bool inside = (pPoint->ts >= pBlock->minKey.ts) && (pPoint->ts <= pBlock->maxKey.ts);
    bool before = (!inside) && (pPoint->ts < pBlock->minKey.ts);

    if (!inside && !before) {  // (p->ts > pBlock->maxKey.ts)
      return false;
    }

    if (pPoint->version < pBlock->minVer) {
      continue;  // this delete is older than every row in the block
    }

    if (inside) {
      return true;
    }

    // the point is located before the block: look at where the next point is
    if (i < num - 1) {
      TSDBKEY* pFollowing = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
      if (pFollowing->ts >= pBlock->minKey.ts) {
        return true;
      }
    } else {  // it must be the last point
      ASSERT(pPoint->version == 0);
    }
  }

  return false;
}

H
Hongze Cheng 已提交
1748
/**
 * Decide whether the delete skyline of the table affects the given block.
 *
 * @param pBlockScanInfo scan info with the delete skyline and the current fileDelIndex.
 * @param pBlock         block to test.
 * @param order          scan order; it determines where the skyline scan starts.
 * @return false when there is no skyline or the block lies completely outside
 *         its timestamp range, otherwise the result of doCheckforDatablockOverlap.
 */
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock, int32_t order) {
  SArray* pSkyline = pBlockScanInfo->delSkyline;
  if (pSkyline == NULL) {
    return false;
  }

  // ts is not overlap
  TSDBKEY* pHead = taosArrayGet(pSkyline, 0);
  TSDBKEY* pTail = taosArrayGetLast(pSkyline);
  if (pBlock->minKey.ts > pTail->ts || pBlock->maxKey.ts < pHead->ts) {
    return false;
  }

  // version is not overlap
  int32_t start = pBlockScanInfo->fileDelIndex;
  if (ASCENDING_TRAVERSE(order)) {
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, start);
  }

  // descending scan: walk back to the first point that is not after the block's min key
  TSDBKEY* pPoint = taosArrayGet(pSkyline, start);
  while (start > 0 && pPoint->ts > pBlock->minKey.ts) {
    start -= 1;
    pPoint = taosArrayGet(pSkyline, start);
  }

  // step back once more when the point sits exactly on the min key with a version below the block's max version
  if (start > 0 && pPoint->ts == pBlock->minKey.ts && pPoint->version < pBlock->maxVer) {
    start -= 1;
  }

  return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, start);
}

C
Cary Xu 已提交
1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793
// Flags describing one file data block; filled in by getBlockToLoadInfo() and
// combined by fileBlockShouldLoad() / isCleanFileDataBlock().
typedef struct {
  bool overlapWithNeighborBlock;  // block shares its boundary key with the neighbor block of the same table
  bool hasDupTs;                  // block has more than one sub-block, or is flagged as holding duplicated ts
  bool overlapWithDelInfo;        // the delete skyline affects this block
  bool overlapWithLastBlock;      // current key of the last (stt) block reader lies inside the block's key range
  bool overlapWithKeyInBuf;       // key from the buffer lies inside the block and the version ranges intersect
  bool partiallyRequired;         // a boundary of the query window/version range cuts through the block
  bool moreThanCapcity;           // block holds more rows than the result block capacity
} SDataBlockToLoadInfo;

/**
 * Evaluate every condition that is relevant for deciding whether a file block
 * has to be loaded, and store the results in pInfo.
 *
 * Only flags whose precondition holds are written (neighbor overlap needs a
 * neighbor, last-block overlap needs data in the last block reader), so the
 * caller is expected to pass a zero-initialized pInfo.
 */
static void getBlockToLoadInfo(SDataBlockToLoadInfo* pInfo, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                               STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader,
                               STsdbReader* pReader) {
  int32_t     neighborPos = 0;
  SBlockIndex neighbor = {0};

  // overlap with neighbor
  if (getNeighborBlockOfSameTable(pBlockInfo, pScanInfo, &neighborPos, pReader->order, &neighbor)) {
    pInfo->overlapWithNeighborBlock = overlapWithNeighborBlock(pBlock, &neighbor, pReader->order);
  }

  // has duplicated ts of different version in this block
  pInfo->hasDupTs = (pBlock->nSubBlock != 1) || pBlock->hasDup;
  pInfo->overlapWithDelInfo = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);

  if (hasDataInLastBlock(pLastBlockReader)) {
    int64_t lastBlockKey = getCurrentKeyInLastBlock(pLastBlockReader);
    pInfo->overlapWithLastBlock = (pBlock->maxKey.ts >= lastBlockKey) && (pBlock->minKey.ts <= lastBlockKey);
  }

  pInfo->moreThanCapcity = pBlock->nRow > pReader->resBlockInfo.capacity;
  pInfo->partiallyRequired = dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock);
  pInfo->overlapWithKeyInBuf = keyOverlapFileBlock(keyInBuf, pBlock, &pReader->verRange);
}

// A file block can be used without loading its data only when all of these hold:
// 1. the version of all rows should be less than the endVersion
// 2. current block should not overlap with next neighbor block
// 3. current timestamp should not be overlap with each other
// 4. output buffer should be large enough to hold all rows in current block
// 5. delete info should not overlap with current block data
// 6. current block should not contain the duplicated ts
// Returns true when at least one condition is violated, i.e. the block must be loaded.
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo reasons = {0};
  getBlockToLoadInfo(&reasons, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);

  bool mustLoad = reasons.overlapWithNeighborBlock || reasons.hasDupTs || reasons.partiallyRequired ||
                  reasons.overlapWithKeyInBuf || reasons.moreThanCapcity || reasons.overlapWithDelInfo ||
                  reasons.overlapWithLastBlock;
  if (!mustLoad) {
    return false;
  }

  // log the reason why load the datablock for profile
  tsdbDebug("%p uid:%" PRIu64
            " need to load the datablock, overlapneighbor:%d, hasDup:%d, partiallyRequired:%d, "
            "overlapWithKey:%d, greaterThanBuf:%d, overlapWithDel:%d, overlapWithlastBlock:%d, %s",
            pReader, pBlockInfo->uid, reasons.overlapWithNeighborBlock, reasons.hasDupTs, reasons.partiallyRequired,
            reasons.overlapWithKeyInBuf, reasons.moreThanCapcity, reasons.overlapWithDelInfo,
            reasons.overlapWithLastBlock, pReader->idStr);

  return true;
}

C
Cary Xu 已提交
1846 1847 1848 1849 1850 1851 1852 1853 1854
/**
 * A file block is "clean" when it overlaps neither its neighbor block, the
 * buffer key, the delete info nor the last block, and holds no duplicated ts.
 * Buffer capacity and partial query coverage are not considered here.
 */
static bool isCleanFileDataBlock(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                 STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo flags = {0};
  getBlockToLoadInfo(&flags, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);

  return !flags.overlapWithNeighborBlock && !flags.hasDupTs && !flags.overlapWithKeyInBuf &&
         !flags.overlapWithDelInfo && !flags.overlapWithLastBlock;
}

1855
/**
 * Fill the reader's result block with rows taken from the table's buffer
 * iterators, up to endKey, and record the time spent.
 *
 * @param pReader        reader owning the result block and cost counters.
 * @param pBlockScanInfo table scan info with the two buffer iterators (iter, iiter).
 * @param endKey         key bound forwarded to buildDataBlockFromBufImpl.
 * @return TSDB_CODE_SUCCESS when neither iterator has a value, otherwise the
 *         code returned by buildDataBlockFromBufImpl.
 */
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
  bool hasBufData = pBlockScanInfo->iiter.hasVal || pBlockScanInfo->iter.hasVal;
  if (!hasBufData) {
    return TSDB_CODE_SUCCESS;
  }

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  int64_t startUs = taosGetTimestampUs();
  int32_t code = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->resBlockInfo.capacity, pReader);

  // the block metadata is refreshed regardless of the result code
  blockDataUpdateTsWindow(pResBlock, pReader->suppInfo.slotId[0]);
  pResBlock->info.id.uid = pBlockScanInfo->uid;

  setComposedBlockFlag(pReader, true);

  double elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%" PRId64 ", brange:%" PRId64
            " - %" PRId64 ", uid:%" PRIu64 ",  %s",
            pReader, elapsedMs, pResBlock->info.rows, pResBlock->info.window.skey, pResBlock->info.window.ekey,
            pBlockScanInfo->uid, pReader->idStr);

  pReader->cost.buildmemBlock += elapsedMs;
  return code;
}

1880
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, int64_t key,
1881
                                            SFileBlockDumpInfo* pDumpInfo, bool* copied) {
1882 1883 1884
  // opt version
  // 1. it is not a border point
  // 2. the direct next point is not an duplicated timestamp
D
dapan1121 已提交
1885 1886 1887
  int32_t code = TSDB_CODE_SUCCESS;

  *copied = false;
1888 1889
  bool asc = (pReader->order == TSDB_ORDER_ASC);
  if ((pDumpInfo->rowIndex < pDumpInfo->totalRows - 1 && asc) || (pDumpInfo->rowIndex > 0 && (!asc))) {
1890
    int32_t step = pReader->order == TSDB_ORDER_ASC ? 1 : -1;
1891 1892

    int64_t nextKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
1893
    if (nextKey != key) {  // merge is not needed
1894
      code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
D
dapan1121 已提交
1895 1896 1897
      if (code) {
        return code;
      }
1898
      pDumpInfo->rowIndex += step;
D
dapan1121 已提交
1899
      *copied = true;
1900 1901 1902
    }
  }

D
dapan1121 已提交
1903
  return code;
1904 1905
}

1906
static bool nextRowFromLastBlocks(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo,
1907
                                  SVersionRange* pVerRange) {
X
Xiaoyu Wang 已提交
1908
  int32_t step = ASCENDING_TRAVERSE(pLastBlockReader->order) ? 1 : -1;
H
Haojun Liao 已提交
1909

1910 1911
  while (1) {
    bool hasVal = tMergeTreeNext(&pLastBlockReader->mergeTree);
1912
    if (!hasVal) {  // the next value will be the accessed key in stt
1913
      pScanInfo->lastKeyInStt += step;
1914 1915 1916
      return false;
    }

1917
    TSDBROW* pRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
1918
    int64_t key = pRow->pBlockData->aTSKEY[pRow->iRow];
1919
    int64_t ver = pRow->pBlockData->aVersion[pRow->iRow];
1920

1921
    pLastBlockReader->currentKey = key;
1922
    pScanInfo->lastKeyInStt = key;
1923 1924

    if (!hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->lastBlockDelIndex, key, ver, pLastBlockReader->order, pVerRange)) {
1925 1926 1927 1928 1929 1930
      return true;
    }
  }
}

static bool tryCopyDistinctRowFromSttBlock(TSDBROW* fRow, SLastBlockReader* pLastBlockReader,
1931 1932
                                           STableBlockScanInfo* pScanInfo, int64_t ts, STsdbReader* pReader,
                                           bool* copied) {
D
dapan1121 已提交
1933 1934 1935 1936
  int32_t code = TSDB_CODE_SUCCESS;

  *copied = false;

1937
  bool hasVal = nextRowFromLastBlocks(pLastBlockReader, pScanInfo, &pReader->verRange);
1938 1939 1940
  if (hasVal) {
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 != ts) {
1941
      code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
D
dapan1121 已提交
1942 1943 1944
      if (code) {
        return code;
      }
1945

D
dapan1121 已提交
1946 1947
      *copied = true;
      return code;
1948 1949
    }
  } else {
1950
    code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
D
dapan1121 已提交
1951 1952 1953
    if (code) {
      return code;
    }
1954

D
dapan1121 已提交
1955 1956
    *copied = true;
    return code;
1957 1958
  }

D
dapan1121 已提交
1959
  return code;
1960 1961
}

H
Haojun Liao 已提交
1962 1963 1964
/**
 * Resolve the table schema for the given schema version.
 *
 * Lookup order: the reader's current schema, then the reader's schema cache
 * (pSchemaMap), then the meta store; a schema fetched from meta is added to the
 * cache before it is returned.
 *
 * @param sversion schema version of the row.
 * @param pReader  reader holding the current schema and the schema cache.
 * @param uid      table uid.
 * @return the schema, or NULL on failure (terrno is set here for meta/cache errors).
 */
static FORCE_INLINE STSchema* doGetSchemaForTSRow(int32_t sversion, STsdbReader* pReader, uint64_t uid) {
  // always set the newest schema version in pReader->pSchema
  // NOTE(review): presumably getTableSchemaImpl() stores its result in pReader->pSchema - confirm
  if (pReader->pSchema == NULL && getTableSchemaImpl(pReader, uid) == NULL) {
    return NULL;
  }

  if (pReader->pSchema != NULL && pReader->pSchema->version == sversion) {
    return pReader->pSchema;
  }

  void** ppCached = tSimpleHashGet(pReader->pSchemaMap, &sversion, sizeof(sversion));
  if (ppCached != NULL) {
    return *(STSchema**)ppCached;
  }

  STSchema* pLoaded = NULL;
  int32_t   code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pLoaded);
  if (code == TSDB_CODE_SUCCESS) {
    code = tSimpleHashPut(pReader->pSchemaMap, &sversion, sizeof(sversion), &pLoaded, POINTER_BYTES);
  }

  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    return NULL;
  }

  return pLoaded;
}

1995
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
1996
                                     SIterInfo* pIter, int64_t key, SLastBlockReader* pLastBlockReader) {
1997
  SRowMerger*         pMerger = &pReader->status.merger;
H
Hongze Cheng 已提交
1998
  SRow*               pTSRow = NULL;
1999 2000 2001
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

2002
  int64_t tsLast = INT64_MIN;
2003
  if (hasDataInLastBlock(pLastBlockReader)) {
2004 2005
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
2006

H
Hongze Cheng 已提交
2007 2008
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
2009

2010 2011 2012 2013 2014 2015 2016 2017 2018
  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    STSchema* ps = getTableSchemaImpl(pReader, pBlockScanInfo->uid);
    if (ps == NULL) {
      return terrno;
    }
  }

2019 2020
  int64_t minKey = 0;
  if (pReader->order == TSDB_ORDER_ASC) {
H
Hongze Cheng 已提交
2021
    minKey = INT64_MAX;  // chosen the minimum value
2022
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
2023 2024
      minKey = tsLast;
    }
2025

2026 2027 2028
    if (minKey > k.ts) {
      minKey = k.ts;
    }
2029

2030
    if (minKey > key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2031 2032 2033 2034
      minKey = key;
    }
  } else {
    minKey = INT64_MIN;
2035
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
2036 2037 2038 2039 2040 2041 2042
      minKey = tsLast;
    }

    if (minKey < k.ts) {
      minKey = k.ts;
    }

2043
    if (minKey < key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2044 2045
      minKey = key;
    }
2046 2047
  }

2048
  // todo remove init
2049 2050
  bool init = false;

2051
  // ASC: file block ---> last block -----> imem -----> mem
H
Hongze Cheng 已提交
2052
  // DESC: mem -----> imem -----> last block -----> file block
2053 2054
  if (pReader->order == TSDB_ORDER_ASC) {
    if (minKey == key) {
2055
      init = true;
2056
      int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2057 2058 2059
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
H
Haojun Liao 已提交
2060
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
2061 2062
    }

2063
    if (minKey == tsLast) {
2064
      TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2065
      if (init) {
2066
        tsdbRowMergerAdd(pMerger, fRow1, NULL);
H
Haojun Liao 已提交
2067
      } else {
2068
        init = true;
2069
        int32_t code = tsdbRowMergerAdd(pMerger, fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2070 2071 2072
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2073
      }
2074
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
2075
    }
2076

2077
    if (minKey == k.ts) {
K
kailixu 已提交
2078 2079 2080 2081
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      if (pSchema == NULL) {
        return terrno;
      }
H
Haojun Liao 已提交
2082
      if (init) {
2083
        tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2084
      } else {
2085
        init = true;
2086
        int32_t code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2087 2088 2089 2090
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
      }
H
Haojun Liao 已提交
2091
      int32_t code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
2092 2093
      if (code != TSDB_CODE_SUCCESS) {
        return code;
2094 2095 2096 2097 2098
      }
    }
  } else {
    if (minKey == k.ts) {
      init = true;
2099
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
2100 2101 2102 2103
      if (pSchema == NULL) {
        return terrno;
      }

2104
      int32_t   code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2105 2106 2107 2108
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

H
Haojun Liao 已提交
2109
      code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
2110
      if (code != TSDB_CODE_SUCCESS || pMerger->pTSchema == NULL) {
H
Haojun Liao 已提交
2111 2112
        return code;
      }
2113 2114
    }

2115
    if (minKey == tsLast) {
2116
      TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2117
      if (init) {
2118
        tsdbRowMergerAdd(pMerger, fRow1, NULL);
H
Haojun Liao 已提交
2119
      } else {
2120
        init = true;
2121
        int32_t code = tsdbRowMergerAdd(pMerger, fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2122 2123 2124
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2125
      }
2126
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
2127 2128 2129
    }

    if (minKey == key) {
H
Haojun Liao 已提交
2130
      if (init) {
2131
        tsdbRowMergerAdd(pMerger, &fRow, NULL);
H
Haojun Liao 已提交
2132
      } else {
2133
        init = true;
2134
        int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2135 2136 2137
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2138
      }
H
Haojun Liao 已提交
2139
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
2140
    }
2141 2142
  }

2143
  int32_t code = tsdbRowMergerGetRow(pMerger, &pTSRow);
2144 2145 2146 2147
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

2148
  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);
2149 2150

  taosMemoryFree(pTSRow);
2151
  tsdbRowMergerClear(pMerger);
D
dapan1121 已提交
2152 2153

  return code;
2154 2155
}

2156 2157 2158
/**
 * Build one result row from the current row of the last (stt) block reader, optionally merging it
 * with the rows of the current file data block that carry the same timestamp.
 *
 * @param pLastBlockReader  reader positioned on the current row of the last block(s)
 * @param pReader           owner of the row merger, the dump info and the result block
 * @param pBlockScanInfo    scan state of the table being read; lastKey is updated when a row is copied directly
 * @param pBlockData        current file data block; only dereferenced when mergeBlockData is true
 * @param mergeBlockData    true when the file data block has to be taken into account
 * @return TSDB_CODE_SUCCESS, or the first error code reported by a callee
 */
static int32_t doMergeFileBlockAndLastBlock(SLastBlockReader* pLastBlockReader, STsdbReader* pReader,
                                            STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                            bool mergeBlockData) {
  SRowMerger*         pMerger = &pReader->status.merger;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  int64_t  tsLastBlock = getCurrentKeyInLastBlock(pLastBlockReader);
  int32_t  code = TSDB_CODE_SUCCESS;
  SRow*    pTSRow = NULL;
  TSDBROW* pRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);

  // keep a local copy of the row descriptor, the merge tree may move on while we are working
  TSDBROW fRow = {.iRow = pRow->iRow, .type = TSDBROW_COL_FMT, .pBlockData = pRow->pBlockData};

  tsdbTrace("fRow ptr:%p, %d, uid:%" PRIu64 ", %s", pRow->pBlockData, pRow->iRow, pLastBlockReader->uid, pReader->idStr);

  // the short-circuit keeps pBlockData untouched when the caller passes NULL with mergeBlockData == false
  bool sameKeyInFileBlock = mergeBlockData && (tsLastBlock == pBlockData->aTSKEY[pDumpInfo->rowIndex]);

  if (!sameKeyInFileBlock) {
    // only the last block contributes to this timestamp
    bool copied = false;
    code = tryCopyDistinctRowFromSttBlock(&fRow, pLastBlockReader, pBlockScanInfo, tsLastBlock, pReader, &copied);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (copied) {
      pBlockScanInfo->lastKey = tsLastBlock;
      return TSDB_CODE_SUCCESS;
    }

    code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // previously the result of this call was silently dropped
    TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
    code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, pMerger, &pReader->verRange, pReader->idStr);
  } else {
    // the last block and the file data block share the timestamp: merge both
    code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, pMerger, &pReader->verRange, pReader->idStr);

    // merge with block data if ts == key
    if (tsLastBlock == pBlockData->aTSKEY[pDumpInfo->rowIndex]) {
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }
  }

  // common tail: materialize the merged row and append it to the result block
  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);

  return code;
}

2238 2239
/**
 * Produce the next result row when only file-level data (file data block and/or last block) is involved,
 * i.e. neither of the two mem-table iterators is used here.
 *
 * @param pReader           reader that owns the row merger, dump info and result block
 * @param pLastBlockReader  reader over the last block(s)
 * @param key               timestamp of the current row in the file data block
 * @param pBlockScanInfo    scan state of the table being read
 * @param pBlockData        currently loaded file data block
 * @return TSDB_CODE_SUCCESS, or the first error code reported by a callee (terrno when the schema lookup fails)
 */
static int32_t mergeFileBlockAndLastBlock(STsdbReader* pReader, SLastBlockReader* pLastBlockReader, int64_t key,
                                          STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SRowMerger*         pMerger = &pReader->status.merger;

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    if (getTableSchemaImpl(pReader, pBlockScanInfo->uid) == NULL) {
      return terrno;
    }
  }

  // only last block exists
  if (!hasDataInFileBlock(pBlockData, pDumpInfo)) {
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, NULL, false);
  }

  // no last block available, only data block exists
  if (!hasDataInLastBlock(pLastBlockReader)) {
    return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
  }

  // current row in the file data block and current key in the last block
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
  int64_t ts = getCurrentKeyInLastBlock(pLastBlockReader);
  ASSERT(ts >= key);

  // desc order
  if (!ASCENDING_TRAVERSE(pReader->order)) {
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, pBlockData, true);
  }

  if (key < ts) {  // imem, mem are all empty, file blocks (data blocks and last block) exist
    return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
  }

  if (key > ts) {  // excluded by the assertion above, kept as a no-op for builds without assertions
    return TSDB_CODE_SUCCESS;
  }

  // key == ts: rows from the file data block and from the last block have to be merged
  SRow*   pTSRow = NULL;
  int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);

  // previously the result of this call was silently dropped
  TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
  code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, ts, pMerger, &pReader->verRange, pReader->idStr);

  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);
  return code;
}

2301 2302
/**
 * Merge the rows that share the next timestamp across all four sources - mem (iter), imem (iiter),
 * last block and file data block - and append the merged row to the result block.
 *
 * The sources are fed to the merger in a fixed order that depends on the scan direction:
 *   ASC : file block -> last block -> imem -> mem
 *   DESC: mem -> imem -> last block -> file block
 *
 * @param pReader           reader that owns the row merger, dump info and result block
 * @param pBlockScanInfo    scan state of the table (mem/imem iterators, delete skyline)
 * @param pBlockData        currently loaded file data block
 * @param pLastBlockReader  reader over the last block(s)
 * @return TSDB_CODE_SUCCESS, terrno when a schema cannot be resolved, or the first error code of a callee
 */
static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                     SLastBlockReader* pLastBlockReader) {
  SRowMerger*         pMerger = &pReader->status.merger;
  SRow*               pTSRow = NULL;
  int32_t             code = TSDB_CODE_SUCCESS;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SArray*             pDelList = pBlockScanInfo->delSkyline;
  bool                asc = ASCENDING_TRAVERSE(pReader->order);

  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);

  int64_t tsLast = INT64_MIN;
  if (hasDataInLastBlock(pLastBlockReader)) {
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }

  int64_t key = hasDataInFileBlock(pBlockData, pDumpInfo) ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN;

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

  // a failed schema lookup must be reported; returning `code` here would report success
  STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
  if (pSchema == NULL) {
    return terrno;
  }

  STSchema* piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
  if (piSchema == NULL) {
    return terrno;
  }

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    if (getTableSchemaImpl(pReader, pBlockScanInfo->uid) == NULL) {
      return terrno;
    }
  }

  // pick the timestamp to emit: the smallest one for ASC scans, the largest one for DESC scans
  int64_t minKey = 0;
  if (asc) {
    minKey = (k.ts < ik.ts) ? k.ts : ik.ts;

    if (minKey > key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
      minKey = key;
    }

    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
      minKey = tsLast;
    }
  } else {
    minKey = (k.ts > ik.ts) ? k.ts : ik.ts;

    if (minKey < key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
      minKey = key;
    }

    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
      minKey = tsLast;
    }
  }

  // `init` records whether the merger already received its first row; file/last block rows pass
  // pReader->pSchema only for the first row and NULL afterwards, mem/imem rows always pass their own schema
  bool init = false;

  if (asc) {
    // ASC: file block -----> last block -----> imem -----> mem
    if (minKey == key) {
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }

    if (minKey == tsLast) {
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      code = tsdbRowMergerAdd(pMerger, pRow1, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
    }

    if (minKey == ik.ts) {
      code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == k.ts) {
      code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
  } else {
    // DESC: mem -----> imem -----> last block -----> file block
    if (minKey == k.ts) {
      code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == ik.ts) {
      code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == tsLast) {
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      code = tsdbRowMergerAdd(pMerger, pRow1, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
    }

    if (minKey == key) {
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      code = tsdbRowMergerAdd(pMerger, &fRow, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }
  }

  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);
  return code;
}

2510 2511 2512 2513 2514 2515 2516 2517 2518
/**
 * Lazily create the mem (iter) and imem (iiter) iterators of one table and build its delete skyline.
 * The work is done once per table; later calls return immediately because iterInit is set.
 *
 * The iteration starts right after (ASC) or right before (DESC) pBlockScanInfo->lastKey.
 *
 * @param pBlockScanInfo  scan state of the table whose iterators are initialized
 * @param pReader         reader providing the read snapshot, order and version range
 * @return TSDB_CODE_SUCCESS, the error of tsdbTbDataIterCreate, or the error of initDelSkylineIterator
 */
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

  TSDBKEY startKey = {0};
  if (asc) {
    startKey = (TSDBKEY){.ts = pBlockScanInfo->lastKey + 1, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pBlockScanInfo->lastKey - 1, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!asc);

  // mem table
  STbData* d = NULL;
  if (pReader->pReadSnap->pMem != NULL) {
    d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
    if (d != NULL) {
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
      if (code != TSDB_CODE_SUCCESS) {
        // this branch handles the mem table, the message used to say "imem"
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }

      pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);

      tsdbDebug("%p uid:%" PRIu64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                "-%" PRId64 " %s",
                pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  // immutable mem table
  STbData* di = NULL;
  if (pReader->pReadSnap->pIMem != NULL) {
    di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
    if (di != NULL) {
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
      if (code != TSDB_CODE_SUCCESS) {
        // this branch handles the imem table, the message used to say "mem"
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }

      pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);

      tsdbDebug("%p uid:%" PRIu64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                "-%" PRId64 " %s",
                pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  int64_t st = taosGetTimestampUs();
  code = initDelSkylineIterator(pBlockScanInfo, pReader, d, di);
  pReader->cost.initDelSkylineIterTime += (taosGetTimestampUs() - st) / 1000.0;

  // as before, the iterators are marked as initialized even if the skyline could not be built, so they
  // are never created twice; the failure itself is now handed to the caller instead of being dropped
  pBlockScanInfo->iterInit = true;
  return code;
}

dengyihao's avatar
dengyihao 已提交
2579 2580
/**
 * Check whether the row at pDumpInfo->rowIndex of a file data block qualifies for the current scan.
 *
 * A row is rejected when it belongs to another table (multi-table block), when its version or timestamp
 * falls outside the reader's version range / time window, or when hasBeenDropped() reports it as deleted.
 *
 * @return true when the row passes every check
 */
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  // it is an multi-table data block: the row has to belong to the table being scanned
  if (pBlockData->aUid != NULL) {
    uint64_t rowUid = pBlockData->aUid[pDumpInfo->rowIndex];
    if (rowUid != pBlockScanInfo->uid) {
      return false;
    }
  }

  // version range check
  int64_t rowVer = pBlockData->aVersion[pDumpInfo->rowIndex];
  bool    verInRange = (rowVer >= pReader->verRange.minVer) && (rowVer <= pReader->verRange.maxVer);
  if (!verInRange) {
    return false;
  }

  // time window check
  int64_t rowTs = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  bool    tsInWindow = (rowTs >= pReader->window.skey) && (rowTs <= pReader->window.ekey);
  if (!tsInWindow) {
    return false;
  }

  // finally, consult the delete skyline
  return !hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, rowTs, rowVer, pReader->order,
                         &pReader->verRange);
}

2608
/**
 * Bind the last block reader to the table described by pScanInfo and position it on the first row.
 *
 * When the reader is already bound to this table nothing is re-opened. Otherwise a merge tree opened for
 * another table is closed first, the mem iterators of the table are initialized and a new merge tree is
 * opened over a window that starts (ASC) or ends (DESC) at pScanInfo->lastKeyInStt.
 *
 * @return true when the last block(s) hold at least one row for the table, false otherwise
 *         (also false when the merge tree cannot be opened)
 */
static bool initLastBlockReader(SLastBlockReader* pLBlockReader, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  // the last block reader has been initialized for this table.
  if (pLBlockReader->uid == pScanInfo->uid) {
    return hasDataInLastBlock(pLBlockReader);
  }

  // release the merge tree that served the previous table
  if (pLBlockReader->uid != 0) {
    tMergeTreeClose(&pLBlockReader->mergeTree);
  }

  // NOTE(review): the result of initMemDataIterator is not inspected here
  initMemDataIterator(pScanInfo, pReader);
  pLBlockReader->uid = pScanInfo->uid;

  // narrow the scan window so that it resumes from the last key consumed from the stt files
  STimeWindow win = pLBlockReader->window;
  if (ASCENDING_TRAVERSE(pLBlockReader->order)) {
    win.skey = pScanInfo->lastKeyInStt;
  } else {
    win.ekey = pScanInfo->lastKeyInStt;
  }

  tsdbDebug("init last block reader, window:%" PRId64 "-%" PRId64 ", uid:%" PRIu64 ", %s", win.skey, win.ekey,
            pScanInfo->uid, pReader->idStr);

  int32_t ret = tMergeTreeOpen(&pLBlockReader->mergeTree, (pLBlockReader->order == TSDB_ORDER_DESC),
                               pReader->pFileReader, pReader->suid, pScanInfo->uid, &win, &pLBlockReader->verRange,
                               pLBlockReader->pInfo, false, pReader->idStr, false, pReader->status.pLDataIter);
  if (ret != TSDB_CODE_SUCCESS) {
    return false;
  }

  return nextRowFromLastBlocks(pLBlockReader, pScanInfo, &pReader->verRange);
}

H
Hongze Cheng 已提交
2640
// The last block reader has data as long as its merge tree still exposes an iterator.
static bool hasDataInLastBlock(SLastBlockReader* pLastBlockReader) {
  return pLastBlockReader->mergeTree.pIter != NULL;
}
2641

2642
bool hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo) {
H
Haojun Liao 已提交
2643
  if ((pBlockData->nRow > 0) && (pBlockData->nRow != pDumpInfo->totalRows)) {
2644
    return false;  // this is an invalid result.
2645
  }
2646
  return pBlockData->nRow > 0 && (!pDumpInfo->allDumped);
2647
}
2648

2649 2650
/**
 * Emit the next result row from the file data block only.
 *
 * First tries to copy the row straight into the result block (tryCopyDistinctRowFromFileBlock); when the
 * row was not copied, it is pushed through the row merger together with the following rows merged by
 * doMergeRowsInFileBlocks, and the merged row is appended to the result block.
 *
 * @param key  timestamp of the current row in the file data block; stored as lastKey when the row is copied
 * @return TSDB_CODE_SUCCESS, terrno when the table schema cannot be loaded, or the error of a callee
 */
int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                              STsdbReader* pReader) {
  SRowMerger*         pMerger = &pReader->status.merger;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  bool    copied = false;
  int32_t code = tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo, &copied);
  if (code) {
    return code;
  }

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    if (getTableSchemaImpl(pReader, pBlockScanInfo->uid) == NULL) {
      return terrno;
    }
  }

  // fast path: the row has already been placed into the result block
  if (copied) {
    pBlockScanInfo->lastKey = key;
    return TSDB_CODE_SUCCESS;
  }

  // slow path: run the row through the merger
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
  SRow*   pMerged = NULL;

  code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);

  code = tsdbRowMergerGetRow(pMerger, &pMerged);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pMerged, pBlockScanInfo);

  taosMemoryFree(pMerged);
  tsdbRowMergerClear(pMerger);
  return code;
}

H
Haojun Liao 已提交
2694 2695
/**
 * Produce one result row by dispatching to the merge routine that matches the data sources
 * currently holding rows: mem + imem, imem only, mem only, or file-level data only.
 *
 * The hasVal flags are re-read after getValidMemRow() has been called, in the same order as before.
 *
 * @return the result of the selected merge routine
 */
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo,
                                          SBlockData* pBlockData, SLastBlockReader* pLastBlockReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  // timestamp of the current file block row, INT64_MIN when the block has nothing left
  int64_t key = INT64_MIN;
  if (pBlockData->nRow > 0 && (!pDumpInfo->allDumped)) {
    key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  }

  TSDBROW* pRow = NULL;
  if (pBlockScanInfo->iter.hasVal) {
    pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  }

  TSDBROW* piRow = NULL;
  if (pBlockScanInfo->iiter.hasVal) {
    piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
  }

  // two levels of mem-table does contain the valid rows
  if (pRow != NULL && piRow != NULL) {
    return doMergeMultiLevelRows(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
  }

  // imem + file + last block
  if (pBlockScanInfo->iiter.hasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, key, pLastBlockReader);
  }

  // mem + file + last block
  if (pBlockScanInfo->iter.hasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, key, pLastBlockReader);
  }

  // files data blocks + last block
  return mergeFileBlockAndLastBlock(pReader, pLastBlockReader, key, pBlockScanInfo, pBlockData);
}

H
Haojun Liao 已提交
2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766
/**
 * Load the neighbor file block of the same table when it overlaps with the current block.
 *
 * @param pBlockInfo      info of the current file block
 * @param pBlockScanInfo  scan state of the table
 * @param pReader         reader owning the block iterator and the file block buffer
 * @param loadNeighbor    [out] set to true only when the neighbor block has been loaded
 * @return TSDB_CODE_SUCCESS, or the error reported by doLoadFileBlockData
 */
static int32_t loadNeighborIfOverlap(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pBlockScanInfo,
                                     STsdbReader* pReader, bool* loadNeighbor) {
  int32_t     step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
  int32_t     neighborTbIdx = -1;
  SBlockIndex neighborBIndex = {0};

  *loadNeighbor = false;
  SDataBlk* pCurrent = getCurrentBlock(&pReader->status.blockIter);

  // nothing to do without a neighbor, or when the neighbor does not overlap with the current block
  if (!getNeighborBlockOfSameTable(pBlockInfo, pBlockScanInfo, &neighborTbIdx, pReader->order, &neighborBIndex)) {
    return TSDB_CODE_SUCCESS;
  }

  if (!overlapWithNeighborBlock(pCurrent, &neighborBIndex, pReader->order)) {
    return TSDB_CODE_SUCCESS;
  }

  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pBlockIter = &pStatus->blockIter;

  // 1. find the next neighbor block in the scan block list
  SFileDataBlockInfo target = {.uid = pBlockInfo->uid, .tbBlockIdx = neighborTbIdx};
  int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &target);

  // 2. remove it from the scan block list
  setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);

  // 3. load the neighbor block, and set it to be the currently accessed file data block
  int32_t code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pBlockInfo->uid);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // 4. check the data values
  initBlockDumpInfo(pReader, pBlockIter);
  *loadNeighbor = true;

  return code;
}

2767
/**
 * Finalize the metadata of a composed result block and account for it in the reader's cost statistics.
 *
 * @param pReader         reader owning the result block
 * @param el              time spent building the block, added to cost.buildComposedBlockTime
 * @param pBlockScanInfo  table the block belongs to; may be NULL, in which case the block uid is set to 0
 */
static void updateComposedBlockInfo(STsdbReader* pReader, double el, STableBlockScanInfo* pBlockScanInfo) {
  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  uint64_t uid = 0;
  if (pBlockScanInfo != NULL) {
    uid = pBlockScanInfo->uid;
  }

  pResBlock->info.id.uid = uid;
  pResBlock->info.dataLoad = 1;
  blockDataUpdateTsWindow(pResBlock, pReader->suppInfo.slotId[0]);

  setComposedBlockFlag(pReader, true);

  pReader->cost.buildComposedBlockTime += el;
  pReader->cost.composedBlocks += 1;
}

2780
/**
 * Fill the reader's result block with rows composed from the current file data block, the last block(s)
 * and the mem/imem buffers of one table.
 *
 * A clean file block that fits into the result block is copied as a whole; otherwise rows are produced
 * one by one through buildComposedDataBlockImpl() until the file block is consumed, the result block is
 * full, or no valid row is left in the file block.
 *
 * Tables listed in pReader->pIgnoreTables are skipped: the current block is marked as dumped and the
 * function returns without touching the result block metadata.
 *
 * @return TSDB_CODE_SUCCESS, or the first error encountered
 */
static int32_t buildComposedDataBlock(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;

  bool                asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t             st = taosGetTimestampUs();
  int32_t             step = asc ? 1 : -1;
  double              el = 0;
  SDataBlk*           pBlock = getCurrentBlock(&pReader->status.blockIter);
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  STableBlockScanInfo* pBlockScanInfo = NULL;
  if (pBlockInfo != NULL) {
    if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      return code;
    }

    pBlockScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr);
    if (pBlockScanInfo == NULL) {
      // report the failed lookup instead of returning success with an empty block
      // NOTE(review): assumes getTableBlockScanInfo sets terrno on failure - confirm
      code = terrno;
      goto _end;
    }

    TSDBKEY keyInBuf = getCurrentKeyInBuf(pBlockScanInfo, pReader);

    // it is a clean block, load it directly
    if (isCleanFileDataBlock(pReader, pBlockInfo, pBlock, pBlockScanInfo, keyInBuf, pLastBlockReader) &&
        pBlock->nRow <= pReader->resBlockInfo.capacity) {
      // for a descending scan the block may only be copied as a whole when the last block holds no data
      if (asc || (!hasDataInLastBlock(pLastBlockReader))) {
        code = copyBlockDataToSDataBlock(pReader);
        if (code) {
          goto _end;
        }

        // record the last key value
        pBlockScanInfo->lastKey = asc ? pBlock->maxKey.ts : pBlock->minKey.ts;
        goto _end;
      }
    }
  } else {  // file blocks not exist
    pBlockScanInfo = *pReader->status.pTableIter;
    if (pReader->pIgnoreTables &&
        taosHashGet(*pReader->pIgnoreTables, &pBlockScanInfo->uid, sizeof(pBlockScanInfo->uid))) {
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      return code;
    }
  }

  SBlockData* pBlockData = &pReader->status.fileBlockData;

  while (1) {
    bool hasBlockData = false;

    // find the first qualified row in data block
    while (pBlockData->nRow > 0 && pBlockData->uid == pBlockScanInfo->uid) {
      if (isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
        hasBlockData = true;
        break;
      }

      pDumpInfo->rowIndex += step;

      SDataBlk* pCurBlock = getCurrentBlock(&pReader->status.blockIter);
      if (pDumpInfo->rowIndex >= pCurBlock->nRow || pDumpInfo->rowIndex < 0) {
        pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);  // NOTE: get the new block info

        // continue check for the next file block if the last ts in the current block
        // is overlapped with the next neighbor block
        bool loadNeighbor = false;
        code = loadNeighborIfOverlap(pBlockInfo, pBlockScanInfo, pReader, &loadNeighbor);
        if ((!loadNeighbor) || (code != 0)) {
          setBlockAllDumped(pDumpInfo, pCurBlock->maxKey.ts, pReader->order);
          break;
        }
      }
    }

    // no qualified row was found in the file data block, no need to proceed.
    if (!hasBlockData) {
      break;
    }

    code = buildComposedDataBlockImpl(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
    if (code) {
      goto _end;
    }

    // currently loaded file data block is consumed
    if ((pBlockData->nRow > 0) && (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0)) {
      SDataBlk* pDoneBlock = getCurrentBlock(&pReader->status.blockIter);
      setBlockAllDumped(pDumpInfo, pDoneBlock->maxKey.ts, pReader->order);
      break;
    }

    // the result block is full
    if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
      break;
    }
  }

_end:
  el = (taosGetTimestampUs() - st) / 1000.0;
  updateComposedBlockInfo(pReader, el, pBlockScanInfo);

  if (pResBlock->info.rows > 0) {
    tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
              ", elapsed time:%.2f ms %s",
              pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
              pResBlock->info.rows, el, pReader->idStr);
  }

  return code;
}

// Store `composed` in the reader's status.composedDataBlock flag.
void setComposedBlockFlag(STsdbReader* pReader, bool composed) {
  pReader->status.composedDataBlock = composed;
}

2899 2900 2901 2902 2903 2904 2905 2906
// Return the index at which iteration over the delete skyline starts:
// 0 when there is no skyline or the scan is ascending, otherwise the index of the last entry.
int32_t getInitialDelIndex(const SArray* pDelSkyline, int32_t order) {
  if (pDelSkyline == NULL) {
    return 0;
  }

  if (ASCENDING_TRAVERSE(order)) {
    return 0;
  }

  // descending scan: start from the tail of the skyline
  return taosArrayGetSize(pDelSkyline) - 1;
}

dengyihao's avatar
dengyihao 已提交
2907 2908
// Build the delete skyline of one table and position the delete-index cursors.
//
// Nothing is done when pBlockScanInfo->delSkyline already exists. Otherwise the delete records are
// gathered from the del file (only when the reader has a del file and a non-empty del index) and from
// the delete lists hanging off pMemTbData / piMemTbData (either may be NULL). When at least one record
// was collected, they are merged by tsdbBuildDeleteSkyline() into pBlockScanInfo->delSkyline.
//
// Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY when an array cannot be allocated, or the error
// reported by tsdbReadDelData()/tsdbBuildDeleteSkyline(). When reading the del file or an allocation
// fails, the cursors are left untouched.
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;
  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));
  if (pDelData == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  // delete records persisted in the del file
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
  if (pDelFile && taosArrayGetSize(pReader->pDelIdx) > 0) {
    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
    SDelIdx* pIdx = taosArraySearch(pReader->pDelIdx, &idx, tCmprDelIdx, TD_EQ);

    if (pIdx != NULL) {
      code = tsdbReadDelData(pReader->pDelFReader, pIdx, pDelData);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  }

  // delete records that are still in memory: mem first, then imem
  STbData* aTbData[2] = {pMemTbData, piMemTbData};
  for (int32_t i = 0; i < 2; ++i) {
    if (aTbData[i] == NULL) {
      continue;
    }

    for (SDelData* p = aTbData[i]->pHead; p != NULL; p = p->pNext) {
      taosArrayPush(pDelData, p);
    }
  }

  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    if (pBlockScanInfo->delSkyline == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _err;
    }

    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);

  // all cursors over the skyline start from the same position
  int32_t index = getInitialDelIndex(pBlockScanInfo->delSkyline, pReader->order);

  pBlockScanInfo->iter.index = index;
  pBlockScanInfo->iiter.index = index;
  pBlockScanInfo->fileDelIndex = index;
  pBlockScanInfo->lastBlockDelIndex = index;

  return code;

_err:
  taosArrayDestroy(pDelData);
  return code;
}

C
Cary Xu 已提交
2966
// Return the key of the next valid row buffered in memory for this table.
//
// Both the mem iterator (iter) and the imem iterator (iiter) are probed with getValidMemRow():
//   - neither has a row: a key whose ts is TSKEY_INITIAL_VAL is returned;
//   - only one has a row: that row's key is returned;
//   - both have a row: the one with the smaller ts is returned for an ascending scan and the one with
//     the larger ts for a descending scan; on equal ts the mem key wins in ascending order and the
//     imem key in descending order.
TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  bool asc = ASCENDING_TRAVERSE(pReader->order);

  TSDBKEY memKey = {.ts = TSKEY_INITIAL_VAL};
  TSDBKEY imemKey = {.ts = TSKEY_INITIAL_VAL};

  TSDBROW* pMemRow = getValidMemRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
  if (pMemRow != NULL) {
    memKey = TSDBROW_KEY(pMemRow);
  }

  TSDBROW* pIMemRow = getValidMemRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
  if (pIMemRow != NULL) {
    imemKey = TSDBROW_KEY(pIMemRow);
  }

  if (pMemRow == NULL) {
    // either imem holds the only row, or nothing is buffered and imemKey is still the initial value
    return imemKey;
  }

  if (pIMemRow == NULL) {
    return memKey;
  }

  // rows exist in both mem and imem
  bool memNotLater = (memKey.ts <= imemKey.ts);
  if (asc) {
    return memNotLater ? memKey : imemKey;
  }

  return memNotLater ? imemKey : memKey;
}

3002
// Advance the fileset iterator until a fileset with data for the queried tables is found, or until the
// filesets are exhausted.
//
// For every fileset the block index is loaded with doLoadBlockIndex(); when the index is not empty or
// the fileset has stt files, doLoadFileBlock() fills pBlockNum and pTableList. The search stops at the
// first fileset for which pBlockNum->numOfBlocks + pBlockNum->numOfLastFiles > 0. When no such fileset
// exists, both counters are left at 0.
//
// Afterwards, if the read snapshot has a del file that has not been opened yet, the del file reader is
// opened and its index is loaded into pReader->pDelIdx.
//
// Returns TSDB_CODE_SUCCESS, pReader->code if the reader has been stopped, TSDB_CODE_OUT_OF_MEMORY, or
// the error code of the failing helper.
static int32_t moveToNextFile(STsdbReader* pReader, SBlockNumber* pBlockNum, SArray* pTableList) {
  SReaderStatus* pStatus = &pReader->status;
  pBlockNum->numOfBlocks = 0;
  pBlockNum->numOfLastFiles = 0;

  size_t  numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBlockIdx));
  if (pIndexList == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  // every exit from this loop goes through the single taosArrayDestroy(pIndexList) below
  while (1) {
    // only check here, since the iterate data in memory is very fast.
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      code = pReader->code;
      break;
    }

    bool hasNext = false;
    code = filesetIteratorNext(&pStatus->fileIter, pReader, &hasNext);
    if (code != TSDB_CODE_SUCCESS) {
      break;
    }

    if (!hasNext) {  // no data files on disk
      break;
    }

    taosArrayClear(pIndexList);
    code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
    if (code != TSDB_CODE_SUCCESS) {
      break;
    }

    if (taosArrayGetSize(pIndexList) > 0 || pReader->pFileReader->pSet->nSttF > 0) {
      code = doLoadFileBlock(pReader, pIndexList, pBlockNum, pTableList);
      if (code != TSDB_CODE_SUCCESS) {
        break;
      }

      if (pBlockNum->numOfBlocks + pBlockNum->numOfLastFiles > 0) {
        break;
      }
    }

    // no blocks in current file, try next files
  }

  taosArrayDestroy(pIndexList);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  if (pReader->pReadSnap != NULL) {
    SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
    if (pReader->pDelFReader == NULL && pDelFile != NULL) {
      code = tsdbDelFReaderOpen(&pReader->pDelFReader, pDelFile, pReader->pTsdb);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      pReader->pDelIdx = taosArrayInit(4, sizeof(SDelIdx));
      if (pReader->pDelIdx == NULL) {
        return TSDB_CODE_OUT_OF_MEMORY;
      }

      code = tsdbReadDelIdx(pReader->pDelFReader, pReader->pDelIdx);
      if (code != TSDB_CODE_SUCCESS) {
        taosArrayDestroy(pReader->pDelIdx);
        pReader->pDelIdx = NULL;  // do not leave a dangling pointer on the reader
        return code;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}

X
Xiaoyu Wang 已提交
3077
// Rewind the ordered uid list to its first entry and point pStatus->pTableIter at the table-map slot
// of that uid.
static void resetTableListIndex(SReaderStatus* pStatus) {
  STableUidList* pUidList = &pStatus->uidList;
  pUidList->currentIndex = 0;

  uint64_t firstUid = pUidList->tableUidList[0];
  pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &firstUid, sizeof(firstUid));
}

3085
// Step to the next uid of the ordered uid list and look it up in the table map.
// Returns true when pStatus->pTableIter refers to a table afterwards; returns false (with
// pStatus->pTableIter == NULL) when the list is exhausted or the uid is not in the map.
static bool moveToNextTable(STableUidList* pOrderedCheckInfo, SReaderStatus* pStatus) {
  ++pOrderedCheckInfo->currentIndex;

  if (pOrderedCheckInfo->currentIndex < tSimpleHashGetSize(pStatus->pTableMap)) {
    uint64_t nextUid = pOrderedCheckInfo->tableUidList[pOrderedCheckInfo->currentIndex];
    pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &nextUid, sizeof(nextUid));
    return (pStatus->pTableIter != NULL);
  }

  // walked past the last table
  pStatus->pTableIter = NULL;
  return false;
}

3097
// Fill the result block from the last (stt) files, visiting the tables one by one starting at
// pStatus->pTableIter.
//
// Tables found in pReader->pIgnoreTables, and tables for which initLastBlockReader() reports no data,
// are skipped. The function returns as soon as the result block holds rows, when no table is left, when
// the reader has been stopped (pReader->code) or when buildComposedDataBlockImpl() fails.
static int32_t doLoadLastBlockSequentially(STsdbReader* pReader) {
  SReaderStatus*    pStatus = &pReader->status;
  SLastBlockReader* pLastBlockReader = pStatus->fileIter.pLastBlockReader;
  STableUidList*    pUidList = &pStatus->uidList;
  int32_t           code = TSDB_CODE_SUCCESS;

  if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
    return TSDB_CODE_SUCCESS;
  }

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  for (;;) {
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      return pReader->code;
    }

    // load the last data block of current table
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;
    bool                 ignored =
        pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pScanInfo->uid, sizeof(pScanInfo->uid));

    // reset the index in last block when handling a new file
    doCleanupTableScanInfo(pScanInfo);
    pStatus->mapDataCleaned = true;

    // an ignored table is skipped without initializing the last block reader
    if (ignored || !initLastBlockReader(pLastBlockReader, pScanInfo, pReader)) {
      if (!moveToNextTable(pUidList, pStatus)) {
        return TSDB_CODE_SUCCESS;
      }

      continue;
    }

    int64_t st = taosGetTimestampUs();

    // keep composing rows until the last block is drained or the result block is full
    while (hasDataInLastBlock(pLastBlockReader)) {
      code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
      if (code) {
        return code;
      }

      if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
        break;
      }
    }

    double el = (taosGetTimestampUs() - st) / 1000.0;
    updateComposedBlockInfo(pReader, el, pScanInfo);

    if (pResBlock->info.rows > 0) {
      tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
                ", elapsed time:%.2f ms %s",
                pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                pResBlock->info.rows, el, pReader->idStr);
      return TSDB_CODE_SUCCESS;
    }

    // current table is exhausted, let's try next table
    if (!moveToNextTable(pUidList, pStatus)) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

3182
// Produce the next result block for the file block the block iterator currently points at.
//
// One of four paths is taken:
//   1. fileBlockShouldLoad() is true: the file block is loaded and a composed block is built;
//   2. bufferDataInFileBlockGap() is true: rows are taken from the in-memory buffer up to the edge of
//      the file block;
//   3. descending scan with data in the last block: only rows of the last block are returned;
//   4. otherwise the file block is handed out as a whole: only the block info (rows, uid, window) is
//      filled, dataLoad is set to 0 and the block is marked as completely dumped.
//
// A block of a table listed in pReader->pIgnoreTables is marked as dumped and skipped.
// Returns pReader->code when it is set, terrno when the table's scan info cannot be found, otherwise
// the code of the path that was taken.
static int32_t doBuildDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  int32_t             code = TSDB_CODE_SUCCESS;

  if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
    setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);
    return code;
  }

  if (pReader->code != TSDB_CODE_SUCCESS) {
    return pReader->code;
  }

  STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr);
  if (pScanInfo == NULL) {
    return terrno;
  }

  initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
  TSDBKEY keyInBuf = getCurrentKeyInBuf(pScanInfo, pReader);

  if (fileBlockShouldLoad(pReader, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader)) {
    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pScanInfo->uid);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // build composed data block
    code = buildComposedDataBlock(pReader);
  } else if (bufferDataInFileBlockGap(pReader->order, keyInBuf, pBlock)) {
    // data in memory that are earlier than current file block
    // rows in buffer should be less than the file block in asc, greater than file block in desc
    int64_t endKey = pBlock->maxKey.ts;
    if (ASCENDING_TRAVERSE(pReader->order)) {
      endKey = pBlock->minKey.ts;
    }

    code = buildDataBlockFromBuf(pReader, pScanInfo, endKey);
  } else if (hasDataInLastBlock(pLastBlockReader) && !ASCENDING_TRAVERSE(pReader->order)) {
    // only return the rows in last block
    int64_t tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
    ASSERT(tsLast >= pBlock->maxKey.ts);

    SBlockData* pBData = &pReader->status.fileBlockData;
    tBlockDataReset(pBData);

    SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
    tsdbDebug("load data in last block firstly, due to desc scan data, %s", pReader->idStr);

    int64_t st = taosGetTimestampUs();

    // compose rows until the last block is drained or the result block is full
    while (hasDataInLastBlock(pLastBlockReader)) {
      code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
      if (code) {
        return code;
      }

      if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
        break;
      }
    }

    double el = (taosGetTimestampUs() - st) / 1000.0;
    updateComposedBlockInfo(pReader, el, pScanInfo);

    if (pResBlock->info.rows > 0) {
      tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
                ", elapsed time:%.2f ms %s",
                pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                pResBlock->info.rows, el, pReader->idStr);
    }
  } else {
    // whole block is required, return it directly
    SDataBlockInfo* pInfo = &pReader->resBlockInfo.pResBlock->info;
    pInfo->rows = pBlock->nRow;
    pInfo->id.uid = pScanInfo->uid;
    pInfo->dataLoad = 0;
    pInfo->window = (STimeWindow){.skey = pBlock->minKey.ts, .ekey = pBlock->maxKey.ts};
    setComposedBlockFlag(pReader, false);
    setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);

    // update the last key for the corresponding table
    pScanInfo->lastKey = ASCENDING_TRAVERSE(pReader->order) ? pInfo->window.ekey : pInfo->window.skey;
    tsdbDebug("%p uid:%" PRIu64
              " clean file block retrieved from file, global index:%d, "
              "table index:%d, rows:%d, brange:%" PRId64 "-%" PRId64 ", %s",
              pReader, pScanInfo->uid, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->nRow, pBlock->minKey.ts,
              pBlock->maxKey.ts, pReader->idStr);
  }

  // an error recorded on the reader takes precedence over the local result
  if (pReader->code != TSDB_CODE_SUCCESS) {
    return pReader->code;
  }

  return code;
}

D
dapan1121 已提交
3286
// Add the row counts of all data blocks in the current data file that belong to the queried tables to
// pReader->rowsNum.
//
// The block index is fetched through the block-index cache. Entries whose suid differs from
// pReader->suid, or whose uid is not in the reader's table map, are skipped. For every remaining entry
// the block map is read into the table's scan info and the nRow of each block is accumulated.
//
// Returns TSDB_CODE_SUCCESS, or the error of tsdbCacheGetBlockIdx()/tsdbReadDataBlk(). The cache handle
// is released on every path.
static int32_t doSumFileBlockRows(STsdbReader* pReader, SDataFReader* pFileReader) {
  LRUHandle* handle = NULL;
  int32_t    code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
  if (code != TSDB_CODE_SUCCESS || handle == NULL) {
    goto _end;
  }

  SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
  size_t  num = taosArrayGetSize(aBlockIdx);

  // an empty index simply falls through the loop with code == TSDB_CODE_SUCCESS
  for (size_t i = 0; i < num; ++i) {
    SBlockIdx* pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);
    if (pBlockIdx->suid != pReader->suid) {
      continue;
    }

    STableBlockScanInfo** p = tSimpleHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(pBlockIdx->uid));
    if (p == NULL) {
      continue;
    }

    STableBlockScanInfo* pScanInfo = *p;
    tMapDataReset(&pScanInfo->mapData);

    // read with the same file reader the block index was fetched for; do not count rows of a block map
    // that could not be read
    code = tsdbReadDataBlk(pFileReader, pBlockIdx, &pScanInfo->mapData);
    if (code != TSDB_CODE_SUCCESS) {
      goto _end;
    }

    SDataBlk block = {0};
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      tGetDataBlk(pScanInfo->mapData.pData + pScanInfo->mapData.aOffset[j], &block);
      pReader->rowsNum += block.nRow;
    }
  }

_end:
  tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
  return code;
}

D
dapan1121 已提交
3331
// Add the row counts of the stt blocks of the current fileset that belong to pReader->suid to
// pReader->rowsNum.
//
// The stt block list of every stt file is read into its load info. When the first and last entries of
// a list share one suid, the list is either counted completely or, if that suid is not the queried one,
// cleared. Otherwise the list is walked: entries with a smaller suid are skipped and the walk stops at
// the first entry with a larger suid.
// Returns the error of tsdbReadSttBlk() or TSDB_CODE_SUCCESS.
static int32_t doSumSttBlockRows(STsdbReader* pReader) {
  int32_t           code = TSDB_CODE_SUCCESS;
  SLastBlockReader* pLastBlockReader = pReader->status.fileIter.pLastBlockReader;

  for (int32_t i = 0; i < pReader->pFileReader->pSet->nSttF; ++i) {  // open all last file
    SSttBlockLoadInfo* pLoadInfo = &pLastBlockReader->pInfo[i];

    code = tsdbReadSttBlk(pReader->pFileReader, i, pLoadInfo->aSttBlk);
    if (code) {
      return code;
    }

    size_t size = taosArrayGetSize(pLoadInfo->aSttBlk);
    if (size < 1) {
      continue;
    }

    SSttBlk* pFirst = taosArrayGet(pLoadInfo->aSttBlk, 0);
    SSttBlk* pLast = taosArrayGet(pLoadInfo->aSttBlk, size - 1);

    if (pFirst->suid == pLast->suid) {
      // all identical
      if (pFirst->suid != pReader->suid) {
        // no qualified stt block existed
        taosArrayClear(pLoadInfo->aSttBlk);
        continue;
      }

      for (int32_t j = 0; j < size; ++j) {
        SSttBlk* pBlk = taosArrayGet(pLoadInfo->aSttBlk, j);
        pReader->rowsNum += pBlk->nRow;
      }

      continue;
    }

    // blocks of several super tables: only count those of the queried one
    for (int32_t j = 0; j < size; ++j) {
      SSttBlk* pBlk = taosArrayGet(pLoadInfo->aSttBlk, j);
      uint64_t suid = pBlk->suid;

      if (suid > pReader->suid) {
        break;
      }

      if (suid == pReader->suid) {
        pReader->rowsNum += pBlk->nRow;
      }
    }
  }

  return code;
}

D
dapan1121 已提交
3381
// Walk over all remaining filesets and accumulate the number of rows stored in their data blocks and
// stt blocks into pReader->rowsNum. Once every fileset has been visited, status.loadFromFile is
// cleared. Returns the first error reported by a helper, without clearing the flag in that case.
static int32_t readRowsCountFromFiles(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  for (;;) {
    bool hasNext = false;

    code = filesetIteratorNext(&pReader->status.fileIter, pReader, &hasNext);
    if (code) {
      return code;
    }

    if (!hasNext) {  // no data files on disk
      pReader->status.loadFromFile = false;
      return code;
    }

    code = doSumFileBlockRows(pReader, pReader->pFileReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    code = doSumSttBlockRows(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }
}

D
dapan1121 已提交
3411
// Add the number of rows that tsdbMemTableCountRows() reports for the snapshot's mem and imem tables
// (each only when present) to pReader->rowsNum. Always returns TSDB_CODE_SUCCESS.
static int32_t readRowsCountFromMem(STsdbReader* pReader) {
  int64_t rowsInMem = 0;
  int64_t rowsInIMem = 0;

  if (pReader->pReadSnap->pMem != NULL) {
    tsdbMemTableCountRows(pReader->pReadSnap->pMem, pReader->status.pTableMap, &rowsInMem);
  }

  if (pReader->pReadSnap->pIMem != NULL) {
    tsdbMemTableCountRows(pReader->pReadSnap->pIMem, pReader->status.pTableMap, &rowsInIMem);
  }

  pReader->rowsNum += rowsInMem + rowsInIMem;
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
3427
// Fill the result block from the in-memory buffers, visiting the tables one by one starting at
// pStatus->pTableIter.
//
// Tables listed in pReader->pIgnoreTables are skipped. The function returns as soon as the result block
// holds rows, when no table is left, when the reader has been stopped (pReader->code) or when
// buildDataBlockFromBuf() fails.
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;
  STableUidList* pUidList = &pStatus->uidList;

  while (1) {
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      return pReader->code;
    }

    STableBlockScanInfo** pBlockScanInfo = pStatus->pTableIter;
    if (pReader->pIgnoreTables &&
        taosHashGet(*pReader->pIgnoreTables, &(*pBlockScanInfo)->uid, sizeof((*pBlockScanInfo)->uid))) {
      bool hasNexTable = moveToNextTable(pUidList, pStatus);
      if (!hasNexTable) {
        return TSDB_CODE_SUCCESS;
      }

      // restart the iteration so that the new table is picked up from pStatus->pTableIter and is
      // itself checked against the ignore list; falling through would scan the ignored table
      continue;
    }

    initMemDataIterator(*pBlockScanInfo, pReader);

    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? INT64_MAX : INT64_MIN;
    int32_t code = buildDataBlockFromBuf(pReader, *pBlockScanInfo, endKey);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->resBlockInfo.pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }

    // current table is exhausted, let's try next table
    bool hasNexTable = moveToNextTable(pUidList, pStatus);
    if (!hasNexTable) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

3465
// set the correct start position in case of the first/last file block, according to the query time window
3466
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
3467 3468 3469 3470
  int64_t             lastKey = ASCENDING_TRAVERSE(pReader->order) ? INT64_MIN : INT64_MAX;
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  if (pBlockInfo) {
H
Haojun Liao 已提交
3471
    STableBlockScanInfo* pScanInfo = tSimpleHashGet(pBlockIter->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
3472 3473 3474
    if (pScanInfo) {
      lastKey = pScanInfo->lastKey;
    }
3475
  }
3476 3477 3478
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
3479 3480 3481

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
3482
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
3483
  pDumpInfo->lastKey = lastKey;
3484 3485
}

3486
// Move on to the next fileset that has data and prepare the block iterator for it.
//
// When no fileset with data is left, status.loadFromFile is cleared. Otherwise the block iterator is
// initialized from the data blocks; if the fileset only has last-file data, the block data, the block
// iterator and the table list index are reset instead. The dump info is initialized in both cases.
// Returns the error of moveToNextFile()/initBlockIterator() or TSDB_CODE_SUCCESS.
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
  SBlockNumber blockNum = {0};
  SArray*      pTables = taosArrayInit(40, POINTER_BYTES);

  int32_t code = moveToNextFile(pReader, &blockNum, pTables);
  if (code == TSDB_CODE_SUCCESS) {
    if (blockNum.numOfBlocks + blockNum.numOfLastFiles == 0) {
      // all data files are consumed, try data in buffer
      pReader->status.loadFromFile = false;
    } else {
      // initialize the block iterator for a new fileset
      if (blockNum.numOfBlocks > 0) {
        code = initBlockIterator(pReader, pBlockIter, blockNum.numOfBlocks, pTables);
      } else {
        // no block data, only last block exists
        tBlockDataReset(&pReader->status.fileBlockData);
        resetDataBlockIterator(pBlockIter, pReader->order);
        resetTableListIndex(&pReader->status);
      }

      // set the correct start position according to the query time window
      initBlockDumpInfo(pReader, pBlockIter);
    }
  }

  // single release point for the temporary table list
  taosArrayDestroy(pTables);
  return code;
}

3518
// Tell whether the current file block has been started but not finished: it is not marked as dumped
// and the row cursor has left its start position (row 0 for an ascending scan, the last row for a
// descending scan).
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
  if (pDumpInfo->allDumped) {
    return false;
  }

  if (asc) {
    return pDumpInfo->rowIndex > 0;
  }

  return pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1);
}

3523 3524 3525 3526
// Result of doReadDataFromLastFiles(); tells buildBlockFromFiles() how to go on.
typedef enum {
  TSDB_READ_RETURN = 0x1,    // stop: the caller returns terrno immediately
  TSDB_READ_CONTINUE = 0x2,  // data blocks exist in the new fileset: the caller goes on to build a data block
} ERetrieveType;
3527

3528 3529 3530
// Read rows from the last files of the current fileset and, once they are exhausted, advance to the
// following filesets.
//
// Returns TSDB_READ_RETURN when the caller has to return: rows were produced, an error occurred, or no
// file data is left. terrno carries the outcome (0 on success, the error code otherwise).
// Returns TSDB_READ_CONTINUE when the next fileset contains data blocks that have to be processed.
static ERetrieveType doReadDataFromLastFiles(STsdbReader* pReader) {
  SSDataBlock*    pResBlock = pReader->resBlockInfo.pResBlock;
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

  for (;;) {
    terrno = 0;

    int32_t code = doLoadLastBlockSequentially(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      terrno = code;
      return TSDB_READ_RETURN;
    }

    if (pResBlock->info.rows > 0) {
      return TSDB_READ_RETURN;
    }

    // all data blocks are checked in this last block file, now let's try the next file
    ASSERT(pReader->status.pTableIter == NULL);
    code = initForFirstBlockInFile(pReader, pBlockIter);

    // error happens or all the data files are completely checked
    if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
      terrno = code;
      return TSDB_READ_RETURN;
    }

    if (pBlockIter->numOfBlocks > 0) {  // there are data blocks existed.
      return TSDB_READ_CONTINUE;
    }

    // all blocks in data file are checked, let's check the data in last files
    resetTableListIndex(&pReader->status);
  }
}

// Produce the next result block from data stored in files.
//
// When the block iterator has no data blocks, the last (stt) files are read first. Afterwards the
// function loops: a partially read file block is continued with buildComposedDataBlock(); a finished
// one makes the iterator advance to the next block or, when none is left in the fileset, switches over
// to the last files and the following filesets. The loop ends as soon as an error occurs or the result
// block contains rows.
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

  SDataBlockIter* pBlockIter = &pReader->status.blockIter;
  SSDataBlock*    pResBlock = pReader->resBlockInfo.pResBlock;

  if (pBlockIter->numOfBlocks == 0) {
    // let's try to extract data from stt files.
    if (doReadDataFromLastFiles(pReader) == TSDB_READ_RETURN) {
      return terrno;
    }

    code = doBuildDataBlock(pReader);
    if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
      return code;
    }
  }

  for (;;) {
    SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

    if (fileBlockPartiallyRead(pDumpInfo, asc)) {  // file data block is partially loaded
      code = buildComposedDataBlock(pReader);
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        // try next data block in current file
        if (blockIteratorNext(&pReader->status.blockIter, pReader->idStr)) {
          // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
        } else {
          // all data blocks in files are checked, let's check the data in last files.
          ASSERT(pReader->status.pCurrentFileset->nSttF > 0);

          // data blocks in current file are exhausted, let's try the next file now
          SBlockData* pBlockData = &pReader->status.fileBlockData;
          if (pBlockData->uid != 0) {
            tBlockDataClear(pBlockData);
          }

          tBlockDataReset(pBlockData);
          resetDataBlockIterator(pBlockIter, pReader->order);
          resetTableListIndex(&pReader->status);

          if (doReadDataFromLastFiles(pReader) == TSDB_READ_RETURN) {
            return terrno;
          }
        }
      }

      code = doBuildDataBlock(pReader);
    }

    if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
      return code;
    }
  }
}
H
refact  
Hongze Cheng 已提交
3625

3626 3627
// Pick the tsdb instance to query and report the chosen retention level through pLevel.
//
// For a vnode that is not an rsma vnode, VND_TSDB() is returned and *pLevel is not written.
// For an rsma vnode the retention levels are walked from level 0 upwards:
//   - a level whose keep is <= 0 ends the walk and falls back to the previous level (if any);
//   - the walk stops at the first level for which (now - keep) <= (winSKey + tolerance), where the
//     tolerance is tsQueryRsmaTolerance scaled by the vnode's time precision.
// idStr is only used for logging and may be NULL.
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
  if (VND_IS_RSMA(pVnode)) {
    int8_t  level = 0;
    int8_t  precision = pVnode->config.tsdbCfg.precision;
    int64_t now = taosGetTimestamp(precision);

    // scale factor that is applied to the tolerance for the vnode's time precision
    long scale = 1000000L;
    if (precision == TSDB_TIME_PRECISION_MILLI) {
      scale = 1L;
    } else if (precision == TSDB_TIME_PRECISION_MICRO) {
      scale = 1000L;
    }

    int64_t offset = tsQueryRsmaTolerance * scale;

    for (int8_t i = 0; i < TSDB_RETENTION_MAX; ++i) {
      SRetention* pRetention = retentions + level;

      if (pRetention->keep <= 0) {
        // this level is not configured: fall back to the previous one, if there is any
        if (level > 0) {
          --level;
        }
        break;
      }

      if ((now - pRetention->keep) <= (winSKey + offset)) {
        break;
      }

      ++level;
    }

    const char* str = "";
    if (idStr != NULL) {
      str = idStr;
    }

    if (level == TSDB_RETENTION_L0) {
      *pLevel = TSDB_RETENTION_L0;
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L0, str);
      return VND_RSMA0(pVnode);
    }

    if (level == TSDB_RETENTION_L1) {
      *pLevel = TSDB_RETENTION_L1;
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L1, str);
      return VND_RSMA1(pVnode);
    }

    // every other level value maps to level 2
    *pLevel = TSDB_RETENTION_L2;
    tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L2, str);
    return VND_RSMA2(pVnode);
  }

  return VND_TSDB(pVnode);
}

H
Haojun Liao 已提交
3670
// Derive the version range of a query from its condition.
//   - minVer: pCond->startVersion, or 0 when it is -1;
//   - maxVer: pCond->endVersion, replaced by the vnode's applied version (pVnode->state.applied) when
//     it is -1 or larger than the applied version.
// The `level` argument is not used by this function.
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
  int64_t startVer = 0;
  if (pCond->startVersion != -1) {
    startVer = pCond->startVersion;
  }

  // the end version never exceeds the current maximum (applied) version of the vnode
  int64_t endVer = pVnode->state.applied;
  if (pCond->endVersion != -1 && !(pCond->endVersion > pVnode->state.applied)) {
    endVer = pCond->endVersion;
  }

  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
}

3684
// Check whether the row (key, ver) is covered by a delete range described by pDelList.
//
// pDelList : array of TSDBKEY entries (the delete skyline); consecutive entries [i, i+1] are treated as a
//            range whose version is taken from the entry that starts the range.
// index    : in/out cursor into pDelList; it is advanced in the traversal direction while searching.
// key, ver : timestamp and version of the row to be checked.
// order    : traversal order, ascending or descending.
// pVerRange: version range of the reader; only consulted in the ascending branch.
//
// Returns true if the row falls in a delete range whose version is not less than "ver", false otherwise.
// A NULL or empty list never drops anything.
bool hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
                    SVersionRange* pVerRange) {
  if (pDelList == NULL) {
    return false;
  }

  size_t num = taosArrayGetSize(pDelList);
  if (num == 0) {
    // "num - 1" below is an unsigned expression and would wrap around for an empty list
    return false;
  }

  bool    asc = ASCENDING_TRAVERSE(order);
  int32_t step = asc ? 1 : -1;

  if (asc) {
    if (*index >= num - 1) {
      TSDBKEY* last = taosArrayGetLast(pDelList);
      ASSERT(key >= last->ts);

      if (key > last->ts) {
        return false;
      } else if (key == last->ts) {
        if (num < 2) {
          // a single entry has no predecessor, so there is no range that ends at this key
          return false;
        }

        TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
        return (prev->version >= ver && prev->version <= pVerRange->maxVer &&
                prev->version >= pVerRange->minVer);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);

      if (key < pCurrent->ts) {
        return false;
      }

      if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
          pVerRange->maxVer >= pCurrent->version) {
        return true;
      }

      // move the cursor forward until the range [pCurrent, pNext] may contain the key
      while (pNext->ts <= key && (*index) < num - 1) {
        (*index) += 1;

        if ((*index) < num - 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pNext = taosArrayGet(pDelList, (*index) + 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version == 0 && pNext->version > 0) {
            continue;
          }

          if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
              pVerRange->maxVer >= pCurrent->version) {
            return true;
          }
        }
      }

      return false;
    }
  } else {
    if (*index <= 0) {
      TSDBKEY* pFirst = taosArrayGet(pDelList, 0);

      if (key < pFirst->ts) {
        return false;
      } else if (key == pFirst->ts) {
        return pFirst->version >= ver;
      } else {
        ASSERT(0);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);

      if (key > pCurrent->ts) {
        return false;
      }

      if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
        return true;
      }

      // move the cursor backward (step is -1 here) until the range [pPrev, pCurrent] may contain the key
      while (pPrev->ts >= key && (*index) > 1) {
        (*index) += step;

        if ((*index) >= 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pPrev = taosArrayGet(pDelList, (*index) - 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version > 0 && pPrev->version == 0) {
            continue;
          }

          if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
            return true;
          }
        }
      }

      return false;
    }
  }

  return false;
}

3787
// Return the first row, starting at the current position of the mem/imem iterator, that
//  1) is inside the query time window of the reader,
//  2) has a version inside the version range of the reader, and
//  3) is not covered by a delete range in pDelList.
// Rows that fail 2) or 3) are skipped by advancing the iterator. NULL is returned, and
// pIter->hasVal is left false, when the iterator is exhausted or a row outside the time window is met.
TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
  if (!pIter->hasVal) {
    return NULL;
  }

  for (;;) {
    TSDBROW* pCandidate = tsdbTbDataIterGet(pIter->iter);
    TSDBKEY  rowKey = TSDBROW_KEY(pCandidate);

    if (outOfTimeWindow(rowKey.ts, &pReader->window)) {
      pIter->hasVal = false;
      return NULL;
    }

    // it is a valid data version and not deleted
    if ((rowKey.version <= pReader->verRange.maxVer && rowKey.version >= pReader->verRange.minVer) &&
        (!hasBeenDropped(pDelList, &pIter->index, rowKey.ts, rowKey.version, pReader->order, &pReader->verRange))) {
      return pCandidate;
    }

    // current row is not qualified, try the next one
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      return NULL;
    }
  }
}
H
Hongze Cheng 已提交
3826

H
Haojun Liao 已提交
3827 3828 3829
// Feed all following valid rows of the mem/imem iterator that carry the timestamp "ts" into the row merger of
// the reader. The iterator is advanced first, so the row it currently points to is not added again.
// The loop stops when the iterator is exhausted, no valid row is left, or a row with a different ts is met.
//
// Returns TSDB_CODE_SUCCESS, terrno when the schema of a row can not be retrieved, or the error code
// reported by tsdbRowMergerAdd.
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, STsdbReader* pReader) {
  SRowMerger* pMerger = &pReader->status.merger;

  while (1) {
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      break;
    }

    // data exists but not valid
    TSDBROW* pRow = getValidMemRow(pIter, pDelList, pReader);
    if (pRow == NULL) {
      break;
    }

    // ts is not identical, quit
    TSDBKEY k = TSDBROW_KEY(pRow);
    if (k.ts != ts) {
      break;
    }

    // only rows in row format need an explicit schema; column format rows are added with NULL
    STSchema* pTSchema = NULL;
    if (pRow->type == TSDBROW_ROW_FMT) {
      pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, uid);
      if (pTSchema == NULL) {
        return terrno;
      }
    }

    int32_t code = tsdbRowMergerAdd(pMerger, pRow, pTSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }

  return TSDB_CODE_SUCCESS;
}

3863
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
3864
                                          SVersionRange* pVerRange, int32_t step) {
3865
  while (rowIndex < pBlockData->nRow && rowIndex >= 0 && pBlockData->aTSKEY[rowIndex] == key) {
3866
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
3867
      rowIndex += step;
3868 3869 3870 3871
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
3872
    tsdbRowMergerAdd(pMerger, &fRow, NULL);
3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883
    rowIndex += step;
  }

  return rowIndex;
}

// Result of checkForNeighborFileBlock(), telling the caller whether to keep merging rows from further file blocks.
typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,  // all rows of the loaded block are consumed, continue with the next block
  CHECK_FILEBLOCK_QUIT = 0x2,  // stop checking further blocks
} CHECK_FILEBLOCK_STATE;

H
Hongze Cheng 已提交
3884
static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SDataBlk* pBlock,
3885 3886
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
3887
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
3888
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
3889
  bool                asc = ASCENDING_TRAVERSE(pReader->order);
3890

3891
  *state = CHECK_FILEBLOCK_QUIT;
3892
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
3893

3894
  bool    loadNeighbor = true;
H
Haojun Liao 已提交
3895
  int32_t code = loadNeighborIfOverlap(pFBlock, pScanInfo, pReader, &loadNeighbor);
3896

H
Haojun Liao 已提交
3897
  if (loadNeighbor && (code == TSDB_CODE_SUCCESS)) {
3898 3899
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
3900
    if ((pDumpInfo->rowIndex >= pDumpInfo->totalRows && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
3901 3902 3903 3904
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

H
Haojun Liao 已提交
3905
  return code;
3906 3907
}

H
Haojun Liao 已提交
3908
// Merge all rows that share the timestamp of the current row of the file block (pDumpInfo->rowIndex)
// into the row merger of the reader. The current row itself is skipped here; only the following
// duplicates are added. When the current block is exhausted, neighbor blocks are checked as long as
// checkForNeighborFileBlock reports CHECK_FILEBLOCK_CONT.
//
// Returns TSDB_CODE_SUCCESS, or the error code reported while loading a neighbor block.
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  SRowMerger* pMerger = &pReader->status.merger;
  bool        asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t     key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  int32_t     step = asc ? 1 : -1;

  pDumpInfo->rowIndex += step;
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }

  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      CHECK_FILEBLOCK_STATE st = CHECK_FILEBLOCK_QUIT;

      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
      SDataBlk*           pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);
      if (pFileBlockInfo == NULL) {
        break;
      }

      // do not swallow the failure of loading the neighbor block
      int32_t code = checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      if (st == CHECK_FILEBLOCK_QUIT) {
        break;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
3944
int32_t doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
3945
                               SRowMerger* pMerger, SVersionRange* pVerRange, const char* idStr) {
3946
  while (nextRowFromLastBlocks(pLastBlockReader, pScanInfo, pVerRange)) {
3947 3948
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 == ts) {
3949 3950
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      tsdbRowMergerAdd(pMerger, pRow1, NULL);
3951
    } else {
3952 3953 3954
      tsdbTrace("uid:%" PRIu64 " last del index:%d, del range:%d, lastKeyInStt:%" PRId64 ", %s", pScanInfo->uid,
                pScanInfo->lastBlockDelIndex, (int32_t)taosArrayGetSize(pScanInfo->delSkyline), pScanInfo->lastKeyInStt,
                idStr);
3955 3956 3957 3958 3959 3960 3961
      break;
    }
  }

  return TSDB_CODE_SUCCESS;
}

3962
int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, TSDBROW* pResRow,
3963
                                 STsdbReader* pReader, bool* freeTSRow) {
H
Haojun Liao 已提交
3964
  TSDBROW* pNextRow = NULL;
3965
  TSDBROW  current = *pRow;
3966

3967 3968
  {  // if the timestamp of the next valid row has a different ts, return current row directly
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
3969

3970
    if (!pIter->hasVal) {
3971
      *pResRow = *pRow;
3972
      *freeTSRow = false;
3973
      return TSDB_CODE_SUCCESS;
3974
    } else {  // has next point in mem/imem
3975
      pNextRow = getValidMemRow(pIter, pDelList, pReader);
3976
      if (pNextRow == NULL) {
H
Haojun Liao 已提交
3977
        *pResRow = current;
3978
        *freeTSRow = false;
3979
        return TSDB_CODE_SUCCESS;
3980 3981
      }

H
Hongze Cheng 已提交
3982
      if (TSDBROW_TS(&current) != TSDBROW_TS(pNextRow)) {
H
Haojun Liao 已提交
3983
        *pResRow = current;
3984
        *freeTSRow = false;
3985
        return TSDB_CODE_SUCCESS;
3986
      }
3987
    }
3988 3989
  }

H
Haojun Liao 已提交
3990
  terrno = 0;
3991
  int32_t code = 0;
H
Haojun Liao 已提交
3992

3993 3994 3995 3996 3997 3998 3999
  // start to merge duplicated rows
  if (current.type == TSDBROW_ROW_FMT) {
    // get the correct schema for data in memory
    STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(&current), pReader, uid);
    if (pTSchema == NULL) {
      return terrno;
    }
H
Haojun Liao 已提交
4000

H
Haojun Liao 已提交
4001
    code = tsdbRowMergerAdd(&pReader->status.merger, &current, pTSchema);
4002 4003 4004
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
H
Haojun Liao 已提交
4005

4006 4007 4008 4009
    STSchema* pTSchema1 = doGetSchemaForTSRow(TSDBROW_SVERSION(pNextRow), pReader, uid);
    if (pTSchema1 == NULL) {
      return terrno;
    }
H
Haojun Liao 已提交
4010

H
Haojun Liao 已提交
4011
    tsdbRowMergerAdd(&pReader->status.merger,pNextRow, pTSchema1);
4012
  } else {  // let's merge rows in file block
H
Haojun Liao 已提交
4013
    code = tsdbRowMergerAdd(&pReader->status.merger, &current, pReader->pSchema);
4014 4015 4016
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
H
Haojun Liao 已提交
4017

H
Haojun Liao 已提交
4018
    tsdbRowMergerAdd(&pReader->status.merger,pNextRow, NULL);
4019
  }
H
Haojun Liao 已提交
4020

H
Haojun Liao 已提交
4021
  code = doMergeRowsInBuf(pIter, uid, TSDBROW_TS(&current), pDelList, pReader);
H
Haojun Liao 已提交
4022 4023 4024 4025
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

H
Haojun Liao 已提交
4026
  code = tsdbRowMergerGetRow(&pReader->status.merger, &pResRow->pTSRow);
4027 4028 4029
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
M
Minglei Jin 已提交
4030

wmmhello's avatar
wmmhello 已提交
4031
  pResRow->type = TSDBROW_ROW_FMT;
4032
  tsdbRowMergerClear(&pReader->status.merger);
4033
  *freeTSRow = true;
4034

4035
  return TSDB_CODE_SUCCESS;
4036 4037
}

4038
int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
H
Hongze Cheng 已提交
4039
                           SRow** pTSRow) {
H
Haojun Liao 已提交
4040
  SRowMerger* pMerger = &pReader->status.merger;
H
Haojun Liao 已提交
4041

4042 4043 4044
  TSDBKEY   k = TSDBROW_KEY(pRow);
  TSDBKEY   ik = TSDBROW_KEY(piRow);
  STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
4045 4046 4047 4048
  if (pSchema == NULL) {
    return terrno;
  }

4049
  STSchema* piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
4050 4051 4052
  if (piSchema == NULL) {
    return terrno;
  }
4053

4054
  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
H
Haojun Liao 已提交
4055
    int32_t code = tsdbRowMergerAdd(&pReader->status.merger, piRow, piSchema);
H
Haojun Liao 已提交
4056 4057 4058 4059
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

H
Haojun Liao 已提交
4060
    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4061 4062 4063
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4064

H
Haojun Liao 已提交
4065
    tsdbRowMergerAdd(&pReader->status.merger,pRow, pSchema);
H
Haojun Liao 已提交
4066
    code =
H
Haojun Liao 已提交
4067
        doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4068 4069 4070 4071
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

4072
  } else {
H
Haojun Liao 已提交
4073 4074
    int32_t code = tsdbRowMergerAdd(&pReader->status.merger, pRow, pSchema);
    if (code != TSDB_CODE_SUCCESS || pMerger->pTSchema == NULL) {
H
Haojun Liao 已提交
4075 4076 4077
      return code;
    }

H
Haojun Liao 已提交
4078
    code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4079 4080 4081
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4082

H
Haojun Liao 已提交
4083 4084
    tsdbRowMergerAdd(&pReader->status.merger, piRow, piSchema);
    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4085 4086 4087
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4088
  }
4089

H
Haojun Liao 已提交
4090
  int32_t code = tsdbRowMergerGetRow(pMerger, pTSRow);
4091
  tsdbRowMergerClear(pMerger);
4092
  return code;
4093 4094
}

4095
// Retrieve the next result row of one table from the mem and imem iterators.
//
// A row is only taken from an iterator if its timestamp has not reached endKey yet in the traversal
// direction (ts < endKey for ascending, ts > endKey for descending order).
//  - both iterators have a qualified row with different timestamps: the one that comes first in the
//    traversal order is used;
//  - both rows have the same timestamp: they are merged, *freeTSRow is set to true and the result is in
//    row format;
//  - only one iterator has a qualified row: that row is used.
// If no row is qualified, pResRow is left untouched and TSDB_CODE_SUCCESS is returned.
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, TSDBROW* pResRow, int64_t endKey,
                            bool* freeTSRow) {
  SIterInfo* pMemIter = &pBlockScanInfo->iter;
  SIterInfo* pImemIter = &pBlockScanInfo->iiter;

  TSDBROW* pRow = getValidMemRow(pMemIter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(pImemIter, pBlockScanInfo->delSkyline, pReader);

  SArray*  pDelList = pBlockScanInfo->delSkyline;
  uint64_t uid = pBlockScanInfo->uid;
  bool     asc = ASCENDING_TRAVERSE(pReader->order);

  // discard the rows that have reached the end key
  if (pMemIter->hasVal) {
    TSDBKEY memKey = TSDBROW_KEY(pRow);
    bool    reachEnd = asc ? (memKey.ts >= endKey) : (memKey.ts <= endKey);
    if (reachEnd) {
      pRow = NULL;
    }
  }

  if (pImemIter->hasVal) {
    TSDBKEY imemKey = TSDBROW_KEY(piRow);
    bool    reachEnd = asc ? (imemKey.ts >= endKey) : (imemKey.ts <= endKey);
    if (reachEnd) {
      piRow = NULL;
    }
  }

  bool memQualified = pMemIter->hasVal && (pRow != NULL);
  bool imemQualified = pImemIter->hasVal && (piRow != NULL);

  if (memQualified && imemQualified) {
    TSDBKEY k = TSDBROW_KEY(pRow);
    TSDBKEY ik = TSDBROW_KEY(piRow);

    if (ik.ts == k.ts) {  // identical timestamp, merge rows from both mem and imem
      *freeTSRow = true;
      pResRow->type = TSDBROW_ROW_FMT;
      return doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, &pResRow->pTSRow);
    }

    // the timestamps differ, so exactly one of the two rows comes first in the traversal order
    bool imemFirst = asc ? (ik.ts < k.ts) : (ik.ts > k.ts);
    if (imemFirst) {
      return doMergeMemTableMultiRows(piRow, uid, pImemIter, pDelList, pResRow, pReader, freeTSRow);
    }

    return doMergeMemTableMultiRows(pRow, uid, pMemIter, pDelList, pResRow, pReader, freeTSRow);
  }

  if (memQualified) {
    return doMergeMemTableMultiRows(pRow, pBlockScanInfo->uid, pMemIter, pDelList, pResRow, pReader, freeTSRow);
  }

  if (imemQualified) {
    return doMergeMemTableMultiRows(piRow, uid, pImemIter, pDelList, pResRow, pReader, freeTSRow);
  }

  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
4153
// Append one row in row format (pTSRow) to the result block pBlock.
// The requested columns (pReader->suppInfo) and the columns of the row schema are walked together by column
// id: matched columns are copied, requested columns that do not exist in the schema are set to NULL.
// On success the number of rows of the block is increased by one and pScanInfo->lastKey is updated to
// the timestamp of the row.
//
// Returns TSDB_CODE_SUCCESS, terrno when the schema can not be retrieved, or the code of doCopyColVal.
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow, STableBlockScanInfo* pScanInfo) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  int32_t             outputRowIndex = pBlock->info.rows;
  int64_t             uid = pScanInfo->uid;

  STSchema* pSchema = doGetSchemaForTSRow(pTSRow->sver, pReader, uid);
  if (pSchema == NULL) {
    return terrno;
  }

  int32_t outIdx = 0;     // position in the requested column list
  int32_t schemaIdx = 0;  // position in the schema column list

  // the primary timestamp column, if requested, is always the first one
  if (pSupInfo->colId[outIdx] == PRIMARYKEY_TIMESTAMP_COL_ID) {
    SColumnInfoData* pTsCol = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    ((int64_t*)pTsCol->pData)[outputRowIndex] = pTSRow->ts;
    outIdx += 1;
  }

  SColVal colVal = {0};
  while (outIdx < pSupInfo->numOfCols && schemaIdx < pSchema->numOfCols) {
    col_id_t colId = pSupInfo->colId[outIdx];

    if (colId > pSchema->columns[schemaIdx].colId) {
      // the schema column is not requested, skip it
      schemaIdx += 1;
      continue;
    }

    SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    if (colId == pSchema->columns[schemaIdx].colId) {
      tRowGet(pTSRow, pSchema, schemaIdx, &colVal);

      int32_t code = doCopyColVal(pColInfoData, outputRowIndex, outIdx, &colVal, pSupInfo);
      if (code) {
        return code;
      }

      schemaIdx += 1;
    } else {
      // the requested column does not exist in the schema
      colDataSetNULL(pColInfoData, outputRowIndex);
    }

    outIdx += 1;
  }

  // set null value since current column does not exist in the "pSchema"
  for (; outIdx < pSupInfo->numOfCols; outIdx += 1) {
    SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    colDataSetNULL(pColInfoData, outputRowIndex);
  }

  pBlock->info.dataLoad = 1;
  pBlock->info.rows += 1;
  pScanInfo->lastKey = pTSRow->ts;
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
4209 4210
// Append the row "rowIndex" of a column-format block (pBlockData) to the result block pResBlock.
// The timestamp is written to the primary timestamp column of the reader first; the remaining requested
// columns are matched with the columns of the block by column id, and the requested columns that are
// left after the block columns have been consumed are set to NULL.
//
// Returns TSDB_CODE_SUCCESS, or the code reported by doCopyColVal.
int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
                                 int32_t rowIndex) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  int32_t             outputRowIndex = pResBlock->info.rows;

  ((int64_t*)pReader->status.pPrimaryTsCol->pData)[outputRowIndex] = pBlockData->aTSKEY[rowIndex];

  int32_t numOfInputCols = pBlockData->nColData;
  int32_t numOfOutputCols = pSupInfo->numOfCols;

  int32_t outIdx = 1;  // the first output column is the timestamp that has been set above
  int32_t inIdx = 0;
  SColVal cv = {0};

  while (outIdx < numOfOutputCols && inIdx < numOfInputCols) {
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, inIdx);
    if (pData->cid < pSupInfo->colId[outIdx]) {
      // this column of the file block is not requested
      inIdx += 1;
      continue;
    }

    SColumnInfoData* pCol = TARRAY_GET_ELEM(pResBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    if (pData->cid == pSupInfo->colId[outIdx]) {
      tColDataGetValue(pData, rowIndex, &cv);

      int32_t code = doCopyColVal(pCol, outputRowIndex, outIdx, &cv, pSupInfo);
      if (code) {
        return code;
      }

      inIdx += 1;
    } else if (pData->cid > pCol->info.colId) {
      // the specified column does not exist in file block, fill with null data
      colDataSetNULL(pCol, outputRowIndex);
    }

    outIdx += 1;
  }

  // no more columns in the file block, the rest of the requested columns are null
  for (; outIdx < numOfOutputCols; outIdx += 1) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    colDataSetNULL(pCol, outputRowIndex);
  }

  pResBlock->info.dataLoad = 1;
  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

4257 4258
// Fill the result block of the reader with rows of one table that are retrieved from the mem/imem buffer.
// Rows are appended until no qualified row is returned, both iterators are exhausted, or the block
// holds "capacity" rows.
//
// Returns TSDB_CODE_SUCCESS, or the first error code reported while retrieving or appending a row.
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
  SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
  int32_t      code = TSDB_CODE_SUCCESS;

  do {
    TSDBROW row = {.type = -1};
    bool    freeTSRow = false;

    // the row type may have been set although no row has been built if the merge fails, so the
    // returned code must be checked before "row" is used.
    code = tsdbGetNextRowInMem(pBlockScanInfo, pReader, &row, endKey, &freeTSRow);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // the type is untouched, no qualified row exists
    if (row.type == -1) {
      break;
    }

    if (row.type == TSDBROW_ROW_FMT) {
      code = doAppendRowFromTSRow(pBlock, pReader, row.pTSRow, pBlockScanInfo);

      // the merged row is owned by this function
      if (freeTSRow) {
        taosMemoryFree(row.pTSRow);
      }

      if (code) {
        return code;
      }
    } else {
      code = doAppendRowFromFileBlock(pBlock, pReader, row.pBlockData, row.iRow);
      if (code) {
        break;
      }
    }

    // no data in buffer, return immediately
    if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
      break;
    }

    if (pBlock->info.rows >= capacity) {
      break;
    }
  } while (1);

  return code;
}
H
Hongze Cheng 已提交
4300

4301 4302
// TODO refactor: with createDataBlockScanInfo
// Replace the list of tables to be scanned by the reader.
// The scan info of the tables that are currently in the table map is cleared, the buffers are enlarged
// if the new list has more tables than the map holds, and the table map as well as the uid list are
// rebuilt from pTableList, which is read as an array of STableKeyInfo with "num" entries.
// The last key of each table is reset according to the query window and the scan order of the reader.
//
// Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY, or the code of ensureBlockScanInfoBuf.
int32_t tsdbSetTableList(STsdbReader* pReader, const void* pTableList, int32_t num) {
  int32_t size = tSimpleHashGetSize(pReader->status.pTableMap);

  STableBlockScanInfo** p = NULL;
  int32_t               iter = 0;

  while ((p = tSimpleHashIterate(pReader->status.pTableMap, p, &iter)) != NULL) {
    clearBlockScanInfo(*p);
  }

  if (size < num) {
    int32_t code = ensureBlockScanInfoBuf(&pReader->blockInfoBuf, num);
    if (code) {
      return code;
    }

    // keep the original buffer if the reallocation fails
    char* p1 = taosMemoryRealloc(pReader->status.uidList.tableUidList, sizeof(uint64_t) * num);
    if (p1 == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    pReader->status.uidList.tableUidList = (uint64_t*)p1;
  }

  tSimpleHashClear(pReader->status.pTableMap);
  STableUidList* pUidList = &pReader->status.uidList;
  pUidList->currentIndex = 0;

  STableKeyInfo* pList = (STableKeyInfo*)pTableList;
  for (int32_t i = 0; i < num; ++i) {
    STableBlockScanInfo* pInfo = getPosInBlockInfoBuf(&pReader->blockInfoBuf, i);
    pInfo->uid = pList[i].uid;
    pUidList->tableUidList[i] = pList[i].uid;

    // todo extract method
    if (ASCENDING_TRAVERSE(pReader->order)) {
      int64_t skey = pReader->window.skey;
      pInfo->lastKey = (skey > INT64_MIN) ? (skey - 1) : skey;
      pInfo->lastKeyInStt = skey;
    } else {
      int64_t ekey = pReader->window.ekey;
      pInfo->lastKey = (ekey < INT64_MAX) ? (ekey + 1) : ekey;
      pInfo->lastKeyInStt = ekey;
    }

    tSimpleHashPut(pReader->status.pTableMap, &pInfo->uid, sizeof(uint64_t), &pInfo, POINTER_BYTES);
  }

  // use the success code of this module, like all other return paths of this file
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
4353 4354 4355 4356 4357 4358
// Return the result of metaGetIdx for the given meta object, or NULL if pMeta is NULL.
void* tsdbGetIdx(SMeta* pMeta) {
  return (pMeta != NULL) ? metaGetIdx(pMeta) : NULL;
}
dengyihao's avatar
dengyihao 已提交
4359

dengyihao's avatar
dengyihao 已提交
4360 4361 4362 4363 4364 4365
// Return the result of metaGetIvtIdx for the given meta object, or NULL if pMeta is NULL.
void* tsdbGetIvtIdx(SMeta* pMeta) {
  return (pMeta != NULL) ? metaGetIvtIdx(pMeta) : NULL;
}
L
Liu Jicong 已提交
4366

4367
// Return the upper bound (maxVer) of the version range of the reader.
uint64_t tsdbGetReaderMaxVersion(STsdbReader* pReader) {
  SVersionRange* pRange = &pReader->verRange;
  return pRange->maxVer;
}
4368

4369
// Initialize the file set iterator and the data block iterator of the reader.
//  - no file set exists: loadFromFile is set to false;
//  - count-only read mode: nothing else is prepared here;
//  - otherwise: the first block in file is initialized by initForFirstBlockInFile.
// If data is not loaded from file afterwards, the table list index is reset.
// Returns TSDB_CODE_SUCCESS or the code of initForFirstBlockInFile.
static int32_t doOpenReaderImpl(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;
  int32_t        code = TSDB_CODE_SUCCESS;

  initFilesetIterator(&pStatus->fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(&pStatus->blockIter, pReader->order);

  if (pStatus->fileIter.numOfFiles == 0) {
    pStatus->loadFromFile = false;
  } else if (pReader->readMode != READ_MODE_COUNT_ONLY) {
    code = initForFirstBlockInFile(pReader, &pStatus->blockIter);
  }

  if (!pStatus->loadFromFile) {
    resetTableListIndex(pStatus);
  }

  return code;
}

4392
// Free function of the schema hash map: "param" is the address of the stored pointer; the object it
// points to is released through taosMemoryFreeClear.
static void freeSchemaFunc(void* param) {
  void** ppSchema = (void**)param;
  taosMemoryFreeClear(*ppSchema);
}

H
refact  
Hongze Cheng 已提交
4397
// ====================================== EXPOSED APIs ======================================
4398
int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, void* pTableList, int32_t numOfTables,
4399
                       SSDataBlock* pResBlock, STsdbReader** ppReader, const char* idstr, bool countOnly, SHashObj** pIgnoreTables) {
4400 4401
  STimeWindow window = pCond->twindows;

4402 4403 4404
  int32_t capacity = pVnode->config.tsdbCfg.maxRows;
  if (pResBlock != NULL) {
    blockDataEnsureCapacity(pResBlock, capacity);
H
Haojun Liao 已提交
4405 4406 4407
  }

  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, capacity, pResBlock, idstr);
4408
  if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
4409 4410
    goto _err;
  }
H
Hongze Cheng 已提交
4411

4412
  // check for query time window
H
Haojun Liao 已提交
4413
  STsdbReader* pReader = *ppReader;
4414
  if (isEmptyQueryTimeWindow(&pReader->window) && pCond->type == TIMEWINDOW_RANGE_CONTAINED) {
H
Haojun Liao 已提交
4415 4416 4417
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
4418

4419 4420
  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
4421
    int32_t order = pCond->order;
4422
    if (order == TSDB_ORDER_ASC) {
4423
      pCond->twindows.ekey = window.skey - 1;
4424 4425 4426
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
4427
      pCond->twindows.skey = window.ekey + 1;
4428 4429 4430 4431
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

4432
    // here we only need one more row, so the capacity is set to be ONE.
H
Haojun Liao 已提交
4433
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[0], 1, pResBlock, idstr);
4434 4435 4436 4437 4438
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (order == TSDB_ORDER_ASC) {
4439
      pCond->twindows.skey = window.ekey + 1;
4440
      pCond->twindows.ekey = INT64_MAX;
4441
    } else {
4442
      pCond->twindows.skey = INT64_MIN;
4443
      pCond->twindows.ekey = window.ekey - 1;
4444
    }
4445 4446
    pCond->order = order;

H
Haojun Liao 已提交
4447
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[1], 1, pResBlock, idstr);
4448 4449 4450 4451 4452
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

H
Haojun Liao 已提交
4453
  // NOTE: the endVersion in pCond is the data version not schema version, so pCond->endVersion is not correct here.
4454 4455
  //  no valid error code set in metaGetTbTSchema, so let's set the error code here.
  //  we should proceed in case of tmq processing.
4456
  if (pCond->suid != 0) {
4457
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, -1, 1);
H
Haojun Liao 已提交
4458
    if (pReader->pSchema == NULL) {
H
Haojun Liao 已提交
4459
      tsdbError("failed to get table schema, suid:%" PRIu64 ", ver:-1, %s", pReader->suid, pReader->idStr);
H
Haojun Liao 已提交
4460
    }
4461 4462
  } else if (numOfTables > 0) {
    STableKeyInfo* pKey = pTableList;
4463
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, -1, 1);
H
Haojun Liao 已提交
4464
    if (pReader->pSchema == NULL) {
H
Haojun Liao 已提交
4465
      tsdbError("failed to get table schema, uid:%" PRIu64 ", ver:-1, %s", pKey->uid, pReader->idStr);
H
Haojun Liao 已提交
4466
    }
4467 4468
  }

H
Haojun Liao 已提交
4469
  if (pReader->pSchema != NULL) {
4470
    tsdbRowMergerInit(&pReader->status.merger, pReader->pSchema);
H
Haojun Liao 已提交
4471 4472
  }

4473 4474
  pReader->pSchemaMap = tSimpleHashInit(8, taosFastHash);
  if (pReader->pSchemaMap == NULL) {
4475
    tsdbError("failed init schema hash for reader %s", pReader->idStr);
4476 4477 4478 4479 4480
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _err;
  }

  tSimpleHashSetFreeFp(pReader->pSchemaMap, freeSchemaFunc);
4481
  if (pReader->pSchema != NULL) {
H
Haojun Liao 已提交
4482 4483 4484 4485
    code = updateBlockSMAInfo(pReader->pSchema, &pReader->suppInfo);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
4486
  }
4487

4488
  STsdbReader* p = (pReader->innerReader[0] != NULL) ? pReader->innerReader[0] : pReader;
X
Xiaoyu Wang 已提交
4489 4490
  pReader->status.pTableMap =
      createDataBlockScanInfo(p, &pReader->blockInfoBuf, pTableList, &pReader->status.uidList, numOfTables);
H
Haojun Liao 已提交
4491 4492
  if (pReader->status.pTableMap == NULL) {
    *ppReader = NULL;
S
Shengliang Guan 已提交
4493
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
4494 4495
    goto _err;
  }
H
Hongze Cheng 已提交
4496

H
Haojun Liao 已提交
4497 4498 4499 4500 4501 4502
  pReader->status.pLDataIter = taosMemoryCalloc(pVnode->config.sttTrigger, sizeof(SLDataIter));
  if (pReader->status.pLDataIter == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    goto _err;
  }

H
Haojun Liao 已提交
4503
  pReader->flag = READER_STATUS_SUSPEND;
4504

D
dapan1121 已提交
4505 4506 4507
  if (countOnly) {
    pReader->readMode = READ_MODE_COUNT_ONLY;
  }
4508

4509 4510
  pReader->pIgnoreTables = pIgnoreTables;

4511
  tsdbDebug("%p total numOfTable:%d in this query %s", pReader, numOfTables, pReader->idStr);
H
Hongze Cheng 已提交
4512
  return code;
H
Hongze Cheng 已提交
4513 4514

_err:
H
Haojun Liao 已提交
4515
  tsdbError("failed to create data reader, code:%s %s", tstrerror(code), idstr);
K
kailixu 已提交
4516
  tsdbReaderClose(*ppReader);
X
Xiaoyu Wang 已提交
4517
  *ppReader = NULL;  // reset the pointer value.
H
Hongze Cheng 已提交
4518
  return code;
H
refact  
Hongze Cheng 已提交
4519 4520
}

4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536
// Forget every pointer that setSharedPtr() copied into this reader from another reader. After this call the
// fields below are NULL in `p`, so closing `p` will not release them.
static void clearSharedPtr(STsdbReader* p) {
  // schema information
  p->pSchemaMap = NULL;
  p->pSchema = NULL;

  // read snapshot
  p->pReadSnap = NULL;

  // per-table scan state
  p->status.uidList.tableUidList = NULL;
  p->status.pTableMap = NULL;
  p->status.pLDataIter = NULL;
}

// Make pDst use the same read snapshot, schema objects, table map, uid list and last-file iterators as pSrc.
// Only the pointers/values are copied; nothing is duplicated. When a schema is available the row merger of
// pDst is initialised with it.
static void setSharedPtr(STsdbReader* pDst, const STsdbReader* pSrc) {
  pDst->pReadSnap = pSrc->pReadSnap;
  pDst->pSchemaMap = pSrc->pSchemaMap;
  pDst->pSchema = pSrc->pSchema;

  pDst->status.uidList = pSrc->status.uidList;
  pDst->status.pLDataIter = pSrc->status.pLDataIter;
  pDst->status.pTableMap = pSrc->status.pTableMap;

  if (pDst->pSchema == NULL) {
    return;
  }

  tsdbRowMergerInit(&pDst->status.merger, pDst->pSchema);
}

H
refact  
Hongze Cheng 已提交
4543
// Destroy a reader and release everything it holds: inner readers, column buffers, the result block (when the
// reader is flagged to free it), block iterators, table scan info, file readers, the read snapshot, the reader
// lock, the last-block reader, the schema objects and finally the reader itself.
//
// pReader may be NULL, in which case nothing is done. The pointer must not be used after this call.
void tsdbReaderClose(STsdbReader* pReader) {
  if (pReader == NULL) {
    return;
  }

  tsdbAcquireReader(pReader);

  // The inner readers only borrow the pointers installed by setSharedPtr(); detach them first so that closing
  // an inner reader does not release resources that are released again further below.
  // Each inner reader is checked individually: it is possible that only one of them exists (e.g. when the
  // reader is closed half way through its construction), and clearSharedPtr() must not be handed a NULL.
  for (int32_t i = 0; i < 2; ++i) {
    if (pReader->innerReader[i] != NULL) {
      clearSharedPtr(pReader->innerReader[i]);
    }
  }

  // closing a NULL reader is a no-op, see the guard at the top of this function
  tsdbReaderClose(pReader->innerReader[0]);
  tsdbReaderClose(pReader->innerReader[1]);

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  taosArrayDestroy(pSupInfo->pColAgg);
  for (int32_t i = 0; i < pSupInfo->numOfCols; ++i) {
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }

  if (pReader->resBlockInfo.freeBlock) {
    pReader->resBlockInfo.pResBlock = blockDataDestroy(pReader->resBlockInfo.pResBlock);
  }

  taosMemoryFree(pSupInfo->colId);
  tBlockDataDestroy(&pReader->status.fileBlockData);
  cleanupDataBlockIterator(&pReader->status.blockIter);

  // remember the table count before the map is destroyed, it is reported in the cost summary below
  size_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  if (pReader->status.pTableMap != NULL) {
    destroyAllBlockScanInfo(pReader->status.pTableMap);
    clearBlockScanInfoBuf(&pReader->blockInfoBuf);
  }

  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }

  if (pReader->pDelFReader != NULL) {
    tsdbDelFReaderClose(&pReader->pDelFReader);
  }

  if (pReader->pDelIdx != NULL) {
    taosArrayDestroy(pReader->pDelIdx);
    pReader->pDelIdx = NULL;
  }

  qTrace("tsdb/reader-close: %p, untake snapshot", pReader);
  tsdbUntakeReadSnap(pReader, pReader->pReadSnap, true);
  pReader->pReadSnap = NULL;

  tsdbReleaseReader(pReader);

  tsdbUninitReaderLock(pReader);

  taosMemoryFreeClear(pReader->status.pLDataIter);
  taosMemoryFreeClear(pReader->status.uidList.tableUidList);
  SIOCostSummary* pCost = &pReader->cost;

  SFilesetIter* pFilesetIter = &pReader->status.fileIter;
  if (pFilesetIter->pLastBlockReader != NULL) {
    SLastBlockReader* pLReader = pFilesetIter->pLastBlockReader;
    tMergeTreeClose(&pLReader->mergeTree);

    // collect the last-block load statistics before the load info is destroyed
    getLastBlockLoadInfo(pLReader->pInfo, &pCost->lastBlockLoad, &pCost->lastBlockLoadTime);

    pLReader->pInfo = destroyLastBlockLoadInfo(pLReader->pInfo);
    taosMemoryFree(pLReader);
  }

  tsdbDebug(
      "%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
      " SMA-time:%.2f ms, fileBlocks:%" PRId64
      ", fileBlocks-load-time:%.2f ms, "
      "build in-memory-block-time:%.2f ms, lastBlocks:%" PRId64 ", lastBlocks-time:%.2f ms, composed-blocks:%" PRId64
      ", composed-blocks-time:%.2fms, STableBlockScanInfo size:%.2f Kb, createTime:%.2f ms,initDelSkylineIterTime:%.2f "
      "ms, %s",
      pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaDataLoad, pCost->smaLoadTime, pCost->numOfBlocks,
      pCost->blockLoadTime, pCost->buildmemBlock, pCost->lastBlockLoad, pCost->lastBlockLoadTime, pCost->composedBlocks,
      pCost->buildComposedBlockTime, numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pCost->createScanInfoList,
      pCost->initDelSkylineIterTime, pReader->idStr);

  // idStr is still needed by the log statement above, release it only now
  taosMemoryFree(pReader->idStr);

  tsdbRowMergerCleanup(&pReader->status.merger);
  taosMemoryFree(pReader->pSchema);

  tSimpleHashCleanup(pReader->pSchemaMap);
  taosMemoryFreeClear(pReader);
}

4643 4644 4645 4646 4647 4648 4649 4650 4651 4652
int32_t tsdbReaderSuspend(STsdbReader* pReader) {
  int32_t code = 0;

  // save reader's base state & reset top state to be reconstructed from base state
  SReaderStatus*       pStatus = &pReader->status;
  STableBlockScanInfo* pBlockScanInfo = NULL;

  if (pStatus->loadFromFile) {
    SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
    if (pBlockInfo != NULL) {
H
Haojun Liao 已提交
4653
      pBlockScanInfo = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, pReader->idStr);
4654 4655 4656 4657
      if (pBlockScanInfo == NULL) {
        goto _err;
      }
    } else {
4658
      pBlockScanInfo = *pStatus->pTableIter;
4659 4660 4661 4662 4663
    }

    tsdbDataFReaderClose(&pReader->pFileReader);

    // resetDataBlockScanInfo excluding lastKey
4664
    STableBlockScanInfo** p = NULL;
H
Haojun Liao 已提交
4665
    int32_t iter = 0;
4666

H
Haojun Liao 已提交
4667
    while ((p = tSimpleHashIterate(pStatus->pTableMap, p, &iter)) != NULL) {
4668 4669 4670 4671 4672 4673 4674 4675
      STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;

      pInfo->iterInit = false;
      pInfo->iter.hasVal = false;
      pInfo->iiter.hasVal = false;

      if (pInfo->iter.iter != NULL) {
        pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
4676 4677
      }

4678 4679 4680 4681 4682
      if (pInfo->iiter.iter != NULL) {
        pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
      }

      pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
4683 4684
    }
  } else {
4685 4686
    // resetDataBlockScanInfo excluding lastKey
    STableBlockScanInfo** p = NULL;
H
Haojun Liao 已提交
4687
    int32_t iter = 0;
4688

H
Haojun Liao 已提交
4689
    while ((p = tSimpleHashIterate(pStatus->pTableMap, p, &iter)) != NULL) {
4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706
      STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;

      pInfo->iterInit = false;
      pInfo->iter.hasVal = false;
      pInfo->iiter.hasVal = false;

      if (pInfo->iter.iter != NULL) {
        pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
      }

      if (pInfo->iiter.iter != NULL) {
        pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
      }

      pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
    }

4707
    pBlockScanInfo = pStatus->pTableIter == NULL ? NULL : *pStatus->pTableIter;
4708 4709
    if (pBlockScanInfo) {
      // save lastKey to restore memory iterator
4710
      STimeWindow w = pReader->resBlockInfo.pResBlock->info.window;
4711 4712 4713 4714
      pBlockScanInfo->lastKey = ASCENDING_TRAVERSE(pReader->order) ? w.ekey : w.skey;

      // reset current current table's data block scan info,
      pBlockScanInfo->iterInit = false;
4715 4716
      pBlockScanInfo->iter.hasVal = false;
      pBlockScanInfo->iiter.hasVal = false;
4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731
      if (pBlockScanInfo->iter.iter != NULL) {
        pBlockScanInfo->iter.iter = tsdbTbDataIterDestroy(pBlockScanInfo->iter.iter);
      }

      if (pBlockScanInfo->iiter.iter != NULL) {
        pBlockScanInfo->iiter.iter = tsdbTbDataIterDestroy(pBlockScanInfo->iiter.iter);
      }

      pBlockScanInfo->pBlockList = taosArrayDestroy(pBlockScanInfo->pBlockList);
      tMapDataClear(&pBlockScanInfo->mapData);
      // TODO: keep skyline for reuse
      pBlockScanInfo->delSkyline = taosArrayDestroy(pBlockScanInfo->delSkyline);
    }
  }

4732
  tsdbUntakeReadSnap(pReader, pReader->pReadSnap, false);
4733
  pReader->pReadSnap = NULL;
H
Haojun Liao 已提交
4734
  pReader->flag = READER_STATUS_SUSPEND;
4735

4736 4737
  tsdbDebug("reader: %p suspended uid %" PRIu64 " in this query %s", pReader, pBlockScanInfo ? pBlockScanInfo->uid : 0,
            pReader->idStr);
4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748
  return code;

_err:
  tsdbError("failed to suspend data reader, code:%s %s", tstrerror(code), pReader->idStr);
  return code;
}

static int32_t tsdbSetQueryReseek(void* pQHandle) {
  int32_t      code = 0;
  STsdbReader* pReader = pQHandle;

4749
  code = tsdbTryAcquireReader(pReader);
4750
  if (code == 0) {
H
Haojun Liao 已提交
4751
    if (pReader->flag == READER_STATUS_SUSPEND) {
4752
      tsdbReleaseReader(pReader);
4753 4754 4755 4756
      return code;
    }

    tsdbReaderSuspend(pReader);
4757

4758
    tsdbReleaseReader(pReader);
4759

4760
    return code;
4761 4762 4763
  } else if (code == EBUSY) {
    return TSDB_CODE_VND_QUERY_BUSY;
  } else {
4764 4765
    terrno = TAOS_SYSTEM_ERROR(code);
    return TSDB_CODE_FAILED;
4766 4767 4768 4769 4770 4771
  }
}

// Resume a suspended reader: take a fresh read snapshot and re-open the reader on top of it. For a reader
// whose type is not TIMEWINDOW_RANGE_CONTAINED, the two inner readers get the shared pointers of this reader
// again and only the first (previous-row) inner reader is opened here.
//
// When the reader has no table at all, no snapshot is taken and the reader is simply flagged as normal.
// Returns TSDB_CODE_SUCCESS, or the error of tsdbTakeReadSnap()/doOpenReaderImpl(); every failure is logged.
int32_t tsdbReaderResume(STsdbReader* pReader) {
  int32_t code = 0;

  STableBlockScanInfo** pBlockScanInfo = pReader->status.pTableIter;

  //  restore reader's state
  //  task snapshot
  int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  if (numOfTables > 0) {
    qTrace("tsdb/reader: %p, take snapshot", pReader);
    code = tsdbTakeReadSnap(pReader, tsdbSetQueryReseek, &pReader->pReadSnap);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    // From here on the reader owns a snapshot, so it must no longer look suspended: tsdbSetQueryReseek() skips
    // readers flagged READER_STATUS_SUSPEND, and a failure below would otherwise leave a reader that holds a
    // snapshot but is never asked to give it back.
    pReader->flag = READER_STATUS_NORMAL;

    if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
      code = doOpenReaderImpl(pReader);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    } else {
      STsdbReader* pPrevReader = pReader->innerReader[0];
      STsdbReader* pNextReader = pReader->innerReader[1];

      // we need only one row
      pPrevReader->resBlockInfo.capacity = 1;
      setSharedPtr(pPrevReader, pReader);

      pNextReader->resBlockInfo.capacity = 1;
      setSharedPtr(pNextReader, pReader);

      code = doOpenReaderImpl(pPrevReader);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  }

  pReader->flag = READER_STATUS_NORMAL;
  tsdbDebug("reader: %p resumed uid %" PRIu64 ", numOfTable:%" PRId32 ", in this query %s", pReader,
            pBlockScanInfo ? (*pBlockScanInfo)->uid : 0, numOfTables, pReader->idStr);
  return code;

_err:
  tsdbError("failed to resume data reader, code:%s %s", tstrerror(code), pReader->idStr);
  return code;
}

D
dapan1121 已提交
4817
// Count-only read: collect the number of rows from the files and from memory and publish it through the
// `rows` field of the result block (uid and dataLoad are cleared). The reader's row counter is reset
// afterwards.
//
// Returns true when the result block reports at least one row. Returns false when the reader does not load
// from file any more, or when one of the two counting steps fails (the error code is not reported).
static bool tsdbReadRowsCountOnly(STsdbReader* pReader) {
  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  if (!pReader->status.loadFromFile) {
    return false;
  }

  if (readRowsCountFromFiles(pReader) != TSDB_CODE_SUCCESS) {
    return false;
  }

  if (readRowsCountFromMem(pReader) != TSDB_CODE_SUCCESS) {
    return false;
  }

  pResBlock->info.rows = pReader->rowsNum;
  pResBlock->info.id.uid = 0;
  pResBlock->info.dataLoad = 0;

  // the counter has been handed over to the result block
  pReader->rowsNum = 0;

  return pResBlock->info.rows > 0;
}

4844
// Produce the next result block of one reader (without any locking and without handling inner readers).
//
// pReader  the reader to advance; its result block is cleaned up first
// hasNext  set to true when the result block holds rows after the call, false otherwise
//
// Returns TSDB_CODE_SUCCESS or the error of the block building routines. In count-only mode the outcome of
// tsdbReadRowsCountOnly() is reported through *hasNext and the return value is TSDB_CODE_SUCCESS.
static int32_t doTsdbNextDataBlock(STsdbReader* pReader, bool* hasNext) {
  int32_t code = TSDB_CODE_SUCCESS;

  // cleanup the data that belongs to the previous data block
  SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
  blockDataCleanup(pBlock);

  *hasNext = false;

  SReaderStatus* pStatus = &pReader->status;
  if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
    return code;
  }

  if (READ_MODE_COUNT_ONLY == pReader->readMode) {
    // tsdbReadRowsCountOnly() answers "is there a block", not "what is the error code": its boolean result
    // belongs in *hasNext. Returning it directly would report `true` as error code 1 and leave *hasNext false.
    *hasNext = tsdbReadRowsCountOnly(pReader);
    return code;
  }

  if (pStatus->loadFromFile) {
    code = buildBlockFromFiles(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // nothing (left) in the files, continue with the buffer starting from the first table
    if (pBlock->info.rows <= 0) {
      resetTableListIndex(&pReader->status);
      code = buildBlockFromBufferSequentially(pReader);
    }
  } else {  // no data in files, let's try the buffer
    code = buildBlockFromBufferSequentially(pReader);
  }

  *hasNext = pBlock->info.rows > 0;

  return code;
}

4881
// Advance the reader to its next data block, resuming it first when it is suspended. For a reader with inner
// readers the scan runs in three steps: the previous-row reader (innerReader[0]), the main scan, and the
// next-row reader (innerReader[1]).
//
// pReader  the reader
// hasNext  set to true when a block is available, false otherwise
//
// Locking: the reader lock is taken here. It is released before returning in every case except one: when a
// block is available and it is NOT a composed block, the lock stays held and is released by
// tsdbRetrieveDataBlock().
//
// Returns TSDB_CODE_SUCCESS, the error stored in pReader->code, or the error of the failing step.
int32_t tsdbNextDataBlock(STsdbReader* pReader, bool* hasNext) {
  int32_t code = TSDB_CODE_SUCCESS;

  *hasNext = false;

  if (isEmptyQueryTimeWindow(&pReader->window) || pReader->step == EXTERNAL_ROWS_NEXT ||
      pReader->code != TSDB_CODE_SUCCESS) {
    return (pReader->code != TSDB_CODE_SUCCESS) ? pReader->code : code;
  }

  SReaderStatus* pStatus = &pReader->status;

  code = tsdbAcquireReader(pReader);
  qTrace("tsdb/read: %p, take read mutex, code: %d", pReader, code);

  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  if (pReader->innerReader[0] != NULL && pReader->step == 0) {
    code = doTsdbNextDataBlock(pReader->innerReader[0], hasNext);
    if (code) {
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_PREV;
    if (*hasNext) {
      pStatus = &pReader->innerReader[0]->status;
      if (pStatus->composedDataBlock) {
        qTrace("tsdb/read: %p, unlock read mutex", pReader);
        tsdbReleaseReader(pReader);
      }

      return code;
    }
  }

  if (pReader->step == EXTERNAL_ROWS_PREV) {
    // prepare for the main scan
    code = doOpenReaderImpl(pReader);
    int32_t step = 1;
    resetAllDataBlockScanInfo(pReader->status.pTableMap, pReader->innerReader[0]->window.ekey, step);

    if (code != TSDB_CODE_SUCCESS) {
      // no block is handed out, so nobody else would release the lock taken above
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_MAIN;
  }

  code = doTsdbNextDataBlock(pReader, hasNext);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbReleaseReader(pReader);
    return code;
  }

  if (*hasNext) {
    if (pStatus->composedDataBlock) {
      qTrace("tsdb/read: %p, unlock read mutex", pReader);
      tsdbReleaseReader(pReader);
    }

    return code;
  }

  if (pReader->step == EXTERNAL_ROWS_MAIN && pReader->innerReader[1] != NULL) {
    // prepare for the next row scan
    int32_t step = -1;
    code = doOpenReaderImpl(pReader->innerReader[1]);
    resetAllDataBlockScanInfo(pReader->innerReader[1]->status.pTableMap, pReader->window.ekey, step);
    if (code != TSDB_CODE_SUCCESS) {
      // no block is handed out, so nobody else would release the lock taken above
      tsdbReleaseReader(pReader);
      return code;
    }

    code = doTsdbNextDataBlock(pReader->innerReader[1], hasNext);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_NEXT;
    if (*hasNext) {
      pStatus = &pReader->innerReader[1]->status;
      if (pStatus->composedDataBlock) {
        qTrace("tsdb/read: %p, unlock read mutex", pReader);
        tsdbReleaseReader(pReader);
      }

      return code;
    }
  }

  qTrace("tsdb/read: %p, unlock read mutex", pReader);
  tsdbReleaseReader(pReader);

  return code;
}

4983
// Complete the statistics list pSup->pColAgg for the requested columns pSup->colId[0..numOfCols):
//   1. pTsAgg is inserted at the front of the list;
//   2. the list and the requested column ids are walked side by side (both are compared by column id), and
//      for every requested column without an entry - except the primary timestamp column - an entry with
//      numOfNull == numOfRows is inserted at the current list position;
//   3. requested columns that remain after the list is exhausted are handled the same way.
static void doFillNullColSMA(SBlockLoadSuppInfo* pSup, int32_t numOfRows, int32_t numOfCols, SColumnDataAgg* pTsAgg) {
  int32_t aggIdx = 0;
  int32_t colIdx = 0;

  int32_t numOfAgg = (int32_t)taosArrayGetSize(pSup->pColAgg);
  taosArrayInsert(pSup->pColAgg, 0, pTsAgg);
  numOfAgg += 1;

  while (colIdx < numOfCols && aggIdx < numOfAgg) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, aggIdx);

    if (pAgg->colId < pSup->colId[colIdx]) {
      // statistics of a column that is not requested: skip the entry
      aggIdx += 1;
      continue;
    }

    if (pAgg->colId == pSup->colId[colIdx]) {
      // the requested column has its statistics already
      aggIdx += 1;
      colIdx += 1;
      continue;
    }

    // the requested column has no entry in the list
    if (pSup->colId[colIdx] != PRIMARYKEY_TIMESTAMP_COL_ID) {
      SColumnDataAgg allNullAgg = {.colId = pSup->colId[colIdx], .numOfNull = numOfRows};
      taosArrayInsert(pSup->pColAgg, aggIdx, &allNullAgg);
      aggIdx += 1;
      numOfAgg += 1;
    }
    colIdx += 1;
  }

  // requested columns beyond the last entry of the list
  for (; colIdx < numOfCols; ++colIdx) {
    if (pSup->colId[colIdx] == PRIMARYKEY_TIMESTAMP_COL_ID) {
      continue;
    }

    SColumnDataAgg allNullAgg = {.colId = pSup->colId[colIdx], .numOfNull = numOfRows};
    taosArrayInsert(pSup->pColAgg, aggIdx, &allNullAgg);
    aggIdx += 1;
  }
}

5018
// Load the pre-computed column statistics (SMA) of the current file block.
//
// pReader     the reader
// pDataBlock  its pBlockAgg member is reset to NULL first and, on success, set to the statistics pointer array
//             of the reader's result block (indexed by slot id; NULL for a column without statistics)
// allHave     set to true only when statistics are available for every requested column
//
// No statistics are produced - and TSDB_CODE_SUCCESS is returned with pBlockAgg == NULL - when the reader type
// is TIMEWINDOW_RANGE_EXTERNAL, the current block is a composed block, the SMA is flagged invalid, the result
// block belongs to another table than the current file block, or the file block carries no SMA.
//
// Returns TSDB_CODE_SUCCESS, the error of tsdbReadBlockSma(), or TSDB_CODE_OUT_OF_MEMORY when the pointer
// array cannot be allocated.
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SSDataBlock* pDataBlock, bool* allHave) {
  SColumnDataAgg*** pBlockSMA = &pDataBlock->pBlockAgg;

  int32_t code = 0;
  *allHave = false;
  *pBlockSMA = NULL;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    return TSDB_CODE_SUCCESS;
  }

  // there is no statistics data for composed block
  if (pReader->status.composedDataBlock || (!pReader->suppInfo.smaValid)) {
    return TSDB_CODE_SUCCESS;
  }

  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

  if (pReader->resBlockInfo.pResBlock->info.id.uid != pFBlock->uid) {
    return TSDB_CODE_SUCCESS;
  }

  int64_t st = taosGetTimestampUs();

  SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
  if (tDataBlkHasSma(pBlock)) {
    code = tsdbReadBlockSma(pReader->pFileReader, pBlock, pSup->pColAgg);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pFBlock->uid, tstrerror(code),
                pReader->idStr);
      return code;
    }
  } else {
    *pBlockSMA = NULL;
    return TSDB_CODE_SUCCESS;
  }

  // make sure the pointer array exists before anything is written through it
  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
  if (pResBlock->pBlockAgg == NULL) {
    size_t num = taosArrayGetSize(pResBlock->pDataBlock);
    pResBlock->pBlockAgg = taosMemoryCalloc(num, POINTER_BYTES);
    if (pResBlock->pBlockAgg == NULL) {
      // *allHave is still false and pDataBlock->pBlockAgg is still NULL at this point
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  *allHave = true;

  // always load the first primary timestamp column data
  SColumnDataAgg* pTsAgg = &pSup->tsColAgg;

  pTsAgg->numOfNull = 0;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  pTsAgg->min = pReader->resBlockInfo.pResBlock->info.window.skey;
  pTsAgg->max = pReader->resBlockInfo.pResBlock->info.window.ekey;

  // update the number of NULL data rows
  size_t numOfCols = pSup->numOfCols;

  // ensure capacity
  if (pDataBlock->pDataBlock) {
    size_t colsNum = taosArrayGetSize(pDataBlock->pDataBlock);
    taosArrayEnsureCap(pSup->pColAgg, colsNum);
  }

  // do fill all null column value SMA info
  doFillNullColSMA(pSup, pBlock->nRow, numOfCols, pTsAgg);
  size_t size = taosArrayGetSize(pSup->pColAgg);

  // walk the statistics list and the requested column ids side by side and publish, per slot, either the
  // matching statistics entry or NULL
  int32_t i = 0, j = 0;
  while (j < numOfCols && i < size) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, i);
    if (pAgg->colId == pSup->colId[j]) {
      pResBlock->pBlockAgg[pSup->slotId[j]] = pAgg;
      i += 1;
      j += 1;
    } else if (pAgg->colId < pSup->colId[j]) {
      i += 1;
    } else if (pSup->colId[j] < pAgg->colId) {
      pResBlock->pBlockAgg[pSup->slotId[j]] = NULL;
      *allHave = false;
      j += 1;
    }
  }

  *pBlockSMA = pResBlock->pBlockAgg;
  pReader->cost.smaDataLoad += 1;

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.smaLoadTime += elapsedTime;

  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", %s", 0, pFBlock->uid, pReader->idStr);
  return code;
}

H
Haojun Liao 已提交
5111 5112
// Look up the scan info of table `uid` in pTableMap.
//
// pTableMap  hash map keyed by table uid
// uid        the table to look for
// id         identifier string that is only used in the error log
//
// Returns the scan info, or NULL - with terrno set to TSDB_CODE_INVALID_PARA and an error logged - when the
// uid is not in the map or its entry is NULL.
STableBlockScanInfo* getTableBlockScanInfo(SSHashObj* pTableMap, uint64_t uid, const char* id) {
  STableBlockScanInfo** ppInfo = tSimpleHashGet(pTableMap, &uid, sizeof(uid));
  if (ppInfo != NULL && *ppInfo != NULL) {
    return *ppInfo;
  }

  terrno = TSDB_CODE_INVALID_PARA;
  int32_t numOfTables = tSimpleHashGetSize(pTableMap);
  tsdbError("failed to locate the uid:%" PRIu64 " in query table uid list, total tables:%d, %s", uid, numOfTables, id);
  return NULL;
}

H
Haojun Liao 已提交
5123
// Load the data of the current file block and copy it into the reader's result block.
//
// Returns the result block on success. Returns NULL when the reader already carries an error in
// pReader->code, when there is no current file block or its table is unknown (terrno is
// TSDB_CODE_INVALID_PARA in both cases), or when loading/copying the block fails (terrno is set to the
// error code and the file block data is destroyed).
static SSDataBlock* doRetrieveDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  int32_t             code = TSDB_CODE_SUCCESS;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pStatus->blockIter);

  if (pReader->code != TSDB_CODE_SUCCESS) {
    return NULL;
  }

  // the block iterator may have no current block (tsdbReaderSuspend() guards against this as well)
  if (pBlockInfo == NULL) {
    terrno = TSDB_CODE_INVALID_PARA;
    return NULL;
  }

  STableBlockScanInfo* pBlockScanInfo = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, pReader->idStr);
  if (pBlockScanInfo == NULL) {
    return NULL;
  }

  code = doLoadFileBlockData(pReader, &pStatus->blockIter, &pStatus->fileBlockData, pBlockScanInfo->uid);
  if (code != TSDB_CODE_SUCCESS) {
    tBlockDataDestroy(&pStatus->fileBlockData);
    terrno = code;
    return NULL;
  }

  code = copyBlockDataToSDataBlock(pReader);
  if (code != TSDB_CODE_SUCCESS) {
    tBlockDataDestroy(&pStatus->fileBlockData);
    terrno = code;
    return NULL;
  }

  return pReader->resBlockInfo.pResBlock;
}

H
Haojun Liao 已提交
5154
// Hand out the block announced by the preceding tsdbNextDataBlock() call.
//
// For a TIMEWINDOW_RANGE_EXTERNAL reader the block comes from innerReader[0] while the step is
// EXTERNAL_ROWS_PREV and from innerReader[1] while it is EXTERNAL_ROWS_NEXT; otherwise from pReader itself.
// A composed block is returned as is. For any other block the data is loaded from file first and the reader
// lock of pReader is released afterwards. pIdList is not used.
//
// Returns the result block, or NULL when loading the file block failed.
SSDataBlock* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
  STsdbReader* pSrcReader = pReader;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    switch (pReader->step) {
      case EXTERNAL_ROWS_PREV:
        pSrcReader = pReader->innerReader[0];
        break;
      case EXTERNAL_ROWS_NEXT:
        pSrcReader = pReader->innerReader[1];
        break;
      default:
        break;
    }
  }

  if (pSrcReader->status.composedDataBlock) {
    return pSrcReader->resBlockInfo.pResBlock;
  }

  SSDataBlock* pResult = doRetrieveDataBlock(pSrcReader);

  qTrace("tsdb/read-retrieve: %p, unlock read mutex", pReader);
  tsdbReleaseReader(pReader);

  return pResult;
}

H
Haojun Liao 已提交
5177
// Restart the reader with a new query condition: scan order and time window are taken from pCond, the reader
// type becomes TIMEWINDOW_RANGE_CONTAINED, and the file set iterator, the block iterator, the table list index
// and the scan info of all tables are reset. A suspended reader is resumed first.
//
// Nothing is reset (and TSDB_CODE_SUCCESS is returned) when the reader's current time window is empty or the
// reader has no read snapshot. The reader lock is held for the duration of the call.
//
// Returns TSDB_CODE_SUCCESS, or the error of tsdbReaderResume()/initForFirstBlockInFile().
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
  int32_t code = TSDB_CODE_SUCCESS;

  qTrace("tsdb/reader-reset: %p, take read mutex", pReader);
  tsdbAcquireReader(pReader);

  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  if (isEmptyQueryTimeWindow(&pReader->window) || pReader->pReadSnap == NULL) {
    tsdbDebug("tsdb reader reset return %p, %s", pReader->pReadSnap, pReader->idStr);
    tsdbReleaseReader(pReader);
    return TSDB_CODE_SUCCESS;
  }

  SReaderStatus*  pReaderStatus = &pReader->status;
  SDataBlockIter* pIter = &pReaderStatus->blockIter;

  // install the new query condition
  pReader->order = pCond->order;
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
  pReaderStatus->loadFromFile = true;
  pReaderStatus->pTableIter = NULL;
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);

  // allocate buffer in order to load data blocks from file
  memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));

  pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  tsdbDataFReaderClose(&pReader->pFileReader);

  int32_t tableCount = tSimpleHashGetSize(pReaderStatus->pTableMap);

  initFilesetIterator(&pReaderStatus->fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(pIter, pReader->order);
  resetTableListIndex(&pReader->status);

  // the key handed to the scan info reset lies just outside the query window, on the side the scan starts from
  int32_t direction;
  int64_t startKey;
  if (ASCENDING_TRAVERSE(pReader->order)) {
    direction = 1;
    startKey = pReader->window.skey - 1;
  } else {
    direction = -1;
    startKey = pReader->window.ekey + 1;
  }
  resetAllDataBlockScanInfo(pReaderStatus->pTableMap, startKey, direction);

  if (pReaderStatus->fileIter.numOfFiles != 0) {
    code = initForFirstBlockInFile(pReader, pIter);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                tableCount, pReader->window.skey, pReader->window.ekey, pReader->idStr);

      tsdbReleaseReader(pReader);
      return code;
    }
  } else {
    // no data in files, let's try buffer in memory
    pReaderStatus->loadFromFile = false;
    resetTableListIndex(pReaderStatus);
  }

  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, skey:%" PRId64 ", query range:%" PRId64 " - %" PRId64
            " in query %s",
            pReader, pReader->suid, tableCount, pCond->twindows.skey, pReader->window.skey, pReader->window.ekey,
            pReader->idStr);

  tsdbReleaseReader(pReader);

  return code;
}
H
Hongze Cheng 已提交
5247

5248
/**
 * Map a block's row count onto a histogram bucket.
 *
 * Bucket i covers row counts in [startRow + i * bucketRange, startRow + (i + 1) * bucketRange).
 * The result is always clamped into [0, numOfBucket - 1] so that it can be used
 * directly as an array index by the caller.
 *
 * @param startRow     row count at which bucket 0 begins
 * @param bucketRange  width of one bucket, in rows
 * @param numOfRows    row count of the block being classified
 * @param numOfBucket  number of buckets in the histogram
 * @return bucket index in [0, numOfBucket - 1]; 0 when the arguments are degenerate
 *         (non-positive bucketRange / numOfBucket) or numOfRows is below startRow
 */
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows, int32_t numOfBucket) {
  // degenerate histogram, or a block smaller than the first bucket: everything lands in bucket 0
  if (numOfBucket <= 0 || bucketRange <= 0 || numOfRows <= startRow) {
    return 0;
  }

  // widen before subtracting so that extreme arguments cannot overflow int32_t
  int64_t bucketIndex = ((int64_t)numOfRows - (int64_t)startRow) / bucketRange;

  // blocks at or beyond the upper bound are counted in the last bucket
  if (bucketIndex >= numOfBucket) {
    bucketIndex = numOfBucket - 1;
  }

  return (int32_t)bucketIndex;
}
H
Hongze Cheng 已提交
5255

5256 5257 5258 5259
/**
 * Walk every data block reachable through the reader's file-set iterator and
 * accumulate block distribution statistics into pTableBlockInfo.
 *
 * totalSize, totalRows and numOfVgroups are (re)initialised here; numOfFiles,
 * numOfBlocks, numOfSmallBlocks, minRows, maxRows and blockRowsHisto are only
 * updated relative to whatever the caller stored in them beforehand.
 *
 * The reader is acquired for the duration of the call and resumed first if it
 * is currently suspended.
 *
 * @param pReader          reader whose file blocks are inspected
 * @param pTableBlockInfo  [in/out] statistics accumulator
 * @return TSDB_CODE_SUCCESS, or the error code returned by tsdbReaderResume /
 *         initForFirstBlockInFile
 */
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  int32_t code = TSDB_CODE_SUCCESS;
  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;
  pTableBlockInfo->numOfVgroups = 1;

  // number of histogram buckets; blockRowsHisto is indexed with [0, numOfBuckets - 1]
  const int32_t numOfBuckets = 20;

  // find the start data block in file
  tsdbAcquireReader(pReader);
  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }
  SReaderStatus* pStatus = &pReader->status;

  STsdbCfg* pc = &pReader->pTsdb->pVnode->config.tsdbCfg;
  pTableBlockInfo->defMinRows = pc->minRows;
  pTableBlockInfo->defMaxRows = pc->maxRows;

  int32_t bucketRange = ceil(((double)(pc->maxRows - pc->minRows)) / numOfBuckets);
  if (bucketRange < 1) {
    // maxRows <= minRows would otherwise make the bucket width 0 and the index computation divide by zero
    bucketRange = 1;
  }

  // NOTE(review): numOfFiles is bumped by one here and by fileIter.numOfFiles below; kept as in the original
  pTableBlockInfo->numOfFiles += 1;

  int32_t numOfTables = (int32_t)tSimpleHashGetSize(pStatus->pTableMap);
  int     defaultRows = 4096;  // blocks with fewer rows than this are counted as "small"

  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;

  if (pBlockIter->numOfBlocks > 0) {
    pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
  }

  pTableBlockInfo->numOfTables = numOfTables;
  bool hasNext = (pBlockIter->numOfBlocks > 0);

  while (true) {
    if (hasNext) {
      SDataBlk* pBlock = getCurrentBlock(pBlockIter);

      int32_t numOfRows = pBlock->nRow;
      pTableBlockInfo->totalRows += numOfRows;

      if (numOfRows > pTableBlockInfo->maxRows) {
        pTableBlockInfo->maxRows = numOfRows;
      }

      if (numOfRows < pTableBlockInfo->minRows) {
        pTableBlockInfo->minRows = numOfRows;
      }

      if (numOfRows < defaultRows) {
        pTableBlockInfo->numOfSmallBlocks += 1;
      }

      pTableBlockInfo->totalSize += pBlock->aSubBlock[0].szBlock;

      // blocks may hold fewer rows than defMinRows or more than defMaxRows; clamp so the
      // histogram is never written out of bounds
      int32_t bucketIndex = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, numOfRows, numOfBuckets);
      if (bucketIndex < 0) {
        bucketIndex = 0;
      } else if (bucketIndex >= numOfBuckets) {
        bucketIndex = numOfBuckets - 1;
      }
      pTableBlockInfo->blockRowsHisto[bucketIndex]++;

      hasNext = blockIteratorNext(&pStatus->blockIter, pReader->idStr);
    } else {
      // current file set is exhausted, move on to the next one (if any)
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if ((code != TSDB_CODE_SUCCESS) || (pStatus->loadFromFile == false)) {
        break;
      }

      pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
      hasNext = (pBlockIter->numOfBlocks > 0);
    }
  }

  tsdbReleaseReader(pReader);
  return code;
}
H
Hongze Cheng 已提交
5337

H
refact  
Hongze Cheng 已提交
5338
/**
 * Count the rows held in the snapshot's mem and imem tables for every table
 * registered in the reader's table map.
 *
 * The reader is acquired for the duration of the call and resumed first if it
 * is currently suspended.
 *
 * @param pReader  reader whose read snapshot is inspected
 * @return total number of rows found; if resuming the reader fails, the error
 *         code from tsdbReaderResume is returned instead
 *         (NOTE(review): callers cannot tell that apart from a row count by type alone)
 */
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;
  int64_t        rows = 0;

  tsdbAcquireReader(pReader);
  if (pReader->flag == READER_STATUS_SUSPEND) {
    int32_t code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  int32_t iter = 0;
  for (pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, NULL, &iter); pStatus->pTableIter != NULL;
       pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, pStatus->pTableIter, &iter)) {
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;

    // rows buffered in the mutable memtable
    if (pReader->pReadSnap->pMem != NULL) {
      STbData* pMemData = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pScanInfo->uid);
      if (pMemData != NULL) {
        rows += tsdbGetNRowsInTbData(pMemData);
      }
    }

    // rows buffered in the immutable memtable
    if (pReader->pReadSnap->pIMem != NULL) {
      STbData* pIMemData = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pScanInfo->uid);
      if (pIMemData != NULL) {
        rows += tsdbGetNRowsInTbData(pIMemData);
      }
    }
  }

  tsdbReleaseReader(pReader);

  return rows;
}
D
dapan1121 已提交
5382

L
Liu Jicong 已提交
5383
/**
 * Fetch the newest schema of a child table or a normal table.
 *
 * For a child table the owning super table's uid is reported through *suid and
 * its meta entry is looked up as well; for a normal table *suid is set to 0.
 * Any other table type is rejected.
 *
 * @param pVnode   vnode whose meta store is queried
 * @param uid      uid of the table
 * @param pSchema  [out] receives the schema produced by metaGetTbTSchemaEx
 * @param suid     [out] super table uid, or 0 for a normal table
 * @return result of metaGetTbTSchemaEx on success;
 *         TSDB_CODE_TDB_INVALID_TABLE_ID if a meta lookup fails,
 *         TSDB_CODE_INVALID_PARA for an unsupported table type (terrno is set in both cases)
 */
int32_t tsdbGetTableSchema(SVnode* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
  SMetaReader reader = {0};
  metaReaderInit(&reader, pVnode->pMeta, 0);

  if (metaGetTableEntryByUidCache(&reader, uid) != TSDB_CODE_SUCCESS) {
    terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
    goto _fail;
  }

  *suid = 0;

  // only child tables and normal tables are accepted, super tables are not
  if (reader.me.type == TSDB_CHILD_TABLE) {
    tDecoderClear(&reader.coder);
    *suid = reader.me.ctbEntry.suid;

    if (metaGetTableEntryByUidCache(&reader, *suid) != TSDB_CODE_SUCCESS) {
      terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
      goto _fail;
    }
  } else if (reader.me.type != TSDB_NORMAL_TABLE) {
    terrno = TSDB_CODE_INVALID_PARA;
    goto _fail;
  }

  metaReaderClear(&reader);

  // -1: ask for the newest schema version
  return metaGetTbTSchemaEx(pVnode->pMeta, *suid, uid, -1, pSchema);

_fail:
  metaReaderClear(&reader);
  return terrno;
}
H
Hongze Cheng 已提交
5418

H
Hongze Cheng 已提交
5419
/**
 * Take a read snapshot of the tsdb for a reader: reference the mem / imem
 * tables whose version range overlaps the reader's, and reference the current
 * file system state.
 *
 * All references are taken while holding pTsdb->rwLock in read mode. On
 * failure every reference that was already taken on a memtable is dropped
 * again before the query nodes are freed, so no memtable keeps a registration
 * that points at freed memory.
 *
 * @param pReader  reader the snapshot is taken for; stored as the query handle of each node
 * @param reseek   callback stored in each query node
 * @param ppSnap   [out] the new snapshot on success, NULL on failure
 * @return 0 on success, TSDB_CODE_OUT_OF_MEMORY or the error from tsdbFSRef on failure
 */
int32_t tsdbTakeReadSnap(STsdbReader* pReader, _query_reseek_func_t reseek, STsdbReadSnap** ppSnap) {
  int32_t        code = 0;
  STsdb*         pTsdb = pReader->pTsdb;
  SVersionRange* pRange = &pReader->verRange;

  // alloc (zero-filled, so pMem / pIMem / pNode / pINode start out NULL)
  STsdbReadSnap* pSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(*pSnap));
  if (pSnap == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _exit;
  }

  // lock
  taosThreadRwlockRdlock(&pTsdb->rwLock);

  // take snapshot of the mutable memtable
  if (pTsdb->mem && (pRange->minVer <= pTsdb->mem->maxVer && pRange->maxVer >= pTsdb->mem->minVer)) {
    pSnap->pNode = taosMemoryMalloc(sizeof(*pSnap->pNode));
    if (pSnap->pNode == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _unlock;
    }
    pSnap->pNode->pQHandle = pReader;
    pSnap->pNode->reseek = reseek;

    // pMem is only set once the reference is really taken, so pMem != NULL <=> referenced
    tsdbRefMemTable(pTsdb->mem, pSnap->pNode);
    pSnap->pMem = pTsdb->mem;
  }

  // take snapshot of the immutable memtable
  if (pTsdb->imem && (pRange->minVer <= pTsdb->imem->maxVer && pRange->maxVer >= pTsdb->imem->minVer)) {
    pSnap->pINode = taosMemoryMalloc(sizeof(*pSnap->pINode));
    if (pSnap->pINode == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _unlock;
    }
    pSnap->pINode->pQHandle = pReader;
    pSnap->pINode->reseek = reseek;

    tsdbRefMemTable(pTsdb->imem, pSnap->pINode);
    pSnap->pIMem = pTsdb->imem;
  }

  // fs
  code = tsdbFSRef(pTsdb, &pSnap->fs);

_unlock:
  taosThreadRwlockUnlock(&pTsdb->rwLock);

  if (code == 0) {
    tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode));
  }

_exit:
  if (code) {
    *ppSnap = NULL;
    if (pSnap) {
      // undo the memtable references before their query nodes are freed.
      // NOTE(review): proactive=true is assumed to be the right mode for a reader-initiated release - confirm
      if (pSnap->pIMem) tsdbUnrefMemTable(pSnap->pIMem, pSnap->pINode, true);
      if (pSnap->pMem) tsdbUnrefMemTable(pSnap->pMem, pSnap->pNode, true);

      if (pSnap->pNode) taosMemoryFree(pSnap->pNode);
      if (pSnap->pINode) taosMemoryFree(pSnap->pINode);
      taosMemoryFree(pSnap);
    }
  } else {
    *ppSnap = pSnap;
  }
  return code;
}

5489
/**
 * Release a read snapshot: drop the references it holds on the mem / imem
 * tables and on the file system state, then free its query nodes and the
 * snapshot itself. A NULL snapshot is accepted; only the trace line is
 * emitted in that case.
 *
 * @param pReader    reader that owns the snapshot
 * @param pSnap      snapshot to release, may be NULL
 * @param proactive  forwarded unchanged to tsdbUnrefMemTable
 */
void tsdbUntakeReadSnap(STsdbReader* pReader, STsdbReadSnap* pSnap, bool proactive) {
  STsdb* pTsdb = pReader->pTsdb;

  if (pSnap != NULL) {
    // 1. memtable references
    if (pSnap->pMem != NULL) {
      tsdbUnrefMemTable(pSnap->pMem, pSnap->pNode, proactive);
    }
    if (pSnap->pIMem != NULL) {
      tsdbUnrefMemTable(pSnap->pIMem, pSnap->pINode, proactive);
    }

    // 2. file system reference
    tsdbFSUnref(pTsdb, &pSnap->fs);

    // 3. memory owned by the snapshot
    if (pSnap->pNode != NULL) {
      taosMemoryFree(pSnap->pNode);
    }
    if (pSnap->pINode != NULL) {
      taosMemoryFree(pSnap->pINode);
    }
    taosMemoryFree(pSnap);
  }

  tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
}
5508 5509 5510 5511 5512

/**
 * Replace the reader's id string with a copy of idstr.
 *
 * If idstr is NULL or the copy cannot be allocated, the reader is left
 * untouched and keeps its previous id string.
 *
 * @param pReader  reader to update
 * @param idstr    new id string; copied, the caller keeps ownership
 */
void tsdbReaderSetId(STsdbReader* pReader, const char* idstr) {
  if (idstr == NULL) {
    return;
  }

  // duplicate first: keeps the old id on allocation failure, and stays correct
  // even when idstr points at the reader's current id string
  char* newId = taosStrdup(idstr);
  if (newId == NULL) {
    return;
  }

  taosMemoryFreeClear(pReader->idStr);
  pReader->idStr = newId;
}
H
Haojun Liao 已提交
5514

H
Haojun Liao 已提交
5515
/**
 * Mark the reader as cancelled by storing TSDB_CODE_TSC_QUERY_CANCELLED in its
 * code field.
 *
 * @param pReader  reader to flag
 */
void tsdbReaderSetCloseFlag(STsdbReader* pReader) {
  pReader->code = TSDB_CODE_TSC_QUERY_CANCELLED;
}
5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570

/*-------------todo: refactor the implementation of the APIs in this file to separate them into two files------*/
// opt perf, do NOT create so many readers
/**
 * Find the largest timestamp stored in any of the given tables.
 *
 * One reader is opened per table, in descending order over the full time
 * range, and the first timestamp of the first returned block is taken as that
 * table's last key.
 *
 * @param pVnode       vnode to read from
 * @param pTableList   array of STableKeyInfo with numOfTables entries
 * @param numOfTables  number of entries in pTableList
 * @param pIdStr       id string passed on to every reader
 * @return the largest timestamp found, or INT64_MIN when no table has data;
 *         TSDB_CODE_OUT_OF_MEMORY if a buffer cannot be allocated, or the error
 *         code of tsdbReaderOpen if a reader cannot be opened
 */
int64_t tsdbGetLastTimestamp(SVnode* pVnode, void* pTableList, int32_t numOfTables, const char* pIdStr) {
  SQueryTableDataCond cond = {.type = TIMEWINDOW_RANGE_CONTAINED, .numOfCols = 1, .order = TSDB_ORDER_DESC,
                              .startVersion = -1, .endVersion = -1};
  cond.twindows.skey = INT64_MIN;
  cond.twindows.ekey = INT64_MAX;

  STableKeyInfo* pTableKeyInfo = pTableList;
  SSDataBlock*   pBlock = NULL;
  int64_t        key = INT64_MIN;
  int64_t        result = INT64_MIN;

  cond.colList = taosMemoryCalloc(1, sizeof(SColumnInfo));
  cond.pSlotList = taosMemoryMalloc(sizeof(int32_t) * cond.numOfCols);
  if (cond.colList == NULL || cond.pSlotList == NULL) {
    result = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }

  // the only column that is read is the primary timestamp column
  cond.colList[0].colId = 1;
  cond.colList[0].slotId = 0;
  cond.colList[0].type = TSDB_DATA_TYPE_TIMESTAMP;

  cond.pSlotList[0] = 0;

  pBlock = createDataBlock();
  if (pBlock == NULL) {
    result = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }

  SColumnInfoData data = {0};
  data.info = (SColumnInfo) {.type = TSDB_DATA_TYPE_TIMESTAMP, .colId = 1, .bytes = TSDB_KEYSIZE};
  blockDataAppendColInfo(pBlock, &data);

  for (int32_t i = 0; i < numOfTables; ++i) {
    STsdbReader* pReader = NULL;

    int32_t code = tsdbReaderOpen(pVnode, &cond, &pTableKeyInfo[i], 1, pBlock, &pReader, pIdStr, false, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      result = code;
      goto _end;
    }

    bool hasData = false;
    code = tsdbNextDataBlock(pReader, &hasData);
    if (code == TSDB_CODE_SUCCESS && hasData) {
      SColumnInfoData* pCol = taosArrayGet(pBlock->pDataBlock, 0);
      int64_t          k = *(int64_t*)pCol->pData;

      if (key < k) {
        key = k;
      }
    }

    // always close the reader, including for tables that yielded no data
    tsdbReaderClose(pReader);
  }

  result = key;

_end:
  // cond and pBlock are reused for every reader above, so they are treated as owned by this function.
  // NOTE(review): blockDataDestroy is assumed to be the counterpart of createDataBlock - confirm
  if (cond.colList != NULL) taosMemoryFree(cond.colList);
  if (cond.pSlotList != NULL) taosMemoryFree(cond.pSlotList);
  if (pBlock != NULL) blockDataDestroy(pBlock);

  return result;
}