tsdbRead.c 184.7 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Haojun Liao 已提交
16
#include "osDef.h"
H
Hongze Cheng 已提交
17
#include "tsdb.h"
18
#include "tsimplehash.h"
19

H
Hongze Cheng 已提交
20
// True when the scan order `o` equals TSDB_ORDER_ASC. The argument is parenthesized so that
// compound expressions (e.g. `a & b`) are compared as a whole.
#define ASCENDING_TRAVERSE(o) ((o) == TSDB_ORDER_ASC)
21
// Expands to the currentKey field of the reader that `_r` points to.
#define getCurrentKeyInLastBlock(_r) ((_r)->currentKey)
H
Hongze Cheng 已提交
22

H
Haojun Liao 已提交
23
// Reader status values; stored in STsdbReader::flag.
typedef enum {
  READER_STATUS_SUSPEND = 0x1,
  READER_STATUS_NORMAL = 0x2,
} EReaderStatus;
H
Hongze Cheng 已提交
27

28 29 30 31 32 33
// Identifies which set of rows is being produced; stored in STsdbReader::step.
// NOTE(review): PREV/NEXT presumably denote the rows adjacent to the main query window — confirm.
typedef enum {
  EXTERNAL_ROWS_PREV = 0x1,
  EXTERNAL_ROWS_MAIN = 0x2,
  EXTERNAL_ROWS_NEXT = 0x3,
} EContentData;

D
dapan1121 已提交
34 35 36 37 38
// Read mode of a reader; stored in STsdbReader::readMode.
typedef enum {
  READ_MODE_COUNT_ONLY = 0x1,
  READ_MODE_ALL,  // implicitly 0x2
} EReadMode;

39
typedef struct {
dengyihao's avatar
dengyihao 已提交
40
  STbDataIter* iter;
41 42 43 44
  int32_t      index;
  bool         hasVal;
} SIterInfo;

45 46
// Pair of counters: number of data blocks and number of last files.
typedef struct {
  int32_t numOfBlocks;
  int32_t numOfLastFiles;
} SBlockNumber;

50
typedef struct SBlockIndex {
51 52
  int32_t     ordinalIndex;
  int64_t     inFileOffset;
H
Haojun Liao 已提交
53
  STimeWindow window;  // todo replace it with overlap flag.
54 55
} SBlockIndex;

H
Haojun Liao 已提交
56
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
57 58
  uint64_t  uid;
  TSKEY     lastKey;
59
  TSKEY     lastKeyInStt;       // last accessed key in stt
H
Hongze Cheng 已提交
60
  SMapData  mapData;            // block info (compressed)
61
  SArray*   pBlockList;         // block data index list, SArray<SBlockIndex>
H
Hongze Cheng 已提交
62 63 64 65 66 67
  SIterInfo iter;               // mem buffer skip list iterator
  SIterInfo iiter;              // imem buffer skip list iterator
  SArray*   delSkyline;         // delete info for this table
  int32_t   fileDelIndex;       // file block delete index
  int32_t   lastBlockDelIndex;  // delete index for last block
  bool      iterInit;           // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
68 69 70
} STableBlockScanInfo;

typedef struct SBlockOrderWrapper {
dengyihao's avatar
dengyihao 已提交
71
  int64_t uid;
72
  int64_t offset;
H
Haojun Liao 已提交
73
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
74 75

typedef struct SBlockOrderSupporter {
76 77 78 79
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
80 81 82
} SBlockOrderSupporter;

typedef struct SIOCostSummary {
83 84 85
  int64_t numOfBlocks;
  double  blockLoadTime;
  double  buildmemBlock;
86
  int64_t headFileLoad;
87
  double  headFileLoadTime;
88
  int64_t smaDataLoad;
89
  double  smaLoadTime;
90 91
  int64_t lastBlockLoad;
  double  lastBlockLoadTime;
H
Haojun Liao 已提交
92 93
  int64_t composedBlocks;
  double  buildComposedBlockTime;
H
Haojun Liao 已提交
94
  double  createScanInfoList;
X
Xiaoyu Wang 已提交
95 96 97
  //  double  getTbFromMemTime;
  //  double  getTbFromIMemTime;
  double initDelSkylineIterTime;
H
Hongze Cheng 已提交
98 99 100
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
101 102 103 104 105 106 107
  SArray*        pColAgg;
  SColumnDataAgg tsColAgg;
  int16_t*       colId;
  int16_t*       slotId;
  int32_t        numOfCols;
  char**         buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
  bool           smaValid;  // the sma on all queried columns are activated
H
Hongze Cheng 已提交
108 109
} SBlockLoadSuppInfo;

110
typedef struct SLastBlockReader {
H
Hongze Cheng 已提交
111 112 113 114 115
  STimeWindow        window;
  SVersionRange      verRange;
  int32_t            order;
  uint64_t           uid;
  SMergeTree         mergeTree;
116
  SSttBlockLoadInfo* pInfo;
117
  int64_t            currentKey;
118 119
} SLastBlockReader;

120
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
121 122 123
  int32_t           numOfFiles;  // number of total files
  int32_t           index;       // current accessed index in the list
  SArray*           pFileList;   // data file list
124
  int32_t           order;
H
Hongze Cheng 已提交
125
  SLastBlockReader* pLastBlockReader;  // last file block reader
126
} SFilesetIter;
H
Haojun Liao 已提交
127 128

typedef struct SFileDataBlockInfo {
129
  // index position in STableBlockScanInfo in order to check whether neighbor block overlaps with it
dengyihao's avatar
dengyihao 已提交
130
  uint64_t uid;
131
  int32_t  tbBlockIdx;
H
Haojun Liao 已提交
132 133 134
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
H
Haojun Liao 已提交
135 136 137 138 139 140
  int32_t    numOfBlocks;
  int32_t    index;
  SArray*    blockList;  // SArray<SFileDataBlockInfo>
  int32_t    order;
  SDataBlk   block;  // current SDataBlk data
  SSHashObj* pTableMap;
H
Haojun Liao 已提交
141 142 143
} SDataBlockIter;

typedef struct SFileBlockDumpInfo {
dengyihao's avatar
dengyihao 已提交
144 145 146 147
  int32_t totalRows;
  int32_t rowIndex;
  int64_t lastKey;
  bool    allDumped;
H
Haojun Liao 已提交
148 149
} SFileBlockDumpInfo;

150
typedef struct STableUidList {
151 152
  uint64_t* tableUidList;  // access table uid list in uid ascending order list
  int32_t   currentIndex;  // index in table uid list
153
} STableUidList;
154

H
Haojun Liao 已提交
155
typedef struct SReaderStatus {
H
Hongze Cheng 已提交
156 157
  bool                  loadFromFile;       // check file stage
  bool                  composedDataBlock;  // the returned data block is a composed block or not
158
  bool                  mapDataCleaned;     // mapData has been cleaned up alreay or not
H
Haojun Liao 已提交
159
  SSHashObj*            pTableMap;          // SHash<STableBlockScanInfo>
160
  STableBlockScanInfo** pTableIter;         // table iterator used in building in-memory buffer data blocks.
161
  STableUidList         uidList;            // check tables in uid order, to avoid the repeatly load of blocks in STT.
H
Hongze Cheng 已提交
162 163 164 165 166
  SFileBlockDumpInfo    fBlockDumpInfo;
  SDFileSet*            pCurrentFileset;  // current opened file set
  SBlockData            fileBlockData;
  SFilesetIter          fileIter;
  SDataBlockIter        blockIter;
167
  SLDataIter*           pLDataIter;
H
Haojun Liao 已提交
168
  SRowMerger            merger;
169
  SColumnInfoData*      pPrimaryTsCol;      // primary time stamp output col info data
H
Haojun Liao 已提交
170 171
} SReaderStatus;

172
typedef struct SBlockInfoBuf {
H
Hongze Cheng 已提交
173 174 175
  int32_t currentIndex;
  SArray* pData;
  int32_t numPerBucket;
D
dapan1121 已提交
176
  int32_t numOfTables;
177 178
} SBlockInfoBuf;

H
Haojun Liao 已提交
179 180 181 182 183 184 185
typedef struct STsdbReaderAttr {
  STSchema*     pSchema;
  EReadMode     readMode;
  uint64_t      rowsNum;
  STimeWindow   window;
  bool          freeBlock;
  SVersionRange verRange;
H
Haojun Liao 已提交
186
  int16_t       order;
H
Haojun Liao 已提交
187 188
} STsdbReaderAttr;

189 190 191 192 193 194
// Result block of a reader together with its ownership flag and row capacity.
typedef struct SResultBlockInfo {
  SSDataBlock* pResBlock;
  bool         freeBlock;  // set to true when the block was created by initResBlockInfo rather than supplied
  int64_t      capacity;
} SResultBlockInfo;

H
Hongze Cheng 已提交
195
struct STsdbReader {
H
Haojun Liao 已提交
196
  STsdb*             pTsdb;
197 198
  SVersionRange      verRange;
  TdThreadMutex      readerMutex;
H
Haojun Liao 已提交
199 200
  EReaderStatus      flag;
  int32_t            code;
H
Haojun Liao 已提交
201 202
  uint64_t           suid;
  int16_t            order;
D
dapan1121 已提交
203 204
  EReadMode          readMode;
  uint64_t           rowsNum;
H
Haojun Liao 已提交
205
  STimeWindow        window;  // the primary query time window that applies to all queries
206
  SResultBlockInfo   resBlockInfo;
H
Haojun Liao 已提交
207
  SReaderStatus      status;
208 209
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
210
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
211
  STsdbReadSnap*     pReadSnap;
212
  SIOCostSummary     cost;
213
  SHashObj**         pIgnoreTables;
H
Haojun Liao 已提交
214 215 216 217 218 219
  STSchema*          pSchema;      // the newest version schema
  SSHashObj*         pSchemaMap;   // keep the retrieved schema info, to avoid the overhead by repeatly load schema
  SDataFReader*      pFileReader;  // the file reader
  SDelFReader*       pDelFReader;  // the del file reader
  SArray*            pDelIdx;      // del file block index;
  SBlockInfoBuf      blockInfoBuf;
220
  EContentData       step;
H
Haojun Liao 已提交
221
  STsdbReader*       innerReader[2];
H
Hongze Cheng 已提交
222
};
H
Hongze Cheng 已提交
223

H
Haojun Liao 已提交
224
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
225 226
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
227
static TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
H
Haojun Liao 已提交
228
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
H
Hongze Cheng 已提交
229
static int32_t  doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
230
                                       SRowMerger* pMerger, SVersionRange* pVerRange, const char* id);
H
Haojun Liao 已提交
231
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, STsdbReader* pReader);
H
Hongze Cheng 已提交
232
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow,
H
Haojun Liao 已提交
233
                                     STableBlockScanInfo* pScanInfo);
234
static int32_t  doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
H
Hongze Cheng 已提交
235
                                         int32_t rowIndex);
236
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
237
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
H
Hongze Cheng 已提交
238
                               SVersionRange* pVerRange);
239

H
Hongze Cheng 已提交
240
static int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
H
Haojun Liao 已提交
241
                                        TSDBROW* pResRow, STsdbReader* pReader, bool* freeTSRow);
H
Hongze Cheng 已提交
242
static int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo,
H
Hongze Cheng 已提交
243
                                  STsdbReader* pReader, SRow** pTSRow);
244 245
static int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                                     STsdbReader* pReader);
246

dengyihao's avatar
dengyihao 已提交
247 248 249 250
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
251
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Hongze Cheng 已提交
252 253
static bool          hasDataInLastBlock(SLastBlockReader* pLastBlockReader);
static int32_t       doBuildDataBlock(STsdbReader* pReader);
C
Cary Xu 已提交
254
static TSDBKEY       getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
255
static bool          hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo);
256
static void          initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter);
257
static int32_t       getInitialDelIndex(const SArray* pDelSkyline, int32_t order);
C
Cary Xu 已提交
258

H
Haojun Liao 已提交
259
static STableBlockScanInfo* getTableBlockScanInfo(SSHashObj* pTableMap, uint64_t uid, const char* id);
260

C
Cary Xu 已提交
261
// Returns true when ts lies outside the closed range [pWindow->skey, pWindow->ekey].
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) {
  bool inside = (ts >= pWindow->skey) && (ts <= pWindow->ekey);
  return !inside;
}
H
Haojun Liao 已提交
262

263 264
/*
 * Fill pSupInfo with the column ids, slot ids and per-column build buffers of the queried columns.
 *
 * colId, slotId and buildBuf share a single allocation laid out as: numOfCols int16_t column ids,
 * numOfCols int16_t slot ids, numOfCols char* entries. buildBuf[i] receives a buffer of
 * pCols[i].bytes bytes for variable-length columns and is NULL for all other columns.
 *
 * Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when an allocation fails. When a build
 * buffer cannot be allocated, all not-yet-allocated buildBuf entries are set to NULL so that
 * pSupInfo can be released exactly like a fully initialized one.
 *
 * NOTE(review): buildBuf starts 4 * numOfCols bytes into the allocation, which is not
 * pointer-aligned when numOfCols is odd — confirm this is acceptable on all target platforms.
 */
static int32_t setColumnIdSlotList(SBlockLoadSuppInfo* pSupInfo, SColumnInfo* pCols, const int32_t* pSlotIdList,
                                   int32_t numOfCols) {
  pSupInfo->smaValid = true;
  pSupInfo->numOfCols = numOfCols;
  pSupInfo->colId = taosMemoryMalloc(numOfCols * (sizeof(int16_t) * 2 + POINTER_BYTES));
  if (pSupInfo->colId == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  pSupInfo->slotId = (int16_t*)((char*)pSupInfo->colId + (sizeof(int16_t) * numOfCols));
  pSupInfo->buildBuf = (char**)((char*)pSupInfo->slotId + (sizeof(int16_t) * numOfCols));
  for (int32_t i = 0; i < numOfCols; ++i) {
    pSupInfo->colId[i] = pCols[i].colId;
    pSupInfo->slotId[i] = pSlotIdList[i];
    pSupInfo->buildBuf[i] = NULL;

    if (!IS_VAR_DATA_TYPE(pCols[i].type)) {
      continue;
    }

    pSupInfo->buildBuf[i] = taosMemoryMalloc(pCols[i].bytes);
    if (pSupInfo->buildBuf[i] == NULL) {
      // leave no uninitialized pointers behind, the owner releases what has been allocated so far
      for (int32_t j = i + 1; j < numOfCols; ++j) {
        pSupInfo->buildBuf[j] = NULL;
      }
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
288

H
Haojun Liao 已提交
289
/*
 * Walk the schema columns and the queried column ids in parallel (both by increasing index) and
 * clear pSupInfo->smaValid as soon as a queried column does not have block SMA enabled.
 *
 * Returns TSDB_CODE_INVALID_PARA when a schema column id is larger than the current queried column
 * id, TSDB_CODE_SUCCESS otherwise.
 */
static int32_t updateBlockSMAInfo(STSchema* pSchema, SBlockLoadSuppInfo* pSupInfo) {
  int32_t schemaIdx = 0;
  int32_t queryIdx = 0;

  while (schemaIdx < pSchema->numOfCols && queryIdx < pSupInfo->numOfCols) {
    STColumn* pTCol = &pSchema->columns[schemaIdx];

    if (pTCol->colId > pSupInfo->colId[queryIdx]) {
      return TSDB_CODE_INVALID_PARA;
    }

    if (pTCol->colId < pSupInfo->colId[queryIdx]) {
      // schema column is not queried, skip it
      schemaIdx += 1;
      continue;
    }

    // column ids match
    if (!IS_BSMA_ON(pTCol)) {
      pSupInfo->smaValid = false;
      return TSDB_CODE_SUCCESS;
    }

    schemaIdx += 1;
    queryIdx += 1;
  }

  return TSDB_CODE_SUCCESS;
}

313
/*
 * Allocate zero-initialized STableBlockScanInfo storage for numOfTables tables.
 *
 * Storage is split into numOfTables / numPerBucket buckets of numPerBucket entries, followed by one
 * smaller bucket for the remainder (if any). The bucket pointers are appended to pBuf->pData, which
 * is created on demand.
 *
 * Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when the bucket array or a bucket cannot be
 * allocated. Buckets that were already appended stay in pBuf->pData.
 */
static int32_t initBlockScanInfoBuf(SBlockInfoBuf* pBuf, int32_t numOfTables) {
  int32_t num = numOfTables / pBuf->numPerBucket;
  int32_t remainder = numOfTables % pBuf->numPerBucket;
  if (pBuf->pData == NULL) {
    pBuf->pData = taosArrayInit(num + 1, POINTER_BYTES);
    if (pBuf->pData == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  for (int32_t i = 0; i < num; ++i) {
    char* p = taosMemoryCalloc(pBuf->numPerBucket, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);  // not owned by the array, release it here
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  if (remainder > 0) {
    char* p = taosMemoryCalloc(remainder, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  pBuf->numOfTables = numOfTables;

  return TSDB_CODE_SUCCESS;
}

/*
 * Grow the bucketed STableBlockScanInfo storage so that it can hold numOfTables tables.
 * Nothing is done when the buffer is already large enough.
 *
 * The last bucket (the only one that may be smaller than numPerBucket) is dropped first; the
 * remaining buckets are all full, so the retained capacity is <bucket count> * numPerBucket.
 * New full buckets plus one remainder bucket are then appended for the missing entries.
 *
 * Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when an allocation fails.
 */
static int32_t ensureBlockScanInfoBuf(SBlockInfoBuf* pBuf, int32_t numOfTables) {
  if (numOfTables <= pBuf->numOfTables) {
    return TSDB_CODE_SUCCESS;
  }

  if (pBuf->numOfTables > 0) {
    STableBlockScanInfo** p = (STableBlockScanInfo**)taosArrayPop(pBuf->pData);
    taosMemoryFree(*p);

    // capacity that is still backed by buckets, counted in tables (not in buckets)
    pBuf->numOfTables = (int32_t)taosArrayGetSize(pBuf->pData) * pBuf->numPerBucket;
  }

  int32_t num = (numOfTables - pBuf->numOfTables) / pBuf->numPerBucket;
  int32_t remainder = (numOfTables - pBuf->numOfTables) % pBuf->numPerBucket;
  if (pBuf->pData == NULL) {
    pBuf->pData = taosArrayInit(num + 1, POINTER_BYTES);
    if (pBuf->pData == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  for (int32_t i = 0; i < num; ++i) {
    char* p = taosMemoryCalloc(pBuf->numPerBucket, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);  // not owned by the array, release it here
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  if (remainder > 0) {
    char* p = taosMemoryCalloc(remainder, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  pBuf->numOfTables = numOfTables;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
380

381 382
// Free every bucket referenced by pBuf->pData, then destroy the bucket array itself.
// pBuf->pData and pBuf->numOfTables are left untouched.
static void clearBlockScanInfoBuf(SBlockInfoBuf* pBuf) {
  size_t  numOfBuckets = taosArrayGetSize(pBuf->pData);
  int32_t idx = 0;

  while (idx < numOfBuckets) {
    char** ppBucket = taosArrayGet(pBuf->pData, idx);
    taosMemoryFree(*ppBucket);
    ++idx;
  }

  taosArrayDestroy(pBuf->pData);
}

// Return the address of the index-th STableBlockScanInfo slot: bucket index / numPerBucket,
// entry index % numPerBucket inside that bucket.
static void* getPosInBlockInfoBuf(SBlockInfoBuf* pBuf, int32_t index) {
  int32_t bucketIndex = index / pBuf->numPerBucket;
  int32_t slotInBucket = index % pBuf->numPerBucket;

  char** ppBucket = taosArrayGet(pBuf->pData, bucketIndex);
  return (*ppBucket) + slotInBucket * sizeof(STableBlockScanInfo);
}

H
Haojun Liao 已提交
397 398 399 400 401 402 403 404 405 406
// Three-way comparison of two uint64_t values passed by address: -1, 0 or 1.
static int32_t uidComparFunc(const void* p1, const void* p2) {
  const uint64_t lhs = *(uint64_t*)p1;
  const uint64_t rhs = *(uint64_t*)p2;

  if (lhs < rhs) {
    return -1;
  }

  return (lhs > rhs) ? 1 : 0;
}

407
// NOTE: speedup the whole processing by preparing the buffer for STableBlockScanInfo in batch model
H
Haojun Liao 已提交
408
static SSHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, SBlockInfoBuf* pBuf, const STableKeyInfo* idList,
X
Xiaoyu Wang 已提交
409
                                         STableUidList* pUidList, int32_t numOfTables) {
H
Haojun Liao 已提交
410
  // allocate buffer in order to load data blocks from file
411
  // todo use simple hash instead, optimize the memory consumption
H
Haojun Liao 已提交
412
  SSHashObj* pTableMap = tSimpleHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT));
413
  if (pTableMap == NULL) {
H
Haojun Liao 已提交
414 415 416
    return NULL;
  }

H
Haojun Liao 已提交
417
  int64_t st = taosGetTimestampUs();
H
Haojun Liao 已提交
418
  initBlockScanInfoBuf(pBuf, numOfTables);
H
Haojun Liao 已提交
419

H
Haojun Liao 已提交
420 421
  pUidList->tableUidList = taosMemoryMalloc(numOfTables * sizeof(uint64_t));
  if (pUidList->tableUidList == NULL) {
H
Haojun Liao 已提交
422
    tSimpleHashCleanup(pTableMap);
H
Haojun Liao 已提交
423 424
    return NULL;
  }
H
Haojun Liao 已提交
425

H
Haojun Liao 已提交
426
  pUidList->currentIndex = 0;
H
Haojun Liao 已提交
427

428
  for (int32_t j = 0; j < numOfTables; ++j) {
H
Haojun Liao 已提交
429
    STableBlockScanInfo* pScanInfo = getPosInBlockInfoBuf(pBuf, j);
H
Haojun Liao 已提交
430

431
    pScanInfo->uid = idList[j].uid;
H
Haojun Liao 已提交
432
    pUidList->tableUidList[j] = idList[j].uid;
H
Haojun Liao 已提交
433

434
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
H
Haojun Liao 已提交
435
      int64_t skey = pTsdbReader->window.skey;
436
      pScanInfo->lastKey = (skey > INT64_MIN) ? (skey - 1) : skey;
H
Haojun Liao 已提交
437
      pScanInfo->lastKeyInStt = skey;
wmmhello's avatar
wmmhello 已提交
438
    } else {
H
Haojun Liao 已提交
439
      int64_t ekey = pTsdbReader->window.ekey;
440
      pScanInfo->lastKey = (ekey < INT64_MAX) ? (ekey + 1) : ekey;
H
Haojun Liao 已提交
441
      pScanInfo->lastKeyInStt = ekey;
H
Haojun Liao 已提交
442
    }
wmmhello's avatar
wmmhello 已提交
443

H
Haojun Liao 已提交
444
    tSimpleHashPut(pTableMap, &pScanInfo->uid, sizeof(uint64_t), &pScanInfo, POINTER_BYTES);
H
Hongze Cheng 已提交
445 446
    tsdbTrace("%p check table uid:%" PRId64 " from lastKey:%" PRId64 " %s", pTsdbReader, pScanInfo->uid,
              pScanInfo->lastKey, pTsdbReader->idStr);
H
Haojun Liao 已提交
447 448
  }

H
Haojun Liao 已提交
449
  taosSort(pUidList->tableUidList, numOfTables, sizeof(uint64_t), uidComparFunc);
H
Haojun Liao 已提交
450

H
Haojun Liao 已提交
451 452 453 454
  pTsdbReader->cost.createScanInfoList = (taosGetTimestampUs() - st) / 1000.0;
  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, elapsed time:%.2f ms, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->cost.createScanInfoList,
            pTsdbReader->idStr);
455

456
  return pTableMap;
H
Hongze Cheng 已提交
457
}
H
Hongze Cheng 已提交
458

H
Haojun Liao 已提交
459 460 461 462 463
// Reset every table scan info in pTableMap for a new scan: the in-memory iterators and the delete
// skyline are destroyed, lastKey is set to ts and lastKeyInStt to ts + step.
static void resetAllDataBlockScanInfo(SSHashObj* pTableMap, int64_t ts, int32_t step) {
  const int64_t sttKey = ts + step;
  int32_t       iter = 0;

  for (void* pEntry = tSimpleHashIterate(pTableMap, NULL, &iter); pEntry != NULL;
       pEntry = tSimpleHashIterate(pTableMap, pEntry, &iter)) {
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pEntry;

    pScanInfo->iterInit = false;
    pScanInfo->iter.hasVal = false;
    pScanInfo->iiter.hasVal = false;

    if (pScanInfo->iter.iter != NULL) {
      pScanInfo->iter.iter = tsdbTbDataIterDestroy(pScanInfo->iter.iter);
    }

    if (pScanInfo->iiter.iter != NULL) {
      pScanInfo->iiter.iter = tsdbTbDataIterDestroy(pScanInfo->iiter.iter);
    }

    pScanInfo->delSkyline = taosArrayDestroy(pScanInfo->delSkyline);
    pScanInfo->lastKey = ts;
    pScanInfo->lastKeyInStt = sttKey;
  }
}

484 485
// Release everything owned by one table scan info: both in-memory iterators, the delete skyline,
// the block index list and the block map data. The entry itself is not freed.
static void clearBlockScanInfo(STableBlockScanInfo* p) {
  SIterInfo* pMemIter = &p->iter;
  SIterInfo* pIMemIter = &p->iiter;

  p->iterInit = false;
  pMemIter->hasVal = false;
  pIMemIter->hasVal = false;

  if (pMemIter->iter != NULL) {
    pMemIter->iter = tsdbTbDataIterDestroy(pMemIter->iter);
  }

  if (pIMemIter->iter != NULL) {
    pIMemIter->iter = tsdbTbDataIterDestroy(pIMemIter->iter);
  }

  p->delSkyline = taosArrayDestroy(p->delSkyline);
  p->pBlockList = taosArrayDestroy(p->pBlockList);
  tMapDataClear(&p->mapData);
}
502

H
Haojun Liao 已提交
503
// Clear every table scan info referenced by pTableMap, then destroy the map.
static void destroyAllBlockScanInfo(SSHashObj* pTableMap) {
  int32_t iter = 0;

  for (void* pEntry = tSimpleHashIterate(pTableMap, NULL, &iter); pEntry != NULL;
       pEntry = tSimpleHashIterate(pTableMap, pEntry, &iter)) {
    clearBlockScanInfo(*(STableBlockScanInfo**)pEntry);
  }

  tSimpleHashCleanup(pTableMap);
}

514
// A time window is empty when its start key is greater than its end key.
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) {
  return !(pWindow->skey <= pWindow->ekey);
}
H
Hongze Cheng 已提交
515

516 517 518
// Adjust the query time window according to the data retention (keep) configuration, so that
// expired data is never returned to the client: the start key is raised to the earliest timestamp
// that is still kept. The end key is returned unchanged.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
  STsdbKeepCfg* pKeepCfg = &pTsdb->keepCfg;

  int64_t nowTs = taosGetTimestamp(pKeepCfg->precision);
  int64_t earliestTs = nowTs - (tsTickPerMin[pKeepCfg->precision] * pKeepCfg->keep2) + 1;  // needs to add one tick

  STimeWindow adjusted = *pWindow;
  adjusted.skey = (adjusted.skey < earliestTs) ? earliestTs : adjusted.skey;

  return adjusted;
}
H
Hongze Cheng 已提交
531

H
Haojun Liao 已提交
532
// init file iterator
533
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet, STsdbReader* pReader) {
H
Hongze Cheng 已提交
534
  size_t numOfFileset = taosArrayGetSize(aDFileSet);
535

536 537
  pIter->index = ASCENDING_TRAVERSE(pReader->order) ? -1 : numOfFileset;
  pIter->order = pReader->order;
H
Hongze Cheng 已提交
538
  pIter->pFileList = aDFileSet;
539
  pIter->numOfFiles = numOfFileset;
H
Haojun Liao 已提交
540

541 542 543 544
  if (pIter->pLastBlockReader == NULL) {
    pIter->pLastBlockReader = taosMemoryCalloc(1, sizeof(struct SLastBlockReader));
    if (pIter->pLastBlockReader == NULL) {
      int32_t code = TSDB_CODE_OUT_OF_MEMORY;
545
      tsdbError("failed to prepare the last block iterator, since:%s %s", tstrerror(code), pReader->idStr);
546 547
      return code;
    }
548 549
  }

550 551 552 553 554 555 556 557
  SLastBlockReader* pLReader = pIter->pLastBlockReader;
  pLReader->order = pReader->order;
  pLReader->window = pReader->window;
  pLReader->verRange = pReader->verRange;

  pLReader->uid = 0;
  tMergeTreeClose(&pLReader->mergeTree);

558
  if (pLReader->pInfo == NULL) {
559
    // here we ignore the first column, which is always be the primary timestamp column
560 561 562
    SBlockLoadSuppInfo* pInfo = &pReader->suppInfo;

    int32_t numOfStt = pReader->pTsdb->pVnode->config.sttTrigger;
X
Xiaoyu Wang 已提交
563
    pLReader->pInfo = tCreateLastBlockLoadInfo(pReader->pSchema, &pInfo->colId[1], pInfo->numOfCols - 1, numOfStt);
H
Haojun Liao 已提交
564 565 566 567
    if (pLReader->pInfo == NULL) {
      tsdbDebug("init fileset iterator failed, code:%s %s", tstrerror(terrno), pReader->idStr);
      return terrno;
    }
568 569
  }

570
  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
H
Haojun Liao 已提交
571 572 573
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
574
/*
 * Advance the fileset iterator to the next file set that overlaps the query time window.
 *
 * *hasNext is set to true when such a file set has been opened (pReader->pFileReader and
 * pReader->status.pCurrentFileset refer to it), and to false when the list is exhausted, when the
 * remaining files lie beyond the query window in scan direction, or when opening a file fails.
 *
 * Returns TSDB_CODE_SUCCESS, or the error code of tsdbDataFReaderOpen.
 */
static int32_t filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader, bool* hasNext) {
  const bool    asc = ASCENDING_TRAVERSE(pIter->order);
  const int32_t step = asc ? 1 : -1;
  int32_t       code = 0;

  pIter->index += step;
  if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
    *hasNext = false;
    return TSDB_CODE_SUCCESS;
  }

  // collect the last-block load statistics before the load info is reset for the next file set
  SIOCostSummary*   pSum = &pReader->cost;
  SLastBlockReader* pLReader = pIter->pLastBlockReader;
  getLastBlockLoadInfo(pLReader->pInfo, &pSum->lastBlockLoad, &pSum->lastBlockLoadTime);

  pLReader->uid = 0;
  tMergeTreeClose(&pLReader->mergeTree);
  resetLastBlockLoadInfo(pLReader->pInfo);

  // check file the time range of coverage
  STimeWindow win = {0};

  for (;;) {
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }

    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);

    code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      *hasNext = false;
      return code;
    }

    pReader->cost.headFileLoad += 1;

    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);

    // current file are no longer overlapped with query time window, ignore remain files
    bool beyondQueryRange = asc ? (win.skey > pReader->window.ekey) : (win.ekey < pReader->window.skey);
    if (beyondQueryRange) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      *hasNext = false;
      return TSDB_CODE_SUCCESS;
    }

    // the file has not reached the query window yet (in scan direction), try the next one
    bool notReachedYet = asc ? (win.ekey < pReader->window.skey) : (win.skey > pReader->window.ekey);
    if (!notReachedYet) {
      tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
                pReader->window.ekey, pReader->idStr);
      *hasNext = true;
      return TSDB_CODE_SUCCESS;
    }

    pIter->index += step;
    if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
      *hasNext = false;
      return TSDB_CODE_SUCCESS;
    }
  }
}

640
// Reset the data block iterator for a new pass in the given order: no blocks, cursor at -1.
// The block list is created on first use and emptied on subsequent calls.
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
  pIter->numOfBlocks = 0;
  pIter->index = -1;
  pIter->order = order;

  if (pIter->blockList != NULL) {
    taosArrayClear(pIter->blockList);
    return;
  }

  pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
}

L
Liu Jicong 已提交
651
// Destroy the block list of the data block iterator.
static void cleanupDataBlockIterator(SDataBlockIter* pIter) {
  taosArrayDestroy(pIter->blockList);
}
H
Haojun Liao 已提交
652

H
Haojun Liao 已提交
653
// Put the reader status into its initial state: load from file, no table iterator yet.
static void initReaderStatus(SReaderStatus* pStatus) {
  pStatus->loadFromFile = true;
  pStatus->pTableIter = NULL;
}

658 659 660 661 662 663 664 665
/*
 * Create a result data block with one column per entry of pCond->colList and room for `capacity`
 * rows.
 *
 * Returns the new block, or NULL with terrno set (TSDB_CODE_OUT_OF_MEMORY when the block cannot be
 * created, otherwise the code reported by blockDataEnsureCapacity).
 */
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pResBlock = createDataBlock();
  if (pResBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    SColumnInfoData colInfo = {0};
    colInfo.info = pCond->colList[i];
    blockDataAppendColInfo(pResBlock, &colInfo);
  }

  int32_t code = blockDataEnsureCapacity(pResBlock, capacity);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    // the block already owns its column array (and possibly column buffers); release all of it,
    // not just the block header
    blockDataDestroy(pResBlock);
    return NULL;
  }
  return pResBlock;
}

680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
// Initialize the reader mutex, tracing before and after. Returns the result of taosThreadMutexInit.
static int32_t tsdbInitReaderLock(STsdbReader* pReader) {
  TdThreadMutex* pMutex = &pReader->readerMutex;
  int32_t        rc = -1;

  qTrace("tsdb/read: %p, pre-init read mutex: %p, code: %d", pReader, pMutex, rc);
  rc = taosThreadMutexInit(pMutex, NULL);
  qTrace("tsdb/read: %p, post-init read mutex: %p, code: %d", pReader, pMutex, rc);

  return rc;
}

// Destroy the reader mutex, tracing before and after. Returns the result of taosThreadMutexDestroy.
static int32_t tsdbUninitReaderLock(STsdbReader* pReader) {
  TdThreadMutex* pMutex = &pReader->readerMutex;
  int32_t        rc = -1;

  qTrace("tsdb/read: %p, pre-uninit read mutex: %p, code: %d", pReader, pMutex, rc);
  rc = taosThreadMutexDestroy(pMutex);
  qTrace("tsdb/read: %p, post-uninit read mutex: %p, code: %d", pReader, pMutex, rc);

  return rc;
}

// Lock the reader mutex, tracing before and after. Returns the result of taosThreadMutexLock.
static int32_t tsdbAcquireReader(STsdbReader* pReader) {
  TdThreadMutex* pMutex = &pReader->readerMutex;
  int32_t        rc = -1;

  qTrace("tsdb/read: %p, pre-take read mutex: %p, code: %d", pReader, pMutex, rc);
  rc = taosThreadMutexLock(pMutex);
  qTrace("tsdb/read: %p, post-take read mutex: %p, code: %d", pReader, pMutex, rc);

  return rc;
}

// Try to lock the reader mutex, tracing before and after.
// Returns the result of taosThreadMutexTryLock.
static int32_t tsdbTryAcquireReader(STsdbReader* pReader) {
  TdThreadMutex* pMutex = &pReader->readerMutex;
  int32_t        rc = -1;

  qTrace("tsdb/read: %p, pre-trytake read mutex: %p, code: %d", pReader, pMutex, rc);
  rc = taosThreadMutexTryLock(pMutex);
  qTrace("tsdb/read: %p, post-trytake read mutex: %p, code: %d", pReader, pMutex, rc);

  return rc;
}

// Unlock the reader mutex, tracing before and after. Returns the result of taosThreadMutexUnlock.
static int32_t tsdbReleaseReader(STsdbReader* pReader) {
  TdThreadMutex* pMutex = &pReader->readerMutex;
  int32_t        rc = -1;

  qTrace("tsdb/read: %p, pre-untake read mutex: %p, code: %d", pReader, pMutex, rc);
  rc = taosThreadMutexUnlock(pMutex);
  qTrace("tsdb/read: %p, post-untake read mutex: %p, code: %d", pReader, pMutex, rc);

  return rc;
}

735 736 737 738 739 740
// Releases the reader lock via tsdbReleaseReader(), except when
// status.composedDataBlock is set, in which case nothing is done.
void tsdbReleaseDataBlock(STsdbReader* pReader) {
  if (pReader->status.composedDataBlock) {
    return;
  }

  tsdbReleaseReader(pReader);
}
741

742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
// Fills in the result-block descriptor of a reader.
//   pResBlockInfo: descriptor to populate
//   capacity:      stored in the descriptor and passed on to createResBlock()
//   pResBlock:     caller-supplied result block, or NULL to have one created
//   pCond:         query condition forwarded to createResBlock()
// freeBlock records whether the block was created here (true) or supplied by
// the caller (false).
// Returns the value of terrno after the (optional) block creation; terrno is
// cleared to 0 beforehand.
static int32_t initResBlockInfo(SResultBlockInfo* pResBlockInfo, int64_t capacity, SSDataBlock* pResBlock, SQueryTableDataCond* pCond) {
  terrno = 0;

  pResBlockInfo->capacity = capacity;
  pResBlockInfo->freeBlock = (pResBlock == NULL);
  pResBlockInfo->pResBlock = (pResBlock != NULL) ? pResBlock : createResBlock(pCond, pResBlockInfo->capacity);

  return terrno;
}

757
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, void** ppReader, int32_t capacity,
H
Haojun Liao 已提交
758
                                SSDataBlock* pResBlock, const char* idstr) {
H
Haojun Liao 已提交
759
  int32_t      code = 0;
760
  int8_t       level = 0;
H
Haojun Liao 已提交
761
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
H
Hongze Cheng 已提交
762 763
  if (pReader == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
764
    goto _end;
H
Hongze Cheng 已提交
765 766
  }

C
Cary Xu 已提交
767
  if (VND_IS_TSMA(pVnode)) {
H
Haojun Liao 已提交
768
    tsdbDebug("vgId:%d, tsma is selected to query, %s", TD_VID(pVnode), idstr);
C
Cary Xu 已提交
769 770
  }

H
Haojun Liao 已提交
771
  initReaderStatus(&pReader->status);
772

L
Liu Jicong 已提交
773
  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
dengyihao's avatar
dengyihao 已提交
774 775
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
776

777
  pReader->idStr = (idstr != NULL) ? taosStrdup(idstr) : NULL;
dengyihao's avatar
dengyihao 已提交
778
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
779
  pReader->type = pCond->type;
780
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
H
Hongze Cheng 已提交
781
  pReader->blockInfoBuf.numPerBucket = 1000;  // 1000 tables per bucket
H
Hongze Cheng 已提交
782

783 784 785
  code = initResBlockInfo(&pReader->resBlockInfo, capacity, pResBlock, pCond);
  if (code != TSDB_CODE_SUCCESS) {
    goto _end;
H
Haojun Liao 已提交
786
  }
787

H
Haojun Liao 已提交
788 789 790 791 792
  if (pCond->numOfCols <= 0) {
    tsdbError("vgId:%d, invalid column number %d in query cond, %s", TD_VID(pVnode), pCond->numOfCols, idstr);
    code = TSDB_CODE_INVALID_PARA;
    goto _end;
  }
H
Hongze Cheng 已提交
793

794 795
  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
796
  pSup->pColAgg = taosArrayInit(pCond->numOfCols, sizeof(SColumnDataAgg));
H
Haojun Liao 已提交
797
  if (pSup->pColAgg == NULL) {
798 799 800
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }
H
Haojun Liao 已提交
801

802
  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
803
  setColumnIdSlotList(pSup, pCond->colList, pCond->pSlotList, pCond->numOfCols);
804

H
Hongze Cheng 已提交
805
  code = tBlockDataCreate(&pReader->status.fileBlockData);
H
Haojun Liao 已提交
806 807 808 809 810
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

811
  if (pReader->suppInfo.colId[0] != PRIMARYKEY_TIMESTAMP_COL_ID) {
812
    tsdbError("the first column isn't primary timestamp, %d, %s", pReader->suppInfo.colId[0], pReader->idStr);
K
kailixu 已提交
813
    code = TSDB_CODE_INVALID_PARA;
814 815 816
    goto _end;
  }

817
  pReader->status.pPrimaryTsCol = taosArrayGet(pReader->resBlockInfo.pResBlock->pDataBlock, pSup->slotId[0]);
818 819 820 821
  int32_t type = pReader->status.pPrimaryTsCol->info.type;
  if (type != TSDB_DATA_TYPE_TIMESTAMP) {
    tsdbError("the first column isn't primary timestamp in result block, actual: %s, %s", tDataTypes[type].name,
              pReader->idStr);
K
kailixu 已提交
822
    code = TSDB_CODE_INVALID_PARA;
823 824
    goto _end;
  }
825

826
  tsdbInitReaderLock(pReader);
827

H
Hongze Cheng 已提交
828 829
  *ppReader = pReader;
  return code;
H
Hongze Cheng 已提交
830

H
Haojun Liao 已提交
831 832
_end:
  tsdbReaderClose(pReader);
H
Hongze Cheng 已提交
833 834 835
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
836

H
Haojun Liao 已提交
837
// Collects, from the cached block-index array of the current file, the
// SBlockIdx entries that belong to the tables being queried.
//   pReader:     reader; supplies suid, the table map and the uid list
//   pFileReader: file reader whose block index is fetched through the biCache
//   pIndexList:  output array; every matching SBlockIdx is appended to it
// The block-index array and pReader->status.uidList are walked together with
// two cursors, advancing whichever side holds the smaller uid.
// NOTE(review): this walk presumes both sequences are ordered by uid - confirm.
// Returns TSDB_CODE_SUCCESS, the error from tsdbCacheGetBlockIdx(), terrno when
// a scan-info lookup fails, or TSDB_CODE_OUT_OF_MEMORY when an array cannot be
// created or grown. The cache handle is released on every path.
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
  int64_t    st = taosGetTimestampUs();
  LRUHandle* handle = NULL;
  int32_t    code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
  if (code != TSDB_CODE_SUCCESS || handle == NULL) {
    goto _end;
  }

  int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);

  SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
  size_t  num = taosArrayGetSize(aBlockIdx);
  if (num == 0) {
    tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
    return TSDB_CODE_SUCCESS;
  }

  // todo binary search to the start position
  int64_t et1 = taosGetTimestampUs();

  SBlockIdx*     pBlockIdx = NULL;
  STableUidList* pList = &pReader->status.uidList;

  int32_t i = 0, j = 0;
  while (i < num && j < numOfTables) {
    pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);
    if (pBlockIdx->suid != pReader->suid) {
      i += 1;
      continue;
    }

    // this block belongs to a table that is not queried.
    if (pBlockIdx->uid < pList->tableUidList[j]) {
      i += 1;
      continue;
    }

    // the queried table has no block index entry in this file.
    if (pBlockIdx->uid > pList->tableUidList[j]) {
      j += 1;
      continue;
    }

    // pBlockIdx->uid == pList->tableUidList[j]: this block belongs to a queried table.
    STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockIdx->uid, pReader->idStr);
    if (pScanInfo == NULL) {
      tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
      return terrno;
    }

    if (pScanInfo->pBlockList == NULL) {
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(SBlockIndex));
      if (pScanInfo->pBlockList == NULL) {
        tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }

    if (taosArrayPush(pIndexList, pBlockIdx) == NULL) {
      tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    i += 1;
    j += 1;
  }

  int64_t et2 = taosGetTimestampUs();
  tsdbDebug("load block index for %d/%d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
            numOfTables, (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0,
            pReader->idStr);

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

_end:
  tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
  return code;
}
H
Hongze Cheng 已提交
908

909 910 911 912 913 914 915 916 917 918 919 920
// Drops the per-file state of one table before a new file is handled: the
// list of selected blocks is emptied and the block map data is cleared.
static void doCleanupTableScanInfo(STableBlockScanInfo* pScanInfo) {
  taosArrayClear(pScanInfo->pBlockList);
  tMapDataClear(&pScanInfo->mapData);
}

// Runs doCleanupTableScanInfo() on every entry of pStatus->pTableMap, then
// sets pStatus->mapDataCleaned. Does nothing if that flag is already set.
static void cleanupTableScanInfo(SReaderStatus* pStatus) {
  if (pStatus->mapDataCleaned) {
    return;
  }

  SSHashObj*            pMap = pStatus->pTableMap;
  int32_t               cursor = 0;
  STableBlockScanInfo** ppInfo = NULL;

  for (ppInfo = tSimpleHashIterate(pMap, ppInfo, &cursor); ppInfo != NULL;
       ppInfo = tSimpleHashIterate(pMap, ppInfo, &cursor)) {
    doCleanupTableScanInfo(*ppInfo);
  }

  pStatus->mapDataCleaned = true;
}

936
// Loads the data-block list of every table in pIndexList from the current
// file and records, per table, the blocks that may hold qualifying rows.
//   pReader:            reader; supplies file reader, query window, version range
//   pIndexList:         SBlockIdx entries selected for this file
//   pBlockNum:          numOfBlocks is incremented per selected block,
//                       numOfLastFiles is set from the file set's nSttF
//   pTableScanInfoList: receives a pointer to each table scan info that ends
//                       up with at least one selected block
// A block is selected when it overlaps both the effective time window (the
// query window narrowed by the table's lastKey in scan direction) and the
// reader's version range.
// Returns TSDB_CODE_SUCCESS, terrno when a scan-info lookup fails, the error
// from tsdbReadDataBlk(), or TSDB_CODE_OUT_OF_MEMORY when an array cannot grow.
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, SBlockNumber* pBlockNum, SArray* pTableScanInfoList) {
  size_t  sizeInDisk = 0;
  size_t  numOfTables = taosArrayGetSize(pIndexList);

  int64_t st = taosGetTimestampUs();
  cleanupTableScanInfo(&pReader->status);

  // set the flag for the new file
  pReader->status.mapDataCleaned = false;
  for (int32_t i = 0; i < numOfTables; ++i) {
    SBlockIdx*           pBlockIdx = taosArrayGet(pIndexList, i);
    STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockIdx->uid, pReader->idStr);
    if (pScanInfo == NULL) {
      return terrno;
    }

    tMapDataReset(&pScanInfo->mapData);

    // a failed read must not be treated as "this table has no blocks"
    int32_t code = tsdbReadDataBlk(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("failed to read data block list, code:%s %s", tstrerror(code), pReader->idStr);
      return code;
    }

    taosArrayEnsureCap(pScanInfo->pBlockList, pScanInfo->mapData.nItem);

    sizeInDisk += pScanInfo->mapData.nData;

    // narrow the query window so that it starts right after the last key already returned
    int32_t     step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
    STimeWindow w = pReader->window;
    if (ASCENDING_TRAVERSE(pReader->order)) {
      w.skey = pScanInfo->lastKey + step;
    } else {
      w.ekey = pScanInfo->lastKey + step;
    }

    if (isEmptyQueryTimeWindow(&w)) {
      continue;
    }

    SDataBlk block = {0};
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      tGetDataBlk(pScanInfo->mapData.pData + pScanInfo->mapData.aOffset[j], &block);

      // 1. time range check
      if (block.minKey.ts > w.ekey || block.maxKey.ts < w.skey) {
        continue;
      }

      // 2. version range check
      if (block.minVer > pReader->verRange.maxVer || block.maxVer < pReader->verRange.minVer) {
        continue;
      }

      SBlockIndex bIndex = {.ordinalIndex = j, .inFileOffset = block.aSubBlock->offset};
      bIndex.window = (STimeWindow){.skey = block.minKey.ts, .ekey = block.maxKey.ts};

      void* p1 = taosArrayPush(pScanInfo->pBlockList, &bIndex);
      if (p1 == NULL) {
        tMapDataClear(&pScanInfo->mapData);
        return TSDB_CODE_OUT_OF_MEMORY;
      }

      pBlockNum->numOfBlocks += 1;
    }

    if (taosArrayGetSize(pScanInfo->pBlockList) > 0) {
      if (taosArrayPush(pTableScanInfoList, &pScanInfo) == NULL) {
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }
  }

  pBlockNum->numOfLastFiles = pReader->pFileReader->pSet->nSttF;
  int32_t total = pBlockNum->numOfLastFiles + pBlockNum->numOfBlocks;

  double el = (taosGetTimestampUs() - st) / 1000.0;
  tsdbDebug(
      "load block of %" PRId64 " tables completed, blocks:%d in %d tables, last-files:%d, block-info-size:%.2f Kb, "
      "elapsed time:%.2f ms %s",
      (int64_t)numOfTables, pBlockNum->numOfBlocks, (int32_t)taosArrayGetSize(pTableScanInfoList),
      pBlockNum->numOfLastFiles, sizeInDisk / 1000.0, el, pReader->idStr);

  pReader->cost.numOfBlocks += total;
  pReader->cost.headFileLoadTime += el;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1017

1018
// Marks the current file block as completely dumped and stores, as lastKey,
// the key one step past maxKey in scan direction (maxKey + 1 for ascending
// order, maxKey - 1 otherwise).
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, int64_t maxKey, int32_t order) {
  pDumpInfo->allDumped = true;

  if (ASCENDING_TRAVERSE(order)) {
    pDumpInfo->lastKey = maxKey + 1;
  } else {
    pDumpInfo->lastKey = maxKey - 1;
  }
}

D
dapan1121 已提交
1024
static int32_t doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
1025
                            SBlockLoadSuppInfo* pSup) {
H
Haojun Liao 已提交
1026
  if (IS_VAR_DATA_TYPE(pColVal->type)) {
H
Hongze Cheng 已提交
1027
    if (!COL_VAL_IS_VALUE(pColVal)) {
1028
      colDataSetNULL(pColInfoData, rowIndex);
H
Haojun Liao 已提交
1029 1030
    } else {
      varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
D
dapan1121 已提交
1031
      if (pColVal->value.nData > pColInfoData->info.bytes) {
1032 1033
        tsdbWarn("column cid:%d actual data len %d is bigger than schema len %d", pColVal->cid, pColVal->value.nData,
                 pColInfoData->info.bytes);
D
dapan1121 已提交
1034 1035
        return TSDB_CODE_TDB_INVALID_TABLE_SCHEMA_VER;
      }
1036 1037 1038 1039
      if (pColVal->value.nData > 0) {  // pData may be null, if nData is 0
        memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
      }

1040
      colDataSetVal(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
H
Haojun Liao 已提交
1041 1042
    }
  } else {
1043
    colDataSetVal(pColInfoData, rowIndex, (const char*)&pColVal->value, !COL_VAL_IS_VALUE(pColVal));
H
Haojun Liao 已提交
1044
  }
D
dapan1121 已提交
1045 1046

  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
1047 1048
}

1049
// Returns the block-info entry at pBlockIter->index of pBlockIter->blockList,
// or NULL when the list is empty (in which case numOfBlocks is asserted to
// be 0 as well).
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
  const size_t total = taosArrayGetSize(pBlockIter->blockList);
  if (total > 0) {
    return taosArrayGet(pBlockIter->blockList, pBlockIter->index);
  }

  ASSERT(pBlockIter->numOfBlocks == total);
  return NULL;
}

H
Hongze Cheng 已提交
1060
// Returns the address of the SDataBlk embedded in the block iterator.
static SDataBlk* getCurrentBlock(SDataBlockIter* pBlockIter) {
  SDataBlk* pCurrent = &pBlockIter->block;
  return pCurrent;
}
1061

C
Cary Xu 已提交
1062 1063 1064 1065 1066 1067
// Binary search for `key` in keyList, starting at index `pos`.
//   keyList: key array of `num` entries
//            (NOTE(review): the search presumes ascending order - confirm)
//   pos:     start index; must satisfy 0 <= pos < num
//   order:   TSDB_ORDER_ASC searches [pos, num - 1]; any other value searches
//            [0, pos] from pos downwards
// Returns -1 when key lies before keyList[pos] in search direction (smaller
// for ASC, greater otherwise). Otherwise returns the index of key when it is
// present, or else the index of the nearest entry on the pos side of it; a
// key beyond the far end of the range yields that far end.
static int doBinarySearchKey(TSKEY* keyList, int num, int pos, TSKEY key, int order) {
  ASSERT(pos >= 0 && pos < num && num > 0);

  int near = pos;  // bound that started at pos
  int far;         // bound at the other end of the search range

  if (order == TSDB_ORDER_ASC) {
    far = num - 1;
    if (key < keyList[pos]) {
      return -1;
    }

    for (;;) {
      // terminal conditions
      if (key >= keyList[far]) {
        return far;
      }
      if (key <= keyList[near]) {
        return near;
      }
      if (far - near <= 1) {
        return near;
      }

      // shrink the range around the key
      int mid = near + (far - near + 1) / 2;
      if (keyList[mid] > key) {
        far = mid;
      } else if (keyList[mid] < key) {
        near = mid;
      } else {
        return mid;
      }
    }
  }

  // DESC
  far = 0;
  if (key > keyList[pos]) {
    return -1;
  }

  for (;;) {
    // terminal conditions
    if (key <= keyList[far]) {
      return far;
    }
    if (key >= keyList[near]) {
      return near;
    }
    if (near - far <= 1) {
      return near;
    }

    // shrink the range around the key
    int mid = near - (near - far + 1) / 2;
    if (keyList[mid] < key) {
      far = mid;
    } else if (keyList[mid] > key) {
      near = mid;
    } else {
      return mid;
    }
  }
}

H
Haojun Liao 已提交
1110
// Finds the index of the last row of the loaded block that should be returned
// for the current scan.
//   pReader:    supplies scan order, query time window and version range
//   pBlockData: loaded block (aTSKEY and aVersion arrays are read)
//   pBlock:     block descriptor (row count, key range, version range)
//   pos:        row index the scan currently stands at
// The time window is applied first: the far end of the block when the window
// covers it, otherwise the position found by doBinarySearchKey(). When the
// reader's version range only partly covers the block, the position is then
// moved back towards `pos` until a row within the version bound is found.
// Returns the end index, or -1 when no row qualifies. For a descending scan
// the version refinement may instead return pBlock->nRow when no row satisfies
// the minimum version.
static int32_t getEndPosInDataBlock(STsdbReader* pReader, SBlockData* pBlockData, SDataBlk* pBlock, int32_t pos) {
  int32_t endPos = -1;
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

  if (asc && pReader->window.ekey >= pBlock->maxKey.ts) {
    endPos = pBlock->nRow - 1;
  } else if (!asc && pReader->window.skey <= pBlock->minKey.ts) {
    endPos = 0;
  } else {
    int64_t key = asc ? pReader->window.ekey : pReader->window.skey;
    endPos = doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, key, pReader->order);
  }

  // Nothing qualifies by time. Bail out before the version refinement: for a
  // descending scan it would otherwise start at index -1 and read aVersion[-1].
  if (endPos < 0) {
    return -1;
  }

  if ((pReader->verRange.maxVer >= pBlock->minVer && pReader->verRange.maxVer < pBlock->maxVer) ||
      (pReader->verRange.minVer <= pBlock->maxVer && pReader->verRange.minVer > pBlock->minVer)) {
    int32_t i = endPos;

    if (asc) {
      for (; i >= 0; --i) {
        if (pBlockData->aVersion[i] <= pReader->verRange.maxVer) {
          break;
        }
      }
    } else {
      for (; i < pBlock->nRow; ++i) {
        if (pBlockData->aVersion[i] >= pReader->verRange.minVer) {
          break;
        }
      }
    }

    endPos = i;
  }

  return endPos;
}

H
Haojun Liao 已提交
1148
// Copies dumpedRows timestamps from pBlockData->aTSKEY into pColData->pData.
// Ascending: rows [rowIndex, rowIndex + dumpedRows) are copied as they are.
// Descending: rows (rowIndex - dumpedRows, rowIndex] are copied and then
// reversed in place, so the output starts with the row at rowIndex.
static void copyPrimaryTsCol(const SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
                             int32_t dumpedRows, bool asc) {
  // lowest source index taking part in the copy
  const int32_t firstRow = asc ? pDumpInfo->rowIndex : (pDumpInfo->rowIndex - dumpedRows + 1);
  memcpy(pColData->pData, &pBlockData->aTSKEY[firstRow], dumpedRows * sizeof(int64_t));

  if (asc) {
    return;
  }

  // todo: opt perf by extract the loop
  // reverse the copied timestamps
  int64_t* pts = (int64_t*)pColData->pData;
  for (int32_t lo = 0, hi = dumpedRows - 1; lo < hi; ++lo, --hi) {
    int64_t tmp = pts[lo];
    pts[lo] = pts[hi];
    pts[hi] = tmp;
  }
}

H
Haojun Liao 已提交
1168 1169
// A faster version of the copy procedure for fixed-width columns.
// Copies dumpedRows values of pData into pColData->pData in one memcpy,
// reverses them in place for a descending scan, and finally marks NULL rows.
//   pData:      source column of the file block
//   pDumpInfo:  supplies rowIndex, the source row the scan currently stands at
//   pColData:   destination column of the result block
//   dumpedRows: number of rows to copy
//   asc:        true for an ascending scan
static void copyNumericCols(const SColData* pData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
                            int32_t dumpedRows, bool asc) {
  // lowest source row taking part in the copy
  const int32_t firstRow = asc ? pDumpInfo->rowIndex : (pDumpInfo->rowIndex - dumpedRows + 1);
  uint8_t*      pSrc = pData->pData + tDataTypes[pData->type].bytes * firstRow;

  // 1. copy data in a batch model
  memcpy(pColData->pData, pSrc, dumpedRows * tDataTypes[pData->type].bytes);

  // 2. reverse the array list in case of descending order scan data block
  if (!asc) {
    const int32_t last = dumpedRows - 1;

    switch (pColData->info.type) {
      case TSDB_DATA_TYPE_TIMESTAMP:
      case TSDB_DATA_TYPE_DOUBLE:
      case TSDB_DATA_TYPE_BIGINT:
      case TSDB_DATA_TYPE_UBIGINT: {
        int64_t* pv = (int64_t*)pColData->pData;
        for (int32_t lo = 0, hi = last; lo < hi; ++lo, --hi) {
          int64_t tmp = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = tmp;
        }
        break;
      }

      case TSDB_DATA_TYPE_BOOL:
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_UTINYINT: {
        int8_t* pv = (int8_t*)pColData->pData;
        for (int32_t lo = 0, hi = last; lo < hi; ++lo, --hi) {
          int8_t tmp = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = tmp;
        }
        break;
      }

      case TSDB_DATA_TYPE_SMALLINT:
      case TSDB_DATA_TYPE_USMALLINT: {
        int16_t* pv = (int16_t*)pColData->pData;
        for (int32_t lo = 0, hi = last; lo < hi; ++lo, --hi) {
          int16_t tmp = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = tmp;
        }
        break;
      }

      case TSDB_DATA_TYPE_FLOAT:
      case TSDB_DATA_TYPE_INT:
      case TSDB_DATA_TYPE_UINT: {
        int32_t* pv = (int32_t*)pColData->pData;
        for (int32_t lo = 0, hi = last; lo < hi; ++lo, --hi) {
          int32_t tmp = pv[lo];
          pv[lo] = pv[hi];
          pv[hi] = tmp;
        }
        break;
      }

      default:
        // any other type is left in copied order
        break;
    }
  }

  // 3. if the null value exists, check items one-by-one
  if (pData->flag == HAS_VALUE) {
    return;
  }

  const int32_t step = asc ? 1 : -1;
  int32_t       srcRow = pDumpInfo->rowIndex;
  for (int32_t dstRow = 0; dstRow < dumpedRows; ++dstRow, srcRow += step) {
    uint8_t v = tColDataGetBitValue(pData, srcRow);
    if (v == 0 || v == 1) {  // bit values 0 and 1 are both stored as NULL
      colDataSetNull_f(pColData->nullbitmap, dstRow);
      pColData->hasNull = true;
    }
  }
}

1258
// Copies the next batch of qualifying rows from the loaded file block
// (pReader->status.fileBlockData) into the reader's result block.
// Steps:
//   1. on the first visit of a block, locate the start row from the query
//      time window and the version range;
//   2. locate the end row with getEndPosInDataBlock() and cap the batch at
//      the result block capacity;
//   3. copy the primary timestamp column, then every requested column
//      (columns missing from the file block are filled with NULL);
//   4. advance fBlockDumpInfo.rowIndex and mark the block as fully dumped
//      once it is exhausted or the remaining rows fall outside the window.
// Returns TSDB_CODE_SUCCESS (also when there is nothing to copy),
// TSDB_CODE_INVALID_PARA when no start row can be located, or the error
// reported by doCopyColVal().
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  SBlockData*         pBlockData = &pStatus->fileBlockData;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SSDataBlock*        pResBlock = pReader->resBlockInfo.pResBlock;
  int32_t             numOfOutputCols = pSupInfo->numOfCols;
  int32_t             code = TSDB_CODE_SUCCESS;

  SColVal cv = {0};
  int64_t st = taosGetTimestampUs();
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;

  // no data exists, return directly.
  if (pBlockData->nRow == 0 || pBlockData->aTSKEY == NULL) {
    tsdbWarn("%p no need to copy since no data in blockData, table uid:%" PRIu64 " has been dropped, %s", pReader,
             pBlockInfo->uid, pReader->idStr);
    pResBlock->info.rows = 0;
    return 0;
  }

  // row index of dump info remain the initial position, let's find the appropriate start position.
  if ((pDumpInfo->rowIndex == 0 && asc) || (pDumpInfo->rowIndex == pBlock->nRow - 1 && (!asc))) {
    if (asc && pReader->window.skey <= pBlock->minKey.ts && pReader->verRange.minVer <= pBlock->minVer) {
      // the whole block qualifies from its first row: keep rowIndex == 0
    } else if (!asc && pReader->window.ekey >= pBlock->maxKey.ts && pReader->verRange.maxVer >= pBlock->maxVer) {
      // the whole block qualifies from its last row: keep rowIndex == nRow - 1
    } else {  // find the appropriate the start position in current block, and set it to be the current rowIndex
      // the start row is searched from the far end of the block, in the reverse of the scan order
      int32_t pos = asc ? pBlock->nRow - 1 : 0;
      int32_t order = asc ? TSDB_ORDER_DESC : TSDB_ORDER_ASC;
      int64_t key = asc ? pReader->window.skey : pReader->window.ekey;
      pDumpInfo->rowIndex = doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, key, order);

      if (pDumpInfo->rowIndex < 0) {
        tsdbError(
            "%p failed to locate the start position in current block, global index:%d, table index:%d, brange:%" PRId64
            "-%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 " %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->minVer,
            pBlock->maxVer, pReader->idStr);
        return TSDB_CODE_INVALID_PARA;
      }

      ASSERT(pReader->verRange.minVer <= pBlock->maxVer && pReader->verRange.maxVer >= pBlock->minVer);

      // find the appropriate start position that satisfies the version requirement.
      if ((pReader->verRange.maxVer >= pBlock->minVer && pReader->verRange.maxVer < pBlock->maxVer) ||
          (pReader->verRange.minVer <= pBlock->maxVer && pReader->verRange.minVer > pBlock->minVer)) {
        int32_t i = pDumpInfo->rowIndex;
        if (asc) {
          for (; i < pBlock->nRow; ++i) {
            if (pBlockData->aVersion[i] >= pReader->verRange.minVer) {
              break;
            }
          }
        } else {
          for (; i >= 0; --i) {
            if (pBlockData->aVersion[i] <= pReader->verRange.maxVer) {
              break;
            }
          }
        }

        pDumpInfo->rowIndex = i;

        // No row satisfies the version bound: the scan ran off the block. Finish
        // the block here instead of passing an out-of-range start position to
        // getEndPosInDataBlock(), which would hand it to doBinarySearchKey().
        if (i < 0 || i >= pBlock->nRow) {
          setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
          return TSDB_CODE_SUCCESS;
        }
      }
    }
  }

  // time window check
  int32_t endIndex = getEndPosInDataBlock(pReader, pBlockData, pBlock, pDumpInfo->rowIndex);
  if (endIndex == -1) {
    setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
    return TSDB_CODE_SUCCESS;
  }

  // make endIndex exclusive in scan direction
  endIndex += step;
  int32_t dumpedRows = asc ? (endIndex - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex - endIndex);
  if (dumpedRows > pReader->resBlockInfo.capacity) {  // output buffer check
    dumpedRows = pReader->resBlockInfo.capacity;
  } else if (dumpedRows <= 0) {  // no qualified rows in current data block, abort directly.
    setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
    return TSDB_CODE_SUCCESS;
  }

  int32_t i = 0;
  int32_t rowIndex = 0;

  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
  if (pSupInfo->colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
    copyPrimaryTsCol(pBlockData, pDumpInfo, pColData, dumpedRows, asc);
    i += 1;
  }

  // walk the requested columns (i) and the file block columns (colIndex) together, matching on column id
  int32_t colIndex = 0;
  int32_t num = pBlockData->nColData;
  while (i < numOfOutputCols && colIndex < num) {
    rowIndex = 0;

    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);
    if (pData->cid < pSupInfo->colId[i]) {
      colIndex += 1;
    } else if (pData->cid == pSupInfo->colId[i]) {
      pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);

      if (pData->flag == HAS_NONE || pData->flag == HAS_NULL || pData->flag == (HAS_NULL | HAS_NONE)) {
        colDataSetNNULL(pColData, 0, dumpedRows);
      } else {
        if (IS_MATHABLE_TYPE(pColData->info.type)) {
          copyNumericCols(pData, pDumpInfo, pColData, dumpedRows, asc);
        } else {  // varchar/nchar type
          for (int32_t j = pDumpInfo->rowIndex; rowIndex < dumpedRows; j += step) {
            tColDataGetValue(pData, j, &cv);
            code = doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
            if (code) {
              return code;
            }
          }
        }
      }

      colIndex += 1;
      i += 1;
    } else {  // the specified column does not exist in file block, fill with null data
      pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
      colDataSetNNULL(pColData, 0, dumpedRows);
      i += 1;
    }
  }

  // fill the mis-matched columns with null value
  while (i < numOfOutputCols) {
    pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
    colDataSetNNULL(pColData, 0, dumpedRows);
    i += 1;
  }

  pResBlock->info.dataLoad = 1;
  pResBlock->info.rows = dumpedRows;
  pDumpInfo->rowIndex += step * dumpedRows;

  // check if current block are all handled
  if (pDumpInfo->rowIndex >= 0 && pDumpInfo->rowIndex < pBlock->nRow) {
    int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
    if (outOfTimeWindow(ts, &pReader->window)) {  // the remain data has out of query time window, ignore current block
      setBlockAllDumped(pDumpInfo, ts, pReader->order);
    }
  } else {
    int64_t ts = asc ? pBlock->maxKey.ts : pBlock->minKey.ts;
    setBlockAllDumped(pDumpInfo, ts, pReader->order);
  }

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.blockLoadTime += elapsedTime;

  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
  tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", uid:%" PRIu64 " elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, dumpedRows,
            unDumpedRows, pBlock->minVer, pBlock->maxVer, pBlockInfo->uid, elapsedTime, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}

1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444
// Loads the schema of table `uid` into pReader->pSchema (which must still be
// NULL) and initializes pReader->status.merger with it.
// Returns the loaded schema, or NULL with terrno set to the failing call's
// code when either the schema lookup or the merger initialization fails.
static FORCE_INLINE STSchema* getTableSchemaImpl(STsdbReader* pReader, uint64_t uid) {
  ASSERT(pReader->pSchema == NULL);

  int32_t    rc = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, -1, &pReader->pSchema);
  const bool schemaLoaded = (rc == TSDB_CODE_SUCCESS) && (pReader->pSchema != NULL);
  if (!schemaLoaded) {
    terrno = rc;
    tsdbError("failed to get table schema, uid:%" PRIu64 ", it may have been dropped, ver:-1, %s", uid, pReader->idStr);
    return NULL;
  }

  rc = tsdbRowMergerInit(&pReader->status.merger, pReader->pSchema);
  if (rc == TSDB_CODE_SUCCESS) {
    return pReader->pSchema;
  }

  terrno = rc;
  tsdbError("failed to init merger, code:%s, %s", tstrerror(rc), pReader->idStr);
  return NULL;
}

1445 1446
// Loads the file data block the iterator currently points at into pBlockData.
//
// pBlockData is reset first. If the reader has no schema yet it is fetched for `uid`; when that fails the table is
// treated as dropped and 0 is returned without reading anything. Otherwise pBlockData is initialized for the
// requested columns (all entries of suppInfo.colId except the first one) and filled via tsdbReadDataBlock().
// On success the elapsed time is added to cost.blockLoadTime and the dump state is marked as "not all dumped".
//
// Returns TSDB_CODE_SUCCESS, or the error code of tBlockDataInit()/tsdbReadDataBlock().
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter, SBlockData* pBlockData,
                                   uint64_t uid) {
  int32_t   rc = 0;
  STSchema* pTableSchema = pReader->pSchema;
  int64_t   startUs = taosGetTimestampUs();

  tBlockDataReset(pBlockData);

  if (pReader->pSchema == NULL) {
    pTableSchema = getTableSchemaImpl(pReader, uid);
    if (pTableSchema == NULL) {
      tsdbDebug("%p table uid:%" PRIu64 " has been dropped, no data existed, %s", pReader, uid, pReader->idStr);
      return rc;
    }
  }

  SBlockLoadSuppInfo* pSuppInfo = &pReader->suppInfo;
  TABLEID             tableId = {.suid = pReader->suid, .uid = uid};

  // colId[0] is skipped: only the remaining numOfCols - 1 column ids are handed to the block data
  rc = tBlockDataInit(pBlockData, &tableId, pTableSchema, &pSuppInfo->colId[1], pSuppInfo->numOfCols - 1);
  if (rc != TSDB_CODE_SUCCESS) {
    return rc;
  }

  SFileDataBlockInfo* pCurInfo = getCurrentBlockInfo(pBlockIter);
  SFileBlockDumpInfo* pDump = &pReader->status.fBlockDumpInfo;
  SDataBlk*           pCurBlock = getCurrentBlock(pBlockIter);

  rc = tsdbReadDataBlock(pReader->pFileReader, pCurBlock, pBlockData);
  if (rc != TSDB_CODE_SUCCESS) {
    tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
              ", rows:%d, code:%s %s",
              pReader, pBlockIter->index, pCurInfo->tbBlockIdx, pCurBlock->minKey.ts, pCurBlock->maxKey.ts,
              pCurBlock->nRow, tstrerror(rc), pReader->idStr);
    return rc;
  }

  double elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;

  tsdbDebug("%p load file block into buffer, global index:%d, index in table block list:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pCurInfo->tbBlockIdx, pCurBlock->minKey.ts, pCurBlock->maxKey.ts,
            pCurBlock->nRow, pCurBlock->minVer, pCurBlock->maxVer, elapsedMs, pReader->idStr);

  pReader->cost.blockLoadTime += elapsedMs;
  pDump->allDumped = false;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1493

H
Haojun Liao 已提交
1494 1495 1496
// Releases all buffers held by the block-order supporter: the per-table counters, the per-table cursors, each
// per-table wrapper array (for the first pSup->numOfTables tables) and finally the array of those pointers.
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);

  int32_t tableIdx = 0;
  while (tableIdx < pSup->numOfTables) {
    SBlockOrderWrapper* pWrappers = pSup->pDataBlockInfo[tableIdx];
    taosMemoryFreeClear(pWrappers);
    tableIdx += 1;
  }

  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
1505

H
Haojun Liao 已提交
1506 1507
// Allocates the three zero-initialized per-table arrays of the supporter (block counts, cursors, wrapper-array
// pointers), each sized for numOfTables entries.
// Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY after releasing whatever was allocated.
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);

  bool allAllocated =
      (pSup->numOfBlocksPerTable != NULL) && (pSup->indexPerTable != NULL) && (pSup->pDataBlockInfo != NULL);
  if (allAllocated) {
    return TSDB_CODE_SUCCESS;
  }

  cleanupBlockOrderSupporter(pSup);
  return TSDB_CODE_OUT_OF_MEMORY;
}
H
Hongze Cheng 已提交
1518

H
Haojun Liao 已提交
1519
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
1520
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
1521
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
1522

H
Haojun Liao 已提交
1523
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
1524

H
Haojun Liao 已提交
1525 1526
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
1527

H
Haojun Liao 已提交
1528 1529 1530 1531 1532 1533 1534
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
1535

1536
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
1537
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
1538

1539 1540 1541
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

H
Haojun Liao 已提交
1542
// Refreshes pBlockIter->block so that it describes the block the iterator currently points at.
// Does nothing (and succeeds) when the iterator has no current block. Returns terrno when the scan info of the
// owning table cannot be found, TSDB_CODE_SUCCESS otherwise.
static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter, const char* idStr) {
  SFileDataBlockInfo* pCurrent = getCurrentBlockInfo(pBlockIter);
  if (pCurrent == NULL) {
    return TSDB_CODE_SUCCESS;
  }

  STableBlockScanInfo* pTableInfo = getTableBlockScanInfo(pBlockIter->pTableMap, pCurrent->uid, idStr);
  if (pTableInfo == NULL) {
    return terrno;
  }

  // translate the table-local block position into the ordinal index inside the table's block map
  SBlockIndex* pBlkIndex = taosArrayGet(pTableInfo->pBlockList, pCurrent->tbBlockIdx);
  tMapDataGetItemByIdx(&pTableInfo->mapData, pBlkIndex->ordinalIndex, &pBlockIter->block, tGetDataBlk);

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1560

1561
// Builds the global access order of the file data blocks of all tables in pTableList and stores it in
// pBlockIter->blockList; the iterator is then positioned on the first (ascending scan) or last (descending scan)
// block and pBlockIter->block is refreshed.
//
// With a single table the blocks are taken in their table-local order. With several tables a multiway merge tree
// orders the blocks by their offset in the file.
//
// numOfBlocks must equal the total number of entries in the pBlockList of every table in pTableList; otherwise
// TSDB_CODE_INVALID_PARA is returned. Returns TSDB_CODE_OUT_OF_MEMORY on allocation failure, the status of
// doSetCurrentBlock() otherwise.
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks,
                                 SArray* pTableList) {
  bool asc = ASCENDING_TRAVERSE(pReader->order);

  SBlockOrderSupporter sup = {0};
  pBlockIter->numOfBlocks = numOfBlocks;
  taosArrayClear(pBlockIter->blockList);
  pBlockIter->pTableMap = pReader->status.pTableMap;

  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = taosArrayGetSize(pTableList);

  int64_t st = taosGetTimestampUs();
  int32_t code = initBlockOrderSupporter(&sup, numOfTables);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  int32_t cnt = 0;

  for (int32_t i = 0; i < numOfTables; ++i) {
    STableBlockScanInfo* pTableScanInfo = taosArrayGetP(pTableList, i);
    ASSERT(pTableScanInfo->pBlockList != NULL && taosArrayGetSize(pTableScanInfo->pBlockList) > 0);

    size_t num = taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = num;

    SBlockOrderWrapper* pWrappers = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * num);
    if (pWrappers == NULL) {
      cleanupBlockOrderSupporter(&sup);
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    sup.pDataBlockInfo[sup.numOfTables] = pWrappers;

    for (size_t k = 0; k < num; ++k) {
      SBlockIndex* pIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
      pWrappers[k] = (SBlockOrderWrapper){.uid = pTableScanInfo->uid, .offset = pIndex->inFileOffset};
      cnt++;
    }

    sup.numOfTables += 1;
  }

  // Either mismatch makes the indexing below unsafe. The former `&&` could never be true because
  // sup.numOfTables always equals numOfTables once the loop above has finished.
  if (numOfBlocks != cnt || sup.numOfTables != numOfTables) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_INVALID_PARA;
  }

  // since there is only one table qualified, blocks are not sorted
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      if (taosArrayPush(pBlockIter->blockList, &blockInfo) == NULL) {
        cleanupBlockOrderSupporter(&sup);
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }

    int64_t et = taosGetTimestampUs();
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
              pReader, numOfBlocks, (et - st) / 1000.0, pReader->idStr);

    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
    cleanupBlockOrderSupporter(&sup);
    return doSetCurrentBlock(pBlockIter, pReader->idStr);
  }

  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);

  SMultiwayMergeTreeInfo* pTree = NULL;

  // keep the full-width status: narrowing it to uint8_t would discard the upper bits of an error code
  int32_t ret = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
  if (ret != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;

    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    if (taosArrayPush(pBlockIter->blockList, &blockInfo) == NULL) {
      cleanupBlockOrderSupporter(&sup);
      taosMemoryFree(pTree);
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }

    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
  }

  int64_t et = taosGetTimestampUs();
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, numOfBlocks,
            (et - st) / 1000.0, pReader->idStr);
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);

  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
  return doSetCurrentBlock(pBlockIter, pReader->idStr);
}
H
Hongze Cheng 已提交
1666

H
Haojun Liao 已提交
1667
// Advances the iterator by one block in scan direction and refreshes pBlockIter->block.
// Returns false (leaving the iterator untouched) when it already sits on the last block of its direction.
// NOTE(review): the status of doSetCurrentBlock() is not reported to the caller.
static bool blockIteratorNext(SDataBlockIter* pBlockIter, const char* idStr) {
  if (ASCENDING_TRAVERSE(pBlockIter->order)) {
    if (pBlockIter->index >= pBlockIter->numOfBlocks - 1) {
      return false;
    }
    pBlockIter->index += 1;
  } else {
    if (pBlockIter->index <= 0) {
      return false;
    }
    pBlockIter->index -= 1;
  }

  doSetCurrentBlock(pBlockIter, idStr);
  return true;
}

1681 1682 1683
/**
 * This is an two rectangles overlap cases.
 */
H
Hongze Cheng 已提交
1684
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SDataBlk* pBlock) {
1685 1686
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
H
Hongze Cheng 已提交
1687 1688
         (pVerRange->minVer > pBlock->minVer && pVerRange->minVer <= pBlock->maxVer) ||
         (pVerRange->maxVer < pBlock->maxVer && pVerRange->maxVer >= pBlock->minVer);
H
Haojun Liao 已提交
1689
}
H
Hongze Cheng 已提交
1690

1691
// Looks up the block that follows pBlockInfo in scan direction within the block list of the same table.
//
// On success stores the neighbor's position in *nextIndex, copies its SBlockIndex into *pBlockIndex and returns
// true. Returns false, leaving both outputs untouched, when the current block is the last one in scan direction,
// when its position is outside the list, or when the list is empty.
static bool getNeighborBlockOfSameTable(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                        int32_t* nextIndex, int32_t order, SBlockIndex* pBlockIndex) {
  bool asc = ASCENDING_TRAVERSE(order);

  // Compare as signed values: on the unsigned size, "size - 1" wraps around for an empty list and the
  // end-of-list test would then never succeed.
  int32_t numOfBlocks = (int32_t)taosArrayGetSize(pTableBlockScanInfo->pBlockList);
  int32_t current = pBlockInfo->tbBlockIdx;

  if (current < 0) {
    return false;
  }

  if (asc && current >= numOfBlocks - 1) {
    return false;
  }

  if (!asc && current == 0) {
    return false;
  }

  int32_t      neighbor = current + (asc ? 1 : -1);
  SBlockIndex* pNeighbor = taosArrayGet(pTableBlockScanInfo->pBlockList, neighbor);
  if (pNeighbor == NULL) {
    return false;
  }

  *nextIndex = neighbor;
  *pBlockIndex = *pNeighbor;
  return true;
}

// Searches the global block list, starting at the iterator's current position and walking in scan direction,
// for the entry with the same uid and table-local block index as pFBlockInfo.
// Returns its position in the list, or -1 when no such entry is found.
static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
  const int32_t step = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;

  for (int32_t pos = pBlockIter->index; pos < pBlockIter->numOfBlocks && pos >= 0; pos += step) {
    SFileDataBlockInfo* pCandidate = taosArrayGet(pBlockIter->blockList, pos);

    bool sameTable = (pCandidate->uid == pFBlockInfo->uid);
    if (sameTable && pCandidate->tbBlockIdx == pFBlockInfo->tbBlockIdx) {
      return pos;
    }
  }

  return -1;
}

1725
// Moves the iterator by `step` and makes the block stored at position `index` of the global block list the
// current one. If that block is not already at the new iterator position it is removed from its old slot and
// re-inserted at the new position. pBlockIter->block is refreshed afterwards.
// Returns -1 when `index` is outside the block list, TSDB_CODE_SUCCESS otherwise.
// NOTE(review): the status of doSetCurrentBlock() is not reported to the caller.
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
  bool inRange = (index >= 0) && (index < pBlockIter->numOfBlocks);
  if (!inRange) {
    return -1;
  }

  // take a copy first: the entry is about to be removed from the list
  SFileDataBlockInfo moved = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);

  const int32_t target = pBlockIter->index + step;
  pBlockIter->index = target;

  if (index != target) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, target, &moved);

    SFileDataBlockInfo* pPlaced = taosArrayGet(pBlockIter->blockList, target);
    ASSERT(pPlaced->uid == moved.uid && pPlaced->tbBlockIdx == moved.tbBlockIdx);
  }

  doSetCurrentBlock(pBlockIter, "");
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
1745
// todo: this attribute could be acquired during extractin the global ordered block list.
1746
static bool overlapWithNeighborBlock(SDataBlk* pBlock, SBlockIndex* pNeighborBlockIndex, int32_t order) {
1747 1748
  // it is the last block in current file, no chance to overlap with neighbor blocks.
  if (ASCENDING_TRAVERSE(order)) {
1749
    return pBlock->maxKey.ts == pNeighborBlockIndex->window.skey;
1750
  } else {
1751
    return pBlock->minKey.ts == pNeighborBlockIndex->window.ekey;
1752
  }
H
Haojun Liao 已提交
1753
}
H
Hongze Cheng 已提交
1754

H
Hongze Cheng 已提交
1755
// Reports whether the buffered key lies at or before the block in scan direction: key.ts <= block min key for
// an ascending scan, key.ts >= block max key for a descending scan. A key equal to TSKEY_INITIAL_VAL never
// qualifies.
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SDataBlk* pBlock) {
  if (key.ts == TSKEY_INITIAL_VAL) {
    return false;
  }

  if (ASCENDING_TRAVERSE(order)) {
    return key.ts <= pBlock->minKey.ts;
  }

  return key.ts >= pBlock->maxKey.ts;
}
H
Hongze Cheng 已提交
1761

H
Hongze Cheng 已提交
1762
// Reports whether key.ts falls inside the block's [minKey.ts, maxKey.ts] span while the block's version span
// intersects the requested version range.
static bool keyOverlapFileBlock(TSDBKEY key, SDataBlk* pBlock, SVersionRange* pVerRange) {
  bool tsInside = (key.ts >= pBlock->minKey.ts) && (key.ts <= pBlock->maxKey.ts);
  if (!tsInside) {
    return false;
  }

  return (pBlock->maxVer >= pVerRange->minVer) && (pBlock->minVer <= pVerRange->maxVer);
}

H
Hongze Cheng 已提交
1767 1768
// Walks the delete skyline from startIndex upwards and reports whether a skyline point affects the block.
//  - A point inside the block's key span counts when its version is >= the block's min version.
//  - A point before the block counts when its version is >= the block's min version and the next skyline point
//    is located at or after the block's min key.
//  - The walk stops with "no overlap" at the first point that falls into neither category.
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock,
                                       int32_t startIndex) {
  size_t numOfPoints = taosArrayGetSize(pBlockScanInfo->delSkyline);

  for (int32_t i = startIndex; i < numOfPoints; i += 1) {
    TSDBKEY* pPoint = taosArrayGet(pBlockScanInfo->delSkyline, i);

    bool insideBlock = (pPoint->ts >= pBlock->minKey.ts) && (pPoint->ts <= pBlock->maxKey.ts);
    if (insideBlock) {
      if (pPoint->version >= pBlock->minVer) {
        return true;
      }
      continue;
    }

    if (!(pPoint->ts < pBlock->minKey.ts)) {
      return false;
    }

    // the point lies before the block
    if (pPoint->version < pBlock->minVer) {
      continue;
    }

    if (i < numOfPoints - 1) {
      TSDBKEY* pFollowing = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
      if (pFollowing->ts >= pBlock->minKey.ts) {
        return true;
      }
    } else {  // it must be the last point
      ASSERT(pPoint->version == 0);
    }
  }

  return false;
}

H
Hongze Cheng 已提交
1796
// Reports whether the table's delete skyline affects the given block.
//
// Returns false right away when there is no skyline, when the skyline is empty, or when the block's key span lies
// entirely outside the span covered by the skyline. Otherwise the skyline is checked point by point, starting at
// pBlockScanInfo->fileDelIndex for an ascending scan; for a descending scan the start position is first moved
// backwards from fileDelIndex to a point that does not lie after the block's min key.
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock, int32_t order) {
  // an empty skyline has no first/last point to inspect, treat it like a missing one
  if (pBlockScanInfo->delSkyline == NULL || taosArrayGetSize(pBlockScanInfo->delSkyline) == 0) {
    return false;
  }

  // the key span of the block does not intersect the skyline at all
  TSDBKEY* pFirst = taosArrayGet(pBlockScanInfo->delSkyline, 0);
  TSDBKEY* pLast = taosArrayGetLast(pBlockScanInfo->delSkyline);
  if (pBlock->minKey.ts > pLast->ts || pBlock->maxKey.ts < pFirst->ts) {
    return false;
  }

  // key spans intersect, now check the versions
  if (ASCENDING_TRAVERSE(order)) {
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, pBlockScanInfo->fileDelIndex);
  } else {
    int32_t index = pBlockScanInfo->fileDelIndex;
    while (1) {
      TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, index);
      if (p->ts > pBlock->minKey.ts && index > 0) {
        index -= 1;
      } else {  // find the first point that is smaller than the minKey.ts of dataBlock.
        if (p->ts == pBlock->minKey.ts && p->version < pBlock->maxVer && index > 0) {
          index -= 1;
        }
        break;
      }
    }

    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, index);
  }
}

C
Cary Xu 已提交
1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841
// Per-block flags computed by getBlockToLoadInfo(); they describe why a file data block cannot be used as is.
typedef struct {
  bool overlapWithNeighborBlock;  // result of overlapWithNeighborBlock() for the neighbor block of the same table
  bool hasDupTs;                  // pBlock->hasDup for a block with one sub-block, always true otherwise
  bool overlapWithDelInfo;        // result of overlapWithDelSkyline()
  bool overlapWithLastBlock;      // current key of the last-block reader lies within [minKey.ts, maxKey.ts]
  bool overlapWithKeyInBuf;       // result of keyOverlapFileBlock() for the key in the buffer
  bool partiallyRequired;         // result of dataBlockPartiallyRequired()
  bool moreThanCapcity;           // pBlock->nRow exceeds the capacity of the result block
} SDataBlockToLoadInfo;

// Fills pInfo with the flags that describe how the given file block interacts with its neighbor block, the
// delete skyline, the last-block reader, the buffered key, the query window/version range and the capacity of
// the result block. overlapWithNeighborBlock and overlapWithLastBlock are only written when a neighbor block /
// last-block data exist, so callers are expected to pass a zero-initialized pInfo.
static void getBlockToLoadInfo(SDataBlockToLoadInfo* pInfo, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                               STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader,
                               STsdbReader* pReader) {
  int32_t     neighborPos = 0;
  SBlockIndex neighbor = {0};

  // overlap with the neighbor block of the same table, if there is one
  if (getNeighborBlockOfSameTable(pBlockInfo, pScanInfo, &neighborPos, pReader->order, &neighbor)) {
    pInfo->overlapWithNeighborBlock = overlapWithNeighborBlock(pBlock, &neighbor, pReader->order);
  }

  // duplicated ts of different versions inside this block; always assumed when the block has several sub-blocks
  if (pBlock->nSubBlock == 1) {
    pInfo->hasDupTs = pBlock->hasDup;
  } else {
    pInfo->hasDupTs = true;
  }

  pInfo->overlapWithDelInfo = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);

  if (hasDataInLastBlock(pLastBlockReader)) {
    int64_t lastBlockKey = getCurrentKeyInLastBlock(pLastBlockReader);
    bool    outsideBlock = (pBlock->maxKey.ts < lastBlockKey) || (pBlock->minKey.ts > lastBlockKey);
    pInfo->overlapWithLastBlock = !outsideBlock;
  }

  pInfo->moreThanCapcity = pBlock->nRow > pReader->resBlockInfo.capacity;
  pInfo->partiallyRequired = dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock);
  pInfo->overlapWithKeyInBuf = keyOverlapFileBlock(keyInBuf, pBlock, &pReader->verRange);
}

// Decides whether the file block has to be loaded. It must be loaded as soon as one of the following holds:
// 1. it touches the neighbor block of the same table
// 2. it contains duplicated timestamps
// 3. the query window or version range only covers part of it
// 4. the key in the buffer overlaps with it
// 5. it holds more rows than the result block can take
// 6. the delete info overlaps with it
// 7. the current key of the last block falls into it
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo flags = {0};
  getBlockToLoadInfo(&flags, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);

  bool mustLoad = flags.overlapWithNeighborBlock || flags.hasDupTs || flags.partiallyRequired ||
                  flags.overlapWithKeyInBuf || flags.moreThanCapcity || flags.overlapWithDelInfo ||
                  flags.overlapWithLastBlock;
  if (!mustLoad) {
    return false;
  }

  // log the reason why load the datablock for profile
  tsdbDebug("%p uid:%" PRIu64
            " need to load the datablock, overlapneighbor:%d, hasDup:%d, partiallyRequired:%d, "
            "overlapWithKey:%d, greaterThanBuf:%d, overlapWithDel:%d, overlapWithlastBlock:%d, %s",
            pReader, pBlockInfo->uid, flags.overlapWithNeighborBlock, flags.hasDupTs, flags.partiallyRequired,
            flags.overlapWithKeyInBuf, flags.moreThanCapcity, flags.overlapWithDelInfo, flags.overlapWithLastBlock,
            pReader->idStr);

  return true;
}

C
Cary Xu 已提交
1894 1895 1896 1897 1898 1899 1900 1901 1902
// A file block is "clean" when it neither touches its neighbor block, nor contains duplicated timestamps, nor
// overlaps with the key in the buffer, the delete info or the current key of the last block. Unlike
// fileBlockShouldLoad(), the partial-coverage and capacity flags are not considered here.
static bool isCleanFileDataBlock(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                 STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo flags = {0};
  getBlockToLoadInfo(&flags, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);

  bool dirty = flags.overlapWithNeighborBlock || flags.hasDupTs || flags.overlapWithKeyInBuf ||
               flags.overlapWithDelInfo || flags.overlapWithLastBlock;
  return !dirty;
}

1903
// Builds the reader's result block from the table's in-memory iterators (iter / iiter) up to endKey.
// Returns TSDB_CODE_SUCCESS immediately when neither iterator has a value. Otherwise the rows are produced by
// buildDataBlockFromBufImpl(); afterwards the ts window and uid of the result block are updated, the block is
// flagged as composed and the elapsed time is added to cost.buildmemBlock. The status of
// buildDataBlockFromBufImpl() is returned.
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
  bool hasBufData = pBlockScanInfo->iiter.hasVal || pBlockScanInfo->iter.hasVal;
  if (!hasBufData) {
    return TSDB_CODE_SUCCESS;
  }

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  int64_t startUs = taosGetTimestampUs();
  int32_t ret = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->resBlockInfo.capacity, pReader);

  blockDataUpdateTsWindow(pResBlock, pReader->suppInfo.slotId[0]);
  pResBlock->info.id.uid = pBlockScanInfo->uid;

  setComposedBlockFlag(pReader, true);

  double elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%" PRId64 ", brange:%" PRId64
            " - %" PRId64 ", uid:%" PRIu64 ",  %s",
            pReader, elapsedMs, pResBlock->info.rows, pResBlock->info.window.skey, pResBlock->info.window.ekey,
            pBlockScanInfo->uid, pReader->idStr);

  pReader->cost.buildmemBlock += elapsedMs;
  return ret;
}

1928
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, int64_t key,
1929
                                            SFileBlockDumpInfo* pDumpInfo, bool* copied) {
1930 1931 1932
  // opt version
  // 1. it is not a border point
  // 2. the direct next point is not an duplicated timestamp
D
dapan1121 已提交
1933 1934 1935
  int32_t code = TSDB_CODE_SUCCESS;

  *copied = false;
1936 1937
  bool asc = (pReader->order == TSDB_ORDER_ASC);
  if ((pDumpInfo->rowIndex < pDumpInfo->totalRows - 1 && asc) || (pDumpInfo->rowIndex > 0 && (!asc))) {
1938
    int32_t step = pReader->order == TSDB_ORDER_ASC ? 1 : -1;
1939 1940

    int64_t nextKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
1941
    if (nextKey != key) {  // merge is not needed
1942
      code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
D
dapan1121 已提交
1943 1944 1945
      if (code) {
        return code;
      }
1946
      pDumpInfo->rowIndex += step;
D
dapan1121 已提交
1947
      *copied = true;
1948 1949 1950
    }
  }

D
dapan1121 已提交
1951
  return code;
1952 1953
}

1954
static bool nextRowFromLastBlocks(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo,
1955
                                  SVersionRange* pVerRange) {
X
Xiaoyu Wang 已提交
1956
  int32_t step = ASCENDING_TRAVERSE(pLastBlockReader->order) ? 1 : -1;
H
Haojun Liao 已提交
1957

1958 1959
  while (1) {
    bool hasVal = tMergeTreeNext(&pLastBlockReader->mergeTree);
1960
    if (!hasVal) {  // the next value will be the accessed key in stt
1961
      pScanInfo->lastKeyInStt += step;
1962 1963 1964
      return false;
    }

1965
    TSDBROW* pRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
1966
    int64_t key = pRow->pBlockData->aTSKEY[pRow->iRow];
1967
    int64_t ver = pRow->pBlockData->aVersion[pRow->iRow];
1968

1969
    pLastBlockReader->currentKey = key;
1970
    pScanInfo->lastKeyInStt = key;
1971 1972

    if (!hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->lastBlockDelIndex, key, ver, pLastBlockReader->order, pVerRange)) {
1973 1974 1975 1976 1977 1978
      return true;
    }
  }
}

static bool tryCopyDistinctRowFromSttBlock(TSDBROW* fRow, SLastBlockReader* pLastBlockReader,
1979 1980
                                           STableBlockScanInfo* pScanInfo, int64_t ts, STsdbReader* pReader,
                                           bool* copied) {
D
dapan1121 已提交
1981 1982 1983 1984
  int32_t code = TSDB_CODE_SUCCESS;

  *copied = false;

1985
  bool hasVal = nextRowFromLastBlocks(pLastBlockReader, pScanInfo, &pReader->verRange);
1986 1987 1988
  if (hasVal) {
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 != ts) {
1989
      code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
D
dapan1121 已提交
1990 1991 1992
      if (code) {
        return code;
      }
1993

D
dapan1121 已提交
1994 1995
      *copied = true;
      return code;
1996 1997
    }
  } else {
1998
    code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
D
dapan1121 已提交
1999 2000 2001
    if (code) {
      return code;
    }
2002

D
dapan1121 已提交
2003 2004
    *copied = true;
    return code;
2005 2006
  }

D
dapan1121 已提交
2007
  return code;
2008 2009
}

H
Haojun Liao 已提交
2010 2011 2012
// Returns the schema of version `sversion` for table `uid`.
//
// pReader->pSchema is loaded first if it is still missing and is returned when its version matches. Otherwise the
// schema is looked up in pReader->pSchemaMap; on a miss it is fetched from the meta store and cached in that map.
// Returns NULL on failure; terrno carries the status of the failing call.
static FORCE_INLINE STSchema* doGetSchemaForTSRow(int32_t sversion, STsdbReader* pReader, uint64_t uid) {
  // always set the newest schema version in pReader->pSchema
  if (pReader->pSchema == NULL) {
    STSchema* ps = getTableSchemaImpl(pReader, uid);
    if (ps == NULL) {
      return NULL;
    }
  }

  if (pReader->pSchema && sversion == pReader->pSchema->version) {
    return pReader->pSchema;
  }

  void** p = tSimpleHashGet(pReader->pSchemaMap, &sversion, sizeof(sversion));
  if (p != NULL) {
    return *(STSchema**)p;
  }

  STSchema* ptr = NULL;
  int32_t   code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &ptr);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    return NULL;
  }

  code = tSimpleHashPut(pReader->pSchemaMap, &sversion, sizeof(sversion), &ptr, POINTER_BYTES);
  if (code != TSDB_CODE_SUCCESS) {
    // the map did not take the pointer, so nobody else will release the schema
    taosMemoryFree(ptr);
    terrno = code;
    return NULL;
  }

  return ptr;
}

2043
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
2044
                                     SIterInfo* pIter, int64_t key, SLastBlockReader* pLastBlockReader) {
2045
  SRowMerger*         pMerger = &pReader->status.merger;
H
Hongze Cheng 已提交
2046
  SRow*               pTSRow = NULL;
2047 2048 2049
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

2050
  int64_t tsLast = INT64_MIN;
2051
  if (hasDataInLastBlock(pLastBlockReader)) {
2052 2053
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
2054

H
Hongze Cheng 已提交
2055 2056
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
2057

2058 2059 2060 2061 2062 2063 2064 2065 2066
  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    STSchema* ps = getTableSchemaImpl(pReader, pBlockScanInfo->uid);
    if (ps == NULL) {
      return terrno;
    }
  }

2067 2068
  int64_t minKey = 0;
  if (pReader->order == TSDB_ORDER_ASC) {
H
Hongze Cheng 已提交
2069
    minKey = INT64_MAX;  // chosen the minimum value
2070
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
2071 2072
      minKey = tsLast;
    }
2073

2074 2075 2076
    if (minKey > k.ts) {
      minKey = k.ts;
    }
2077

2078
    if (minKey > key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2079 2080 2081 2082
      minKey = key;
    }
  } else {
    minKey = INT64_MIN;
2083
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
2084 2085 2086 2087 2088 2089 2090
      minKey = tsLast;
    }

    if (minKey < k.ts) {
      minKey = k.ts;
    }

2091
    if (minKey < key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2092 2093
      minKey = key;
    }
2094 2095
  }

2096
  // todo remove init
2097 2098
  bool init = false;

2099
  // ASC: file block ---> last block -----> imem -----> mem
H
Hongze Cheng 已提交
2100
  // DESC: mem -----> imem -----> last block -----> file block
2101 2102
  if (pReader->order == TSDB_ORDER_ASC) {
    if (minKey == key) {
2103
      init = true;
2104
      int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2105 2106 2107
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
H
Haojun Liao 已提交
2108
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
2109 2110
    }

2111
    if (minKey == tsLast) {
2112
      TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2113
      if (init) {
2114
        tsdbRowMergerAdd(pMerger, fRow1, NULL);
H
Haojun Liao 已提交
2115
      } else {
2116
        init = true;
2117
        int32_t code = tsdbRowMergerAdd(pMerger, fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2118 2119 2120
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2121
      }
2122
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
2123
    }
2124

2125
    if (minKey == k.ts) {
K
kailixu 已提交
2126 2127 2128 2129
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      if (pSchema == NULL) {
        return terrno;
      }
H
Haojun Liao 已提交
2130
      if (init) {
2131
        tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2132
      } else {
2133
        init = true;
2134
        int32_t code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2135 2136 2137 2138
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
      }
H
Haojun Liao 已提交
2139
      int32_t code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
2140 2141
      if (code != TSDB_CODE_SUCCESS) {
        return code;
2142 2143 2144 2145 2146
      }
    }
  } else {
    if (minKey == k.ts) {
      init = true;
2147
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
2148 2149 2150 2151
      if (pSchema == NULL) {
        return terrno;
      }

2152
      int32_t   code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2153 2154 2155 2156
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

H
Haojun Liao 已提交
2157
      code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
2158
      if (code != TSDB_CODE_SUCCESS || pMerger->pTSchema == NULL) {
H
Haojun Liao 已提交
2159 2160
        return code;
      }
2161 2162
    }

2163
    if (minKey == tsLast) {
2164
      TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2165
      if (init) {
2166
        tsdbRowMergerAdd(pMerger, fRow1, NULL);
H
Haojun Liao 已提交
2167
      } else {
2168
        init = true;
2169
        int32_t code = tsdbRowMergerAdd(pMerger, fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2170 2171 2172
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2173
      }
2174
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
2175 2176 2177
    }

    if (minKey == key) {
H
Haojun Liao 已提交
2178
      if (init) {
2179
        tsdbRowMergerAdd(pMerger, &fRow, NULL);
H
Haojun Liao 已提交
2180
      } else {
2181
        init = true;
2182
        int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2183 2184 2185
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2186
      }
H
Haojun Liao 已提交
2187
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
2188
    }
2189 2190
  }

2191
  int32_t code = tsdbRowMergerGetRow(pMerger, &pTSRow);
2192 2193 2194 2195
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

2196
  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);
2197 2198

  taosMemoryFree(pTSRow);
2199
  tsdbRowMergerClear(pMerger);
D
dapan1121 已提交
2200 2201

  return code;
2202 2203
}

2204 2205 2206
// Build one result row starting from the current row of the stt (last) block reader and append it to
// pReader->resBlockInfo.pResBlock.
//
// Two paths exist:
//  - mergeBlockData is false, or the file block row at pDumpInfo->rowIndex has a different timestamp: only rows of
//    the stt reader take part. tryCopyDistinctRowFromSttBlock() may copy the row directly; in that case only
//    pBlockScanInfo->lastKey is updated and the function returns.
//  - otherwise the stt row is merged via doMergeRowsInLastBlock() and doMergeRowsInFileBlocks().
//
// pBlockData is dereferenced only when mergeBlockData is true, so NULL may be passed together with false.
// Returns TSDB_CODE_SUCCESS, or the first error code reported by a callee.
static int32_t doMergeFileBlockAndLastBlock(SLastBlockReader* pLastBlockReader, STsdbReader* pReader,
                                            STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                            bool mergeBlockData) {
  SRowMerger*         pMerger = &pReader->status.merger;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  int64_t  tsLastBlock = getCurrentKeyInLastBlock(pLastBlockReader);
  int32_t  code = TSDB_CODE_SUCCESS;
  SRow*    pTSRow = NULL;
  TSDBROW* pRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);

  // create local variable to hold the row value
  TSDBROW fRow = {.iRow = pRow->iRow, .type = TSDBROW_COL_FMT, .pBlockData = pRow->pBlockData};

  tsdbTrace("fRow ptr:%p, %d, uid:%" PRIu64 ", %s", pRow->pBlockData, pRow->iRow, pLastBlockReader->uid, pReader->idStr);

  if ((!mergeBlockData) || (tsLastBlock != pBlockData->aTSKEY[pDumpInfo->rowIndex])) {
    // the file data block does not contribute a row for this timestamp
    bool copied = false;
    code = tryCopyDistinctRowFromSttBlock(&fRow, pLastBlockReader, pBlockScanInfo, tsLastBlock, pReader, &copied);
    if (code) {
      return code;
    }

    if (copied) {
      pBlockScanInfo->lastKey = tsLastBlock;
      return TSDB_CODE_SUCCESS;
    }

    code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // the return code of this call used to be dropped; a failure is now reported to the caller
    TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
    code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, pMerger, &pReader->verRange, pReader->idStr);
  } else {
    // mergeBlockData is set and the file block row carries the same timestamp as the stt row
    code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, pMerger, &pReader->verRange, pReader->idStr);

    // merge with block data if ts == key
    if (tsLastBlock == pBlockData->aTSKEY[pDumpInfo->rowIndex]) {
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }
  }

  // both merge paths end the same way: materialize the merged row, append it, release it
  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);

  return code;
}

2286 2287
// Produce one result row when no mem/imem iterator is involved: the candidates are the current file data block
// row (timestamp `key`) and the current stt (last) block row.
//
// If the row merger has no column array yet, the table schema is fetched first via getTableSchemaImpl().
// Returns TSDB_CODE_SUCCESS, terrno when the schema cannot be obtained, or the first error code from a callee.
static int32_t mergeFileBlockAndLastBlock(STsdbReader* pReader, SLastBlockReader* pLastBlockReader, int64_t key,
                                          STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SRowMerger*         pMerger = &pReader->status.merger;

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    STSchema* ps = getTableSchemaImpl(pReader, pBlockScanInfo->uid);
    if (ps == NULL) {
      return terrno;
    }
  }

  // only last block exists
  if (!hasDataInFileBlock(pBlockData, pDumpInfo)) {
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, NULL, false);
  }

  // no last block available, only data block exists
  if (!hasDataInLastBlock(pLastBlockReader)) {
    return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
  }

  // current row of the file data block and current key of the stt reader
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
  int64_t ts = getCurrentKeyInLastBlock(pLastBlockReader);
  ASSERT(ts >= key);

  // desc order
  if (!ASCENDING_TRAVERSE(pReader->order)) {
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, pBlockData, true);
  }

  if (key < ts) {  // imem, mem are all empty, file blocks (data blocks and last block) exist
    return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
  }

  if (key > ts) {
    // excluded by the ASSERT above; nothing is emitted
    return TSDB_CODE_SUCCESS;
  }

  // key == ts: the file block row and the stt row are merged into one row
  SRow*   pTSRow = NULL;
  int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);

  // the return code of this call used to be dropped; a failure is now reported to the caller
  TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
  code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, ts, pMerger, &pReader->verRange, pReader->idStr);

  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);
  return code;
}

2349 2350
// Merge the rows that share the next timestamp across all four sources -- mem, imem, the stt (last) block and the
// file data block -- into one row and append it to pReader->resBlockInfo.pResBlock.
//
// The next timestamp is the smallest candidate key for an ascending scan and the largest one for a descending scan.
// Sources are fed to the row merger in this order:
//   ASC:  file block -> last block -> imem -> mem
//   DESC: mem -> imem -> last block -> file block
//
// Returns TSDB_CODE_SUCCESS, terrno when a schema cannot be obtained, or the first error code from a callee.
static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                     SLastBlockReader* pLastBlockReader) {
  SRowMerger*         pMerger = &pReader->status.merger;
  SRow*               pTSRow = NULL;
  int32_t             code = TSDB_CODE_SUCCESS;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SArray*             pDelList = pBlockScanInfo->delSkyline;

  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);

  // INT64_MIN stands for "this source has no current row"
  int64_t tsLast = INT64_MIN;
  if (hasDataInLastBlock(pLastBlockReader)) {
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }

  int64_t key = hasDataInFileBlock(pBlockData, pDumpInfo) ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN;

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

  // a missing schema is a failure: report terrno instead of the still-successful `code`
  STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
  if (pSchema == NULL) {
    return terrno;
  }

  STSchema* piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
  if (piSchema == NULL) {
    return terrno;
  }

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    STSchema* ps = getTableSchemaImpl(pReader, pBlockScanInfo->uid);
    if (ps == NULL) {
      return terrno;
    }
  }

  int64_t minKey = 0;
  if (ASCENDING_TRAVERSE(pReader->order)) {
    minKey = INT64_MAX;  // let's find the minimum
    if (minKey > k.ts) {
      minKey = k.ts;
    }

    if (minKey > ik.ts) {
      minKey = ik.ts;
    }

    if (minKey > key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
      minKey = key;
    }

    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
      minKey = tsLast;
    }
  } else {
    minKey = INT64_MIN;  // let's find the maximum ts value
    if (minKey < k.ts) {
      minKey = k.ts;
    }

    if (minKey < ik.ts) {
      minKey = ik.ts;
    }

    if (minKey < key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
      minKey = key;
    }

    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
      minKey = tsLast;
    }
  }

  // becomes true once the first row has been handed to the merger; later file/stt rows are then added with a
  // NULL schema, exactly as before
  bool init = false;

  // ASC: file block -----> last block -----> imem -----> mem
  // DESC: mem -----> imem -----> last block -----> file block
  if (ASCENDING_TRAVERSE(pReader->order)) {
    if (minKey == key) {
      init = true;
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }

    if (minKey == tsLast) {
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      code = tsdbRowMergerAdd(pMerger, pRow1, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
      init = true;

      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
    }

    if (minKey == ik.ts) {
      code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
      init = true;

      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == k.ts) {
      code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
  } else {
    if (minKey == k.ts) {
      init = true;
      code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == ik.ts) {
      code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
      init = true;

      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == tsLast) {
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      code = tsdbRowMergerAdd(pMerger, pRow1, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
      init = true;

      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
    }

    if (minKey == key) {
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      code = tsdbRowMergerAdd(pMerger, &fRow, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }
  }

  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);
  return code;
}

2558 2559 2560 2561 2562 2563 2564 2565 2566
// Create the iterators over the mem table (pReadSnap->pMem) and the imem table (pReadSnap->pIMem) for one table,
// then build its delete skyline through initDelSkylineIterator().
//
// Nothing is done when pBlockScanInfo->iterInit is already set. The start key is pBlockScanInfo->lastKey moved one
// step in the scan direction, paired with verRange.minVer (ascending) or verRange.maxVer (descending).
//
// Returns TSDB_CODE_SUCCESS, or the error code of tsdbTbDataIterCreate(); in the error case iterInit stays unset.
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  TSDBKEY startKey = {0};
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = (TSDBKEY){.ts = pBlockScanInfo->lastKey + 1, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pBlockScanInfo->lastKey - 1, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!ASCENDING_TRAVERSE(pReader->order));

  // mem table
  STbData* d = NULL;
  if (pReader->pReadSnap->pMem != NULL) {
    d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
    if (d != NULL) {
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);

        tsdbDebug("%p uid:%" PRIu64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
      } else {
        // this branch handles the mem table; the message previously said "imem"
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  // imem table
  STbData* di = NULL;
  if (pReader->pReadSnap->pIMem != NULL) {
    di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
    if (di != NULL) {
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);

        tsdbDebug("%p uid:%" PRIu64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
      } else {
        // this branch handles the imem table; the message previously said "mem"
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  // the time spent building the delete skyline is accounted in the reader's cost record
  int64_t st = taosGetTimestampUs();
  initDelSkylineIterator(pBlockScanInfo, pReader, d, di);
  pReader->cost.initDelSkylineIterTime += (taosGetTimestampUs() - st) / 1000.0;

  pBlockScanInfo->iterInit = true;
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
2627 2628
// Tell whether the file block row at pDumpInfo->rowIndex qualifies for the current table scan.
// A row is rejected when it belongs to another table (multi-table block), when its version or timestamp is outside
// the reader's ranges, or when hasBeenDropped() reports it against the table's delete skyline.
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  // aUid is only present for a multi-table data block
  if (pBlockData->aUid != NULL && pBlockData->aUid[pDumpInfo->rowIndex] != pBlockScanInfo->uid) {
    return false;  // row of another table, move to next row
  }

  // version range check
  const int64_t ver = pBlockData->aVersion[pDumpInfo->rowIndex];
  const bool    verInRange = (ver >= pReader->verRange.minVer) && (ver <= pReader->verRange.maxVer);
  if (!verInRange) {
    return false;
  }

  // time window check
  const int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  const bool    tsInWindow = (ts >= pReader->window.skey) && (ts <= pReader->window.ekey);
  if (!tsInWindow) {
    return false;
  }

  return !hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, ts, ver, pReader->order,
                         &pReader->verRange);
}

2656
// Bind the stt (last) block reader to the table described by pScanInfo and position it on its first row.
// Returns true when the reader has a row available for that table, false otherwise (including when the merge tree
// cannot be opened).
static bool initLastBlockReader(SLastBlockReader* pLBlockReader, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  // already bound to this table: only report whether a row is still available
  if (pLBlockReader->uid == pScanInfo->uid) {
    return hasDataInLastBlock(pLBlockReader);
  }

  // bound to a different table: release that merge tree first
  if (pLBlockReader->uid != 0) {
    tMergeTreeClose(&pLBlockReader->mergeTree);
  }

  // NOTE(review): the return code of initMemDataIterator() is not inspected here
  initMemDataIterator(pScanInfo, pReader);
  pLBlockReader->uid = pScanInfo->uid;

  // resume from the last key already accessed in the stt files, on the side the scan starts from
  STimeWindow w = pLBlockReader->window;
  if (pLBlockReader->order == TSDB_ORDER_ASC) {
    w.skey = pScanInfo->lastKeyInStt;
  } else {
    w.ekey = pScanInfo->lastKeyInStt;
  }

  tsdbDebug("init last block reader, window:%" PRId64 "-%" PRId64 ", uid:%" PRIu64 ", %s", w.skey, w.ekey,
            pScanInfo->uid, pReader->idStr);

  int32_t code = tMergeTreeOpen(&pLBlockReader->mergeTree, (pLBlockReader->order == TSDB_ORDER_DESC),
                                pReader->pFileReader, pReader->suid, pScanInfo->uid, &w, &pLBlockReader->verRange,
                                pLBlockReader->pInfo, false, pReader->idStr, false, pReader->status.pLDataIter);

  return (code == TSDB_CODE_SUCCESS) && nextRowFromLastBlocks(pLBlockReader, pScanInfo, &pReader->verRange);
}

H
Hongze Cheng 已提交
2688
// The stt (last) block reader has a current row exactly when its merge tree holds a non-NULL iterator.
static bool hasDataInLastBlock(SLastBlockReader* pLastBlockReader) {
  const bool noIter = (pLastBlockReader->mergeTree.pIter == NULL);
  return !noIter;
}
2689

2690
bool hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo) {
H
Haojun Liao 已提交
2691
  if ((pBlockData->nRow > 0) && (pBlockData->nRow != pDumpInfo->totalRows)) {
2692
    return false;  // this is an invalid result.
2693
  }
2694
  return pBlockData->nRow > 0 && (!pDumpInfo->allDumped);
2695
}
2696

2697 2698
// Emit one result row for timestamp `key` using only the file data block.
// tryCopyDistinctRowFromFileBlock() gets the first chance; when it reports `copied`, only
// pBlockScanInfo->lastKey is updated. Otherwise the row at pDumpInfo->rowIndex is run through the row merger
// together with doMergeRowsInFileBlocks() and the merged row is appended to the result block.
// Returns TSDB_CODE_SUCCESS, terrno when the table schema cannot be obtained, or a callee's error code.
int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                              STsdbReader* pReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SRowMerger*         pMerger = &pReader->status.merger;

  bool    copied = false;
  int32_t code = tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo, &copied);
  if (code) {
    return code;
  }

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    if (getTableSchemaImpl(pReader, pBlockScanInfo->uid) == NULL) {
      return terrno;
    }
  }

  if (copied) {
    pBlockScanInfo->lastKey = key;
    return TSDB_CODE_SUCCESS;
  }

  // not copied: go through the row merger
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
  SRow*   pMerged = NULL;

  code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);

  code = tsdbRowMergerGetRow(pMerger, &pMerged);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pMerged, pBlockScanInfo);

  taosMemoryFree(pMerged);
  tsdbRowMergerClear(pMerger);
  return code;
}

H
Haojun Liao 已提交
2742 2743
// Pick the merge routine that matches the sources currently holding data and let it emit one result row:
//   mem + imem (+ files)  -> doMergeMultiLevelRows()
//   imem only (+ files)   -> doMergeBufAndFileRows() with the imem iterator
//   mem only (+ files)    -> doMergeBufAndFileRows() with the mem iterator
//   files only            -> mergeFileBlockAndLastBlock()
// Returns the code of the routine that was invoked.
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo,
                                          SBlockData* pBlockData, SLastBlockReader* pLastBlockReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  // timestamp of the current file block row, INT64_MIN when the block has nothing left
  int64_t key = INT64_MIN;
  if (pBlockData->nRow > 0 && (!pDumpInfo->allDumped)) {
    key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  }

  TSDBROW* pRow = NULL;
  if (pBlockScanInfo->iter.hasVal) {
    pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  }

  TSDBROW* piRow = NULL;
  if (pBlockScanInfo->iiter.hasVal) {
    piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
  }

  // two levels of mem-table does contain the valid rows
  if (pRow != NULL && piRow != NULL) {
    return doMergeMultiLevelRows(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
  }

  // hasVal is read again on purpose: the original code re-tests it after the getValidMemRow() calls above
  if (pBlockScanInfo->iiter.hasVal) {  // imem + file + last block
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, key, pLastBlockReader);
  }

  if (pBlockScanInfo->iter.hasVal) {  // mem + file + last block
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, key, pLastBlockReader);
  }

  // files data blocks + last block
  return mergeFileBlockAndLastBlock(pReader, pLastBlockReader, key, pBlockScanInfo, pBlockData);
}

H
Haojun Liao 已提交
2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814
// If the current file block overlaps its neighbor block of the same table (in scan direction), make that neighbor
// the active block, load its data and reset the dump info.
// *loadNeighbor is set to true only when the neighbor was loaded successfully.
// Returns TSDB_CODE_SUCCESS, or the error code of doLoadFileBlockData().
static int32_t loadNeighborIfOverlap(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pBlockScanInfo,
                                     STsdbReader* pReader, bool* loadNeighbor) {
  int32_t     step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
  int32_t     nextIndex = -1;
  SBlockIndex nxtBIndex = {0};

  *loadNeighbor = false;
  SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);

  // no neighbor block of the same table: do nothing
  if (!getNeighborBlockOfSameTable(pBlockInfo, pBlockScanInfo, &nextIndex, pReader->order, &nxtBIndex)) {
    return TSDB_CODE_SUCCESS;
  }

  // a neighbor exists but does not overlap: do nothing
  if (!overlapWithNeighborBlock(pBlock, &nxtBIndex, pReader->order)) {
    return TSDB_CODE_SUCCESS;
  }

  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pBlockIter = &pStatus->blockIter;

  // 1. find the next neighbor block in the scan block list
  SFileDataBlockInfo fb = {.uid = pBlockInfo->uid, .tbBlockIdx = nextIndex};
  int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &fb);

  // 2. remove it from the scan block list
  setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);

  // 3. load the neighbor block, and set it to be the currently accessed file data block
  int32_t code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pBlockInfo->uid);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // 4. check the data values
  initBlockDumpInfo(pReader, pBlockIter);
  *loadNeighbor = true;

  return code;
}

2815
// Finalize the result block after a composed build: stamp the table uid (0 when pBlockScanInfo is NULL), mark the
// data as loaded, refresh the timestamp window, flag the block as composed and add `el` to the cost counters.
static void updateComposedBlockInfo(STsdbReader* pReader, double el, STableBlockScanInfo* pBlockScanInfo) {
  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  if (pBlockScanInfo != NULL) {
    pResBlock->info.id.uid = pBlockScanInfo->uid;
  } else {
    pResBlock->info.id.uid = 0;
  }

  pResBlock->info.dataLoad = 1;
  blockDataUpdateTsWindow(pResBlock, pReader->suppInfo.slotId[0]);

  setComposedBlockFlag(pReader, true);

  pReader->cost.buildComposedBlockTime += el;
  pReader->cost.composedBlocks += 1;
}

2828
// Fill pReader->resBlockInfo.pResBlock with rows composed from the current file data block and the other sources
// of the same table.
//
// Flow:
//  1. Resolve the table scan info, from the current file block info or, when there is none, from
//     pReader->status.pTableIter. A table listed in *pReader->pIgnoreTables is skipped: the block is marked as
//     dumped and the function returns right away, without the bookkeeping at _end.
//  2. A clean file block that fits the result capacity is copied as a whole (ascending scan, or no stt data).
//  3. Otherwise rows are produced one by one through buildComposedDataBlockImpl() until the file block is
//     consumed, no qualified file block row is left, or the result block reaches its capacity.
//  4. At _end the elapsed time and the composed-block info are recorded.
static int32_t buildComposedDataBlock(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  SSDataBlock*    pResBlock = pReader->resBlockInfo.pResBlock;

  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SLastBlockReader*   pLastBlockReader = pStatus->fileIter.pLastBlockReader;

  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t st = taosGetTimestampUs();
  int32_t step = asc ? 1 : -1;
  double  el = 0;

  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
  SBlockData*         pBlockData = &pStatus->fileBlockData;

  STableBlockScanInfo* pBlockScanInfo = NULL;
  if (pBlockInfo == NULL) {  // file blocks not exist
    pBlockScanInfo = *pStatus->pTableIter;
    if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockScanInfo->uid, sizeof(pBlockScanInfo->uid))) {
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      return code;
    }
  } else {
    if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      return code;
    }

    pBlockScanInfo = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, pReader->idStr);
    if (pBlockScanInfo == NULL) {
      goto _end;
    }

    TSDBKEY keyInBuf = getCurrentKeyInBuf(pBlockScanInfo, pReader);

    // it is a clean block, load it directly
    if (isCleanFileDataBlock(pReader, pBlockInfo, pBlock, pBlockScanInfo, keyInBuf, pLastBlockReader) &&
        pBlock->nRow <= pReader->resBlockInfo.capacity && (asc || (!hasDataInLastBlock(pLastBlockReader)))) {
      code = copyBlockDataToSDataBlock(pReader);
      if (code) {
        goto _end;
      }

      // record the last key value
      if (asc) {
        pBlockScanInfo->lastKey = pBlock->maxKey.ts;
      } else {
        pBlockScanInfo->lastKey = pBlock->minKey.ts;
      }
      goto _end;
    }
  }

  for (;;) {
    bool hasBlockData = false;

    // find the first qualified row in data block
    while (pBlockData->nRow > 0 && pBlockData->uid == pBlockScanInfo->uid) {
      if (isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
        hasBlockData = true;
        break;
      }

      pDumpInfo->rowIndex += step;

      pBlock = getCurrentBlock(pBlockIter);
      if (pDumpInfo->rowIndex >= 0 && pDumpInfo->rowIndex < pBlock->nRow) {
        continue;  // still inside the current block, test the next row
      }

      pBlockInfo = getCurrentBlockInfo(pBlockIter);  // NOTE: get the new block info

      // continue check for the next file block if the last ts in the current block
      // is overlapped with the next neighbor block
      bool loadNeighbor = false;
      code = loadNeighborIfOverlap(pBlockInfo, pBlockScanInfo, pReader, &loadNeighbor);
      if ((code != 0) || (!loadNeighbor)) {
        setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
        break;
      }
    }

    // no data in last block and block, no need to proceed.
    if (!hasBlockData) {
      break;
    }

    code = buildComposedDataBlockImpl(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
    if (code) {
      goto _end;
    }

    // currently loaded file data block is consumed
    if ((pBlockData->nRow > 0) && (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0)) {
      pBlock = getCurrentBlock(pBlockIter);
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      break;
    }

    // result block is full
    if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
      break;
    }
  }

_end:
  el = (taosGetTimestampUs() - st) / 1000.0;
  updateComposedBlockInfo(pReader, el, pBlockScanInfo);

  if (pResBlock->info.rows > 0) {
    tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
              ", elapsed time:%.2f ms %s",
              pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
              pResBlock->info.rows, el, pReader->idStr);
  }

  return code;
}

// record in the reader status whether the current result block is flagged as a composed data block
void setComposedBlockFlag(STsdbReader* pReader, bool composed) {
  pReader->status.composedDataBlock = composed;
}

2947 2948 2949 2950 2951 2952 2953 2954
// Initial position inside the delete skyline for the given scan order: 0 for an ascending scan (or when
// there is no skyline at all), otherwise the index of the last skyline entry.
int32_t getInitialDelIndex(const SArray* pDelSkyline, int32_t order) {
  if (pDelSkyline == NULL || ASCENDING_TRAVERSE(order)) {
    return 0;
  }

  return taosArrayGetSize(pDelSkyline) - 1;
}

dengyihao's avatar
dengyihao 已提交
2955 2956
// Build the delete skyline of one table and reset all of its delete indexes to the initial position.
//
// Delete records are collected from the del file (only when the table has an entry in pReader->pDelIdx)
// and from the two optional in-memory tables; in-memory records with a version greater than
// pReader->verRange.maxVer are skipped. Nothing is done if pBlockScanInfo->delSkyline already exists.
//
// Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY when a working array cannot be allocated or grown,
// or the code reported by tsdbReadDelDatav1()/tsdbBuildDeleteSkyline().
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;
  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));
  if (pDelData == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  // delete records persisted in the del file
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
  if (pDelFile && taosArrayGetSize(pReader->pDelIdx) > 0) {
    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
    SDelIdx* pIdx = taosArraySearch(pReader->pDelIdx, &idx, tCmprDelIdx, TD_EQ);

    if (pIdx != NULL) {
      code = tsdbReadDelDatav1(pReader->pDelFReader, pIdx, pDelData, pReader->verRange.maxVer);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  }

  // delete records still in memory: mem first, then imem
  STbData* memTables[2] = {pMemTbData, piMemTbData};
  for (int32_t i = 0; i < 2; ++i) {
    if (memTables[i] == NULL) {
      continue;
    }

    for (SDelData* p = memTables[i]->pHead; p != NULL; p = p->pNext) {
      if (p->version > pReader->verRange.maxVer) {
        continue;
      }

      if (taosArrayPush(pDelData, p) == NULL) {
        code = TSDB_CODE_OUT_OF_MEMORY;
        goto _err;
      }
    }
  }

  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    if (pBlockScanInfo->delSkyline == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _err;
    }

    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);

  // every consumer of the skyline starts from the same initial position
  int32_t index = getInitialDelIndex(pBlockScanInfo->delSkyline, pReader->order);

  pBlockScanInfo->iter.index = index;
  pBlockScanInfo->iiter.index = index;
  pBlockScanInfo->fileDelIndex = index;
  pBlockScanInfo->lastBlockDelIndex = index;

  return code;

_err:
  taosArrayDestroy(pDelData);
  return code;
}

C
Cary Xu 已提交
3019
// Key of the row that the buffer (mem + imem) would deliver next for this table.
// When both iterators have a valid row, the smaller timestamp wins for an ascending scan and the larger
// one for a descending scan (mem is picked for an ascending tie, imem for a descending tie). When neither
// has a row, a key whose ts is TSKEY_INITIAL_VAL is returned.
TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  const bool asc = ASCENDING_TRAVERSE(pReader->order);

  TSDBKEY memKey = {.ts = TSKEY_INITIAL_VAL};
  TSDBKEY imemKey = {.ts = TSKEY_INITIAL_VAL};

  TSDBROW* pMemRow = getValidMemRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
  if (pMemRow != NULL) {
    memKey = TSDBROW_KEY(pMemRow);
  }

  TSDBROW* pImemRow = getValidMemRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
  if (pImemRow != NULL) {
    imemKey = TSDBROW_KEY(pImemRow);
  }

  // nothing in mem: imemKey is either the imem key or still the initial value
  if (pMemRow == NULL) {
    return imemKey;
  }

  // only mem has data
  if (pImemRow == NULL) {
    return memKey;
  }

  // both mem and imem have data
  const bool memNotLater = (memKey.ts <= imemKey.ts);
  if (asc) {
    return memNotLater ? memKey : imemKey;
  }

  return memNotLater ? imemKey : memKey;
}

3055
// Advance the fileset iterator to the next fileset that holds data for the queried tables.
//
// On return pBlockNum carries the number of data blocks and last files found in that fileset (both zero
// when every fileset has been visited). After the loop the del file reader and the del index of the read
// snapshot are opened/loaded if that has not been done yet.
//
// Returns TSDB_CODE_SUCCESS, pReader->code when the reader has been stopped, TSDB_CODE_OUT_OF_MEMORY, or
// the error reported by one of the called loaders.
static int32_t moveToNextFile(STsdbReader* pReader, SBlockNumber* pBlockNum, SArray* pTableList) {
  SReaderStatus* pStatus = &pReader->status;
  pBlockNum->numOfBlocks = 0;
  pBlockNum->numOfLastFiles = 0;

  size_t  numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBlockIdx));
  if (pIndexList == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  while (1) {
    // only check here, since the iterate data in memory is very fast.
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      taosArrayDestroy(pIndexList);  // do not leak the index list when the reader is stopped
      return pReader->code;
    }

    bool    hasNext = false;
    int32_t code = filesetIteratorNext(&pStatus->fileIter, pReader, &hasNext);
    if (code != TSDB_CODE_SUCCESS) {
      taosArrayDestroy(pIndexList);
      return code;
    }

    if (!hasNext) {  // no data files on disk
      break;
    }

    taosArrayClear(pIndexList);
    code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
    if (code != TSDB_CODE_SUCCESS) {
      taosArrayDestroy(pIndexList);
      return code;
    }

    if (taosArrayGetSize(pIndexList) > 0 || pReader->pFileReader->pSet->nSttF > 0) {
      code = doLoadFileBlock(pReader, pIndexList, pBlockNum, pTableList);
      if (code != TSDB_CODE_SUCCESS) {
        taosArrayDestroy(pIndexList);
        return code;
      }

      if (pBlockNum->numOfBlocks + pBlockNum->numOfLastFiles > 0) {
        break;
      }
    }

    // no blocks in current file, try next files
  }

  taosArrayDestroy(pIndexList);

  if (pReader->pReadSnap != NULL) {
    SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
    if (pReader->pDelFReader == NULL && pDelFile != NULL) {
      int32_t code = tsdbDelFReaderOpen(&pReader->pDelFReader, pDelFile, pReader->pTsdb);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      pReader->pDelIdx = taosArrayInit(4, sizeof(SDelIdx));
      if (pReader->pDelIdx == NULL) {
        code = TSDB_CODE_OUT_OF_MEMORY;
        return code;
      }

      code = tsdbReadDelIdx(pReader->pDelFReader, pReader->pDelIdx);
      if (code != TSDB_CODE_SUCCESS) {
        taosArrayDestroy(pReader->pDelIdx);
        pReader->pDelIdx = NULL;  // avoid leaving a dangling pointer in the reader
        return code;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}

X
Xiaoyu Wang 已提交
3130
// rewind the uid list to its first entry and point the table iterator at that table's slot in the table map
static void resetTableListIndex(SReaderStatus* pStatus) {
  STableUidList* pUidList = &pStatus->uidList;
  pUidList->currentIndex = 0;

  uint64_t firstUid = pUidList->tableUidList[0];
  pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &firstUid, sizeof(firstUid));
}

3138
// Step to the next uid in the ordered list and look it up in the table map.
// Returns false (with pTableIter set to NULL) when the list is exhausted or the uid is not in the map.
static bool moveToNextTable(STableUidList* pOrderedCheckInfo, SReaderStatus* pStatus) {
  pOrderedCheckInfo->currentIndex += 1;

  if (pOrderedCheckInfo->currentIndex < tSimpleHashGetSize(pStatus->pTableMap)) {
    uint64_t nextUid = pOrderedCheckInfo->tableUidList[pOrderedCheckInfo->currentIndex];
    pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &nextUid, sizeof(nextUid));
    return (pStatus->pTableIter != NULL);
  }

  pStatus->pTableIter = NULL;
  return false;
}

3150
// Walk the tables starting at the current table iterator and fill the result block from the last files of
// the current fileset. Returns as soon as one table produced rows, when the table list is exhausted, or
// when an error occurs (pReader->code or the code of buildComposedDataBlockImpl()).
static int32_t doLoadLastBlockSequentially(STsdbReader* pReader) {
  SReaderStatus*    pStatus = &pReader->status;
  SLastBlockReader* pLastBlockReader = pStatus->fileIter.pLastBlockReader;
  STableUidList*    pUidList = &pStatus->uidList;
  SSDataBlock*      pResBlock = NULL;
  int32_t           code = TSDB_CODE_SUCCESS;

  if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
    return TSDB_CODE_SUCCESS;
  }

  pResBlock = pReader->resBlockInfo.pResBlock;

  for (;;) {
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      return pReader->code;
    }

    // load the last data block of current table
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;
    bool ignored = pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pScanInfo->uid, sizeof(pScanInfo->uid));

    // reset the index in last block when handing a new file
    doCleanupTableScanInfo(pScanInfo);
    pStatus->mapDataCleaned = true;

    // skip tables that are ignored or have nothing in the last files
    if (ignored || !initLastBlockReader(pLastBlockReader, pScanInfo, pReader)) {
      if (!moveToNextTable(pUidList, pStatus)) {
        return TSDB_CODE_SUCCESS;
      }

      continue;
    }

    int64_t st = taosGetTimestampUs();

    // keep composing rows until the last block runs dry or the result block is full
    while (hasDataInLastBlock(pLastBlockReader)) {
      code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
      if (code) {
        return code;
      }

      if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
        break;
      }
    }

    double el = (taosGetTimestampUs() - st) / 1000.0;
    updateComposedBlockInfo(pReader, el, pScanInfo);

    if (pResBlock->info.rows > 0) {
      tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
                ", elapsed time:%.2f ms %s",
                pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                pResBlock->info.rows, el, pReader->idStr);
      return TSDB_CODE_SUCCESS;
    }

    // current table is exhausted, let's try next table
    if (!moveToNextTable(pUidList, pStatus)) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

3235
// Produce the next result block for the file block the block iterator currently points at. One of four
// paths is taken:
//   1. the file block has to be loaded: load it and build a composed block;
//   2. buffered rows fall into the gap before the file block: emit those rows first;
//   3. descending scan with data left in the last block: emit the last-block rows first;
//   4. otherwise the whole file block is returned by reference (only the block info is filled in).
static int32_t doBuildDataBlock(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);

  // blocks of ignored tables are marked as consumed without being read
  if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
    setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);
    return code;
  }

  if (pReader->code != TSDB_CODE_SUCCESS) {
    return pReader->code;
  }

  STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr);
  if (pScanInfo == NULL) {
    return terrno;
  }

  initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
  TSDBKEY keyInBuf = getCurrentKeyInBuf(pScanInfo, pReader);

  if (fileBlockShouldLoad(pReader, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader)) {
    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pScanInfo->uid);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // build composed data block
    code = buildComposedDataBlock(pReader);
  } else if (bufferDataInFileBlockGap(pReader->order, keyInBuf, pBlock)) {
    // data in memory that are earlier than current file block
    // rows in buffer should be less than the file block in asc, greater than file block in desc
    int64_t endKey = pBlock->maxKey.ts;
    if (ASCENDING_TRAVERSE(pReader->order)) {
      endKey = pBlock->minKey.ts;
    }

    code = buildDataBlockFromBuf(pReader, pScanInfo, endKey);
  } else if (hasDataInLastBlock(pLastBlockReader) && !ASCENDING_TRAVERSE(pReader->order)) {
    // only return the rows in last block
    int64_t tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
    ASSERT(tsLast >= pBlock->maxKey.ts);

    SBlockData* pBData = &pReader->status.fileBlockData;
    tBlockDataReset(pBData);

    SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
    tsdbDebug("load data in last block firstly, due to desc scan data, %s", pReader->idStr);

    int64_t st = taosGetTimestampUs();

    // keep composing rows until the last block runs dry or the result block is full
    while (hasDataInLastBlock(pLastBlockReader)) {
      code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
      if (code) {
        return code;
      }

      if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
        break;
      }
    }

    double el = (taosGetTimestampUs() - st) / 1000.0;
    updateComposedBlockInfo(pReader, el, pScanInfo);

    if (pResBlock->info.rows > 0) {
      tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
                ", elapsed time:%.2f ms %s",
                pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                pResBlock->info.rows, el, pReader->idStr);
    }
  } else {
    // whole block is required, return it directly
    SDataBlockInfo* pInfo = &pReader->resBlockInfo.pResBlock->info;
    pInfo->rows = pBlock->nRow;
    pInfo->id.uid = pScanInfo->uid;
    pInfo->dataLoad = 0;
    pInfo->window = (STimeWindow){.skey = pBlock->minKey.ts, .ekey = pBlock->maxKey.ts};
    setComposedBlockFlag(pReader, false);
    setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);

    // update the last key for the corresponding table
    pScanInfo->lastKey = ASCENDING_TRAVERSE(pReader->order) ? pInfo->window.ekey : pInfo->window.skey;
    tsdbDebug("%p uid:%" PRIu64
              " clean file block retrieved from file, global index:%d, "
              "table index:%d, rows:%d, brange:%" PRId64 "-%" PRId64 ", %s",
              pReader, pScanInfo->uid, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->nRow, pBlock->minKey.ts,
              pBlock->maxKey.ts, pReader->idStr);
  }

  // an error recorded on the reader takes precedence over the local result
  return (pReader->code != TSDB_CODE_SUCCESS) ? pReader->code : code;
}

D
dapan1121 已提交
3339
// Add the row counts of all data blocks in the current data file that belong to the queried tables to
// pReader->rowsNum. Only block-index entries whose suid matches pReader->suid and whose uid is present in
// the reader's table map are counted.
//
// Returns TSDB_CODE_SUCCESS, or the error reported by tsdbCacheGetBlockIdx()/tsdbReadDataBlk(). A failed
// tsdbReadDataBlk() stops the summation instead of counting from a map that was not loaded.
static int32_t doSumFileBlockRows(STsdbReader* pReader, SDataFReader* pFileReader) {
  LRUHandle* handle = NULL;
  int32_t    code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
  if (code != TSDB_CODE_SUCCESS || handle == NULL) {
    goto _end;
  }

  SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
  size_t  num = taosArrayGetSize(aBlockIdx);

  for (size_t i = 0; i < num; ++i) {
    SBlockIdx* pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);
    if (pBlockIdx->suid != pReader->suid) {
      continue;
    }

    // the table map stores pointers to the scan info
    STableBlockScanInfo** p = tSimpleHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(pBlockIdx->uid));
    if (p == NULL) {
      continue;
    }

    STableBlockScanInfo* pScanInfo = *p;
    tMapDataReset(&pScanInfo->mapData);
    code = tsdbReadDataBlk(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
    if (code != TSDB_CODE_SUCCESS) {
      goto _end;
    }

    SDataBlk block = {0};
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      tGetDataBlk(pScanInfo->mapData.pData + pScanInfo->mapData.aOffset[j], &block);
      pReader->rowsNum += block.nRow;
    }
  }

_end:
  tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
  return code;
}

D
dapan1121 已提交
3384
// Add the row counts of the stt blocks in every stt file of the current fileset whose suid equals
// pReader->suid to pReader->rowsNum. Returns the first non-zero code of tsdbReadSttBlk(), if any.
static int32_t doSumSttBlockRows(STsdbReader* pReader) {
  int32_t           code = TSDB_CODE_SUCCESS;
  SLastBlockReader* pLastBlockReader = pReader->status.fileIter.pLastBlockReader;

  for (int32_t i = 0; i < pReader->pFileReader->pSet->nSttF; ++i) {  // open all last file
    SSttBlockLoadInfo* pLoadInfo = &pLastBlockReader->pInfo[i];

    code = tsdbReadSttBlk(pReader->pFileReader, i, pLoadInfo->aSttBlk);
    if (code) {
      return code;
    }

    size_t numOfSttBlk = taosArrayGetSize(pLoadInfo->aSttBlk);
    if (numOfSttBlk < 1) {
      continue;
    }

    SSttBlk* pFirst = taosArrayGet(pLoadInfo->aSttBlk, 0);
    SSttBlk* pLast = taosArrayGet(pLoadInfo->aSttBlk, numOfSttBlk - 1);

    if (pFirst->suid == pLast->suid) {
      // first and last entry share one suid: the file is treated as holding that suid only
      if (pFirst->suid != pReader->suid) {
        // no qualified stt block existed
        taosArrayClear(pLoadInfo->aSttBlk);
        continue;
      }

      for (int32_t j = 0; j < numOfSttBlk; ++j) {
        SSttBlk* pSttBlk = taosArrayGet(pLoadInfo->aSttBlk, j);
        pReader->rowsNum += pSttBlk->nRow;
      }

      continue;
    }

    // several suids in this file: skip the smaller ones, count the matching ones, stop at the first larger one
    for (int32_t j = 0; j < numOfSttBlk; ++j) {
      SSttBlk* pSttBlk = taosArrayGet(pLoadInfo->aSttBlk, j);
      uint64_t suid = pSttBlk->suid;

      if (suid < pReader->suid) {
        continue;
      }

      if (suid > pReader->suid) {
        break;
      }

      pReader->rowsNum += pSttBlk->nRow;
    }
  }

  return code;
}

D
dapan1121 已提交
3434
// Iterate over every remaining fileset and accumulate the row counts of its data blocks and stt blocks into
// pReader->rowsNum. Once all filesets are visited, loadFromFile is cleared. The first error aborts the walk.
static int32_t readRowsCountFromFiles(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  for (;;) {
    bool hasMoreFiles = false;

    code = filesetIteratorNext(&pReader->status.fileIter, pReader, &hasMoreFiles);
    if (code) {
      return code;
    }

    if (!hasMoreFiles) {  // no data files on disk
      break;
    }

    code = doSumFileBlockRows(pReader, pReader->pFileReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    code = doSumSttBlockRows(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }

  pReader->status.loadFromFile = false;

  return code;
}

D
dapan1121 已提交
3464
// Add the number of rows counted by tsdbMemTableCountRows() in the mem and imem tables of the read snapshot
// (whichever of them exist) to pReader->rowsNum. Always returns TSDB_CODE_SUCCESS.
static int32_t readRowsCountFromMem(STsdbReader* pReader) {
  int64_t rowsInMem = 0;
  int64_t rowsInIMem = 0;

  if (pReader->pReadSnap->pMem != NULL) {
    tsdbMemTableCountRows(pReader->pReadSnap->pMem, pReader->status.pTableMap, &rowsInMem);
  }

  if (pReader->pReadSnap->pIMem != NULL) {
    tsdbMemTableCountRows(pReader->pReadSnap->pIMem, pReader->status.pTableMap, &rowsInIMem);
  }

  pReader->rowsNum += rowsInMem + rowsInIMem;

  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
3480
// Walk the tables starting at the current table iterator and fill the result block from the in-memory
// buffers. Returns as soon as one table produced rows, when the table list is exhausted, or on error
// (pReader->code or the code of buildDataBlockFromBuf()).
// Tables listed in pReader->pIgnoreTables are skipped; the check is repeated for every table reached, so
// several consecutive ignored tables are all skipped.
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;
  STableUidList* pUidList = &pStatus->uidList;

  while (1) {
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      return pReader->code;
    }

    STableBlockScanInfo** pBlockScanInfo = pStatus->pTableIter;
    if (pReader->pIgnoreTables &&
        taosHashGet(*pReader->pIgnoreTables, &(*pBlockScanInfo)->uid, sizeof((*pBlockScanInfo)->uid))) {
      bool hasNexTable = moveToNextTable(pUidList, pStatus);
      if (!hasNexTable) {
        return TSDB_CODE_SUCCESS;
      }

      // re-run the checks for the table we just moved to; it may be ignored as well
      continue;
    }

    initMemDataIterator(*pBlockScanInfo, pReader);

    // no file block bounds the buffer scan here, so use the extreme key of the scan direction
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? INT64_MAX : INT64_MIN;
    int32_t code = buildDataBlockFromBuf(pReader, *pBlockScanInfo, endKey);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->resBlockInfo.pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }

    // current table is exhausted, let's try next table
    bool hasNexTable = moveToNextTable(pUidList, pStatus);
    if (!hasNexTable) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

3519
// set the correct start position in case of the first/last file block, according to the query time window
3520
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
3521 3522 3523 3524
  int64_t             lastKey = ASCENDING_TRAVERSE(pReader->order) ? INT64_MIN : INT64_MAX;
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  if (pBlockInfo) {
H
Haojun Liao 已提交
3525
    STableBlockScanInfo* pScanInfo = tSimpleHashGet(pBlockIter->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
3526 3527 3528
    if (pScanInfo) {
      lastKey = pScanInfo->lastKey;
    }
3529
  }
3530 3531 3532
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
3533 3534 3535

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
3536
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
3537
  pDumpInfo->lastKey = lastKey;
3538 3539
}

3540
// Move to the next fileset that has data and prepare the block iterator and the dump info for its first
// block. When no fileset with data is left, loadFromFile is cleared so the caller switches to the buffer.
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
  SBlockNumber num = {0};
  SArray*      pTableList = taosArrayInit(40, POINTER_BYTES);

  int32_t code = moveToNextFile(pReader, &num, pTableList);
  if (code == TSDB_CODE_SUCCESS) {
    if (num.numOfBlocks + num.numOfLastFiles == 0) {
      // all data files are consumed, try data in buffer
      pReader->status.loadFromFile = false;
    } else {
      // initialize the block iterator for a new fileset
      if (num.numOfBlocks > 0) {
        code = initBlockIterator(pReader, pBlockIter, num.numOfBlocks, pTableList);
      } else {  // no block data, only last block exists
        tBlockDataReset(&pReader->status.fileBlockData);
        resetDataBlockIterator(pBlockIter, pReader->order);
        resetTableListIndex(&pReader->status);
      }

      // set the correct start position according to the query time window
      initBlockDumpInfo(pReader, pBlockIter);
    }
  }

  taosArrayDestroy(pTableList);
  return code;
}

3572
// A file block counts as partially read when it is not fully dumped yet and the row cursor has already
// left its starting row: past row 0 for an ascending scan, before the last row for a descending scan.
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
  if (pDumpInfo->allDumped) {
    return false;
  }

  if (asc) {
    return pDumpInfo->rowIndex > 0;
  }

  return pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1);
}

3577 3578 3579 3580
// Result of doReadDataFromLastFiles(), telling buildBlockFromFiles() how to go on.
typedef enum {
  // stop here: an error occurred, the result block has rows, or all data files have been checked;
  // buildBlockFromFiles() returns terrno in this case
  TSDB_READ_RETURN = 0x1,
  // the next fileset has data blocks; buildBlockFromFiles() goes on to build a block from them
  TSDB_READ_CONTINUE = 0x2,
} ERetrieveType;
3581

3582 3583 3584
// Read rows from the last files, moving on to following filesets while the current one yields nothing.
// terrno is cleared at the start of every round and set to the error code on failure.
// Returns TSDB_READ_CONTINUE only when a following fileset contains data blocks; in every other case
// (error, rows available, files exhausted) TSDB_READ_RETURN is returned.
static ERetrieveType doReadDataFromLastFiles(STsdbReader* pReader) {
  SSDataBlock*    pResBlock = pReader->resBlockInfo.pResBlock;
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;
  int32_t         code = TSDB_CODE_SUCCESS;

  for (;;) {
    terrno = 0;

    code = doLoadLastBlockSequentially(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      terrno = code;
      return TSDB_READ_RETURN;
    }

    if (pResBlock->info.rows > 0) {
      return TSDB_READ_RETURN;
    }

    // all data blocks are checked in this last block file, now let's try the next file
    ASSERT(pReader->status.pTableIter == NULL);
    code = initForFirstBlockInFile(pReader, pBlockIter);

    // error happens or all the data files are completely checked
    if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
      terrno = code;
      return TSDB_READ_RETURN;
    }

    // there are data blocks existed.
    if (pBlockIter->numOfBlocks > 0) {
      return TSDB_READ_CONTINUE;
    }

    // all blocks in data file are checked, let's check the data in last files
    resetTableListIndex(&pReader->status);
  }
}
3617

3618 3619 3620 3621 3622 3623 3624 3625
// Produce the next result block from the data files and last files. Loops over file blocks (and filesets)
// until a block with rows has been built, an error occurs, or the files are exhausted.
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

  SDataBlockIter*     pBlockIter = &pReader->status.blockIter;
  SSDataBlock*        pResBlock = pReader->resBlockInfo.pResBlock;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  if (pBlockIter->numOfBlocks == 0) {
    // let's try to extract data from stt files.
    if (doReadDataFromLastFiles(pReader) == TSDB_READ_RETURN) {
      return terrno;
    }

    code = doBuildDataBlock(pReader);
    if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
      return code;
    }
  }

  for (;;) {
    if (fileBlockPartiallyRead(pDumpInfo, asc)) {  // file data block is partially loaded
      code = buildComposedDataBlock(pReader);
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        // try next data block in current file
        if (blockIteratorNext(&pReader->status.blockIter, pReader->idStr)) {
          // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
        } else {
          // all data blocks in files are checked, let's check the data in last files.
          ASSERT(pReader->status.pCurrentFileset->nSttF > 0);

          // data blocks in current file are exhausted, let's try the next file now
          SBlockData* pBlockData = &pReader->status.fileBlockData;
          if (pBlockData->uid != 0) {
            tBlockDataClear(pBlockData);
          }

          tBlockDataReset(pBlockData);
          resetDataBlockIterator(pBlockIter, pReader->order);
          resetTableListIndex(&pReader->status);

          if (doReadDataFromLastFiles(pReader) == TSDB_READ_RETURN) {
            return terrno;
          }
        }
      }

      code = doBuildDataBlock(pReader);
    }

    if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
      return code;
    }
  }
}
H
refact  
Hongze Cheng 已提交
3679

3680 3681
// Pick the tsdb instance to query. For a non-rsma vnode this is VND_TSDB(pVnode) and *pLevel is left
// untouched. For an rsma vnode the retention levels are walked from level 0 upwards until one either has no
// keep value or still covers the window start (winSKey plus the rsma tolerance); the selected level is
// stored in *pLevel and the matching rsma tsdb is returned.
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
  if (!VND_IS_RSMA(pVnode)) {
    return VND_TSDB(pVnode);
  }

  int8_t  level = 0;
  int8_t  precision = pVnode->config.tsdbCfg.precision;
  int64_t now = taosGetTimestamp(precision);

  // scale the tolerance from milliseconds to the precision of the vnode
  long unit = 1000000L;
  if (precision == TSDB_TIME_PRECISION_MILLI) {
    unit = 1L;
  } else if (precision == TSDB_TIME_PRECISION_MICRO) {
    unit = 1000L;
  }
  int64_t offset = tsQueryRsmaTolerance * unit;

  for (int8_t i = 0; i < TSDB_RETENTION_MAX; ++i) {
    SRetention* pRetention = retentions + level;

    if (pRetention->keep <= 0) {
      // this level is not configured, fall back to the previous one if there is any
      if (level > 0) {
        --level;
      }
      break;
    }

    if ((now - pRetention->keep) <= (winSKey + offset)) {
      break;
    }

    ++level;
  }

  const char* str = (idStr != NULL) ? idStr : "";

  switch (level) {
    case TSDB_RETENTION_L0:
      *pLevel = TSDB_RETENTION_L0;
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L0, str);
      return VND_RSMA0(pVnode);

    case TSDB_RETENTION_L1:
      *pLevel = TSDB_RETENTION_L1;
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L1, str);
      return VND_RSMA1(pVnode);

    default:
      *pLevel = TSDB_RETENTION_L2;
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L2, str);
      return VND_RSMA2(pVnode);
  }
}

H
Haojun Liao 已提交
3724
// Version range of a query: a start version of -1 maps to 0; an end version of -1, or one beyond the
// applied version of the vnode, maps to the applied version. The level argument is not used.
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
  int64_t startVer = pCond->startVersion;
  if (pCond->startVersion == -1) {
    startVer = 0;
  }

  // user not specified end version, set current maximum version of vnode as the endVersion
  int64_t endVer = pVnode->state.applied;
  if (pCond->endVersion != -1 && !(pCond->endVersion > pVnode->state.applied)) {
    endVer = pCond->endVersion;
  }

  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
}

3738
/**
 * Decide whether the row (key, ver) is covered by a delete range stored in pDelList.
 *
 * pDelList holds TSDBKEY entries (ts, version); callers pass a table's delSkyline here.
 * `*index` is a cursor into the list that is advanced (ascending order) or moved back
 * (descending order) while searching, so consecutive calls continue where the last one stopped.
 *
 * @param pDelList   list of TSDBKEY points, may be NULL or empty (nothing is dropped then)
 * @param index      in/out cursor into pDelList
 * @param key        timestamp of the row
 * @param ver        version of the row
 * @param order      traverse order (TSDB_ORDER_ASC or descending)
 * @param pVerRange  version range of the query; consulted in the ascending branch only
 * @return true when a matching delete range with version >= ver is found
 */
bool hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
                    SVersionRange* pVerRange) {
  if (pDelList == NULL) {
    return false;
  }

  size_t num = taosArrayGetSize(pDelList);
  if (num == 0) {
    // an empty list would make "num - 1" wrap around below and lead to reads of missing elements
    return false;
  }

  bool    asc = ASCENDING_TRAVERSE(order);
  int32_t step = asc ? 1 : -1;

  if (asc) {
    if (*index >= num - 1) {
      TSDBKEY* last = taosArrayGetLast(pDelList);
      ASSERT(key >= last->ts);

      if (key > last->ts) {
        return false;
      } else if (key == last->ts) {
        if (num < 2) {
          // no previous point exists, "num - 2" would be out of bounds
          return false;
        }

        TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
        return (prev->version >= ver && prev->version <= pVerRange->maxVer &&
                prev->version >= pVerRange->minVer);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);

      if (key < pCurrent->ts) {
        return false;
      }

      if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
          pVerRange->maxVer >= pCurrent->version) {
        return true;
      }

      // move the cursor forward until the pair [pCurrent, pNext] can contain the key
      while (pNext->ts <= key && (*index) < num - 1) {
        (*index) += 1;

        if ((*index) < num - 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pNext = taosArrayGet(pDelList, (*index) + 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version == 0 && pNext->version > 0) {
            continue;
          }

          if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
              pVerRange->maxVer >= pCurrent->version) {
            return true;
          }
        }
      }

      return false;
    }
  } else {
    if (*index <= 0) {
      TSDBKEY* pFirst = taosArrayGet(pDelList, 0);

      if (key < pFirst->ts) {
        return false;
      } else if (key == pFirst->ts) {
        return pFirst->version >= ver;
      } else {
        ASSERT(0);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);

      if (key > pCurrent->ts) {
        return false;
      }

      if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
        return true;
      }

      // move the cursor backward until the pair [pPrev, pCurrent] can contain the key
      while (pPrev->ts >= key && (*index) > 1) {
        (*index) += step;

        if ((*index) >= 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pPrev = taosArrayGet(pDelList, (*index) - 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version > 0 && pPrev->version == 0) {
            continue;
          }

          if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
            return true;
          }
        }
      }

      return false;
    }
  }

  return false;
}

3841
/**
 * Return the first row at or after the iterator's current position that the reader may see.
 *
 * A row is returned when its version lies inside pReader->verRange and hasBeenDropped()
 * reports that it is not deleted. Rows failing that test are skipped by advancing the iterator.
 *
 * Returns NULL when the iterator has no value, when it runs out of rows, or when a row falls
 * outside pReader->window; in the latter case pIter->hasVal is also set to false.
 */
TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
  if (!pIter->hasVal) {
    return NULL;
  }

  TSDBROW* pCandidate = tsdbTbDataIterGet(pIter->iter);

  for (;;) {
    TSDBKEY rowKey = TSDBROW_KEY(pCandidate);

    // leaving the query time window terminates the scan of this iterator
    if (outOfTimeWindow(rowKey.ts, &pReader->window)) {
      pIter->hasVal = false;
      return NULL;
    }

    // it is a valid data version
    if ((rowKey.version <= pReader->verRange.maxVer && rowKey.version >= pReader->verRange.minVer) &&
        (!hasBeenDropped(pDelList, &pIter->index, rowKey.ts, rowKey.version, pReader->order, &pReader->verRange))) {
      return pCandidate;
    }

    // current row is filtered out, try the next one
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      return NULL;
    }

    pCandidate = tsdbTbDataIterGet(pIter->iter);
  }
}
H
Hongze Cheng 已提交
3880

H
Haojun Liao 已提交
3881 3882 3883
/**
 * Feed every following valid row of the iterator whose timestamp equals `ts` into the
 * reader's row merger (pReader->status.merger).
 *
 * The iterator is advanced first, so the row it points to on entry is not added again.
 * The loop stops when the iterator is exhausted, no valid row is left, or a row with a
 * different timestamp is reached.
 *
 * @return TSDB_CODE_SUCCESS, terrno when the schema of a row cannot be obtained, or the
 *         error reported by tsdbRowMergerAdd.
 */
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, STsdbReader* pReader) {
  SRowMerger* pMerger = &pReader->status.merger;

  while (1) {
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      break;
    }

    // data exists but not valid
    TSDBROW* pRow = getValidMemRow(pIter, pDelList, pReader);
    if (pRow == NULL) {
      break;
    }

    // ts is not identical, quit
    TSDBKEY k = TSDBROW_KEY(pRow);
    if (k.ts != ts) {
      break;
    }

    // row-format data needs the schema of its own schema version; column format passes NULL
    STSchema* pTSchema = NULL;
    if (pRow->type == TSDBROW_ROW_FMT) {
      pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, uid);
      if (pTSchema == NULL) {
        return terrno;
      }
    }

    // do not continue merging on a failed add, report it to the caller
    int32_t code = tsdbRowMergerAdd(pMerger, pRow, pTSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }

  return TSDB_CODE_SUCCESS;
}

3917
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
3918
                                          SVersionRange* pVerRange, int32_t step) {
3919
  while (rowIndex < pBlockData->nRow && rowIndex >= 0 && pBlockData->aTSKEY[rowIndex] == key) {
3920
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
3921
      rowIndex += step;
3922 3923 3924 3925
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
3926
    tsdbRowMergerAdd(pMerger, &fRow, NULL);
3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937
    rowIndex += step;
  }

  return rowIndex;
}

// Result of checking the neighboring file block while merging rows with an identical timestamp.
typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,  // all rows of the loaded block were consumed, keep checking the next block
  CHECK_FILEBLOCK_QUIT = 0x2,  // stop checking neighboring blocks
} CHECK_FILEBLOCK_STATE;

H
Hongze Cheng 已提交
3938
static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SDataBlk* pBlock,
3939 3940
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
3941
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
3942
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
3943
  bool                asc = ASCENDING_TRAVERSE(pReader->order);
3944

3945
  *state = CHECK_FILEBLOCK_QUIT;
3946
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
3947

3948
  bool    loadNeighbor = true;
H
Haojun Liao 已提交
3949
  int32_t code = loadNeighborIfOverlap(pFBlock, pScanInfo, pReader, &loadNeighbor);
3950

H
Haojun Liao 已提交
3951
  if (loadNeighbor && (code == TSDB_CODE_SUCCESS)) {
3952 3953
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
3954
    if ((pDumpInfo->rowIndex >= pDumpInfo->totalRows && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
3955 3956 3957 3958
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

H
Haojun Liao 已提交
3959
  return code;
3960 3961
}

H
Haojun Liao 已提交
3962
/**
 * Merge all rows that share the timestamp of the current row (pDumpInfo->rowIndex) of the file
 * block into the reader's row merger.
 *
 * The current row itself is skipped (the index is advanced first). When the block is consumed
 * completely, the neighboring file blocks are checked as long as checkForNeighborFileBlock()
 * reports CHECK_FILEBLOCK_CONT.
 *
 * @return TSDB_CODE_SUCCESS, or the error reported while checking a neighboring block.
 */
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  SRowMerger* pMerger = &pReader->status.merger;
  bool        asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t     key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  int32_t     step = asc ? 1 : -1;
  int32_t     code = TSDB_CODE_SUCCESS;

  pDumpInfo->rowIndex += step;
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }

  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
      if (pFileBlockInfo == NULL) {
        break;
      }

      // only fetch the block once the block info is known to exist
      SDataBlk*             pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);
      CHECK_FILEBLOCK_STATE st = CHECK_FILEBLOCK_QUIT;

      code = checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (code != TSDB_CODE_SUCCESS || st == CHECK_FILEBLOCK_QUIT) {
        break;
      }
    }
  }

  // report a failure of the neighbor check instead of silently returning success
  return code;
}

H
Hongze Cheng 已提交
3998
int32_t doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
3999
                               SRowMerger* pMerger, SVersionRange* pVerRange, const char* idStr) {
4000
  while (nextRowFromLastBlocks(pLastBlockReader, pScanInfo, pVerRange)) {
4001 4002
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 == ts) {
4003 4004
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      tsdbRowMergerAdd(pMerger, pRow1, NULL);
4005
    } else {
4006 4007 4008
      tsdbTrace("uid:%" PRIu64 " last del index:%d, del range:%d, lastKeyInStt:%" PRId64 ", %s", pScanInfo->uid,
                pScanInfo->lastBlockDelIndex, (int32_t)taosArrayGetSize(pScanInfo->delSkyline), pScanInfo->lastKeyInStt,
                idStr);
4009 4010 4011 4012 4013 4014 4015
      break;
    }
  }

  return TSDB_CODE_SUCCESS;
}

4016
int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, TSDBROW* pResRow,
4017
                                 STsdbReader* pReader, bool* freeTSRow) {
H
Haojun Liao 已提交
4018
  TSDBROW* pNextRow = NULL;
4019
  TSDBROW  current = *pRow;
4020

4021 4022
  {  // if the timestamp of the next valid row has a different ts, return current row directly
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
4023

4024
    if (!pIter->hasVal) {
4025
      *pResRow = *pRow;
4026
      *freeTSRow = false;
4027
      return TSDB_CODE_SUCCESS;
4028
    } else {  // has next point in mem/imem
4029
      pNextRow = getValidMemRow(pIter, pDelList, pReader);
4030
      if (pNextRow == NULL) {
H
Haojun Liao 已提交
4031
        *pResRow = current;
4032
        *freeTSRow = false;
4033
        return TSDB_CODE_SUCCESS;
4034 4035
      }

H
Hongze Cheng 已提交
4036
      if (TSDBROW_TS(&current) != TSDBROW_TS(pNextRow)) {
H
Haojun Liao 已提交
4037
        *pResRow = current;
4038
        *freeTSRow = false;
4039
        return TSDB_CODE_SUCCESS;
4040
      }
4041
    }
4042 4043
  }

H
Haojun Liao 已提交
4044
  terrno = 0;
4045
  int32_t code = 0;
H
Haojun Liao 已提交
4046

4047 4048 4049 4050 4051 4052 4053
  // start to merge duplicated rows
  if (current.type == TSDBROW_ROW_FMT) {
    // get the correct schema for data in memory
    STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(&current), pReader, uid);
    if (pTSchema == NULL) {
      return terrno;
    }
H
Haojun Liao 已提交
4054

H
Haojun Liao 已提交
4055
    code = tsdbRowMergerAdd(&pReader->status.merger, &current, pTSchema);
4056 4057 4058
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
H
Haojun Liao 已提交
4059

4060 4061 4062 4063
    STSchema* pTSchema1 = doGetSchemaForTSRow(TSDBROW_SVERSION(pNextRow), pReader, uid);
    if (pTSchema1 == NULL) {
      return terrno;
    }
H
Haojun Liao 已提交
4064

H
Haojun Liao 已提交
4065
    tsdbRowMergerAdd(&pReader->status.merger,pNextRow, pTSchema1);
4066
  } else {  // let's merge rows in file block
H
Haojun Liao 已提交
4067
    code = tsdbRowMergerAdd(&pReader->status.merger, &current, pReader->pSchema);
4068 4069 4070
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
H
Haojun Liao 已提交
4071

H
Haojun Liao 已提交
4072
    tsdbRowMergerAdd(&pReader->status.merger,pNextRow, NULL);
4073
  }
H
Haojun Liao 已提交
4074

H
Haojun Liao 已提交
4075
  code = doMergeRowsInBuf(pIter, uid, TSDBROW_TS(&current), pDelList, pReader);
H
Haojun Liao 已提交
4076 4077 4078 4079
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

H
Haojun Liao 已提交
4080
  code = tsdbRowMergerGetRow(&pReader->status.merger, &pResRow->pTSRow);
4081 4082 4083
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
M
Minglei Jin 已提交
4084

wmmhello's avatar
wmmhello 已提交
4085
  pResRow->type = TSDBROW_ROW_FMT;
4086
  tsdbRowMergerClear(&pReader->status.merger);
4087
  *freeTSRow = true;
4088

4089
  return TSDB_CODE_SUCCESS;
4090 4091
}

4092
int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
H
Hongze Cheng 已提交
4093
                           SRow** pTSRow) {
H
Haojun Liao 已提交
4094
  SRowMerger* pMerger = &pReader->status.merger;
H
Haojun Liao 已提交
4095

4096 4097 4098
  TSDBKEY   k = TSDBROW_KEY(pRow);
  TSDBKEY   ik = TSDBROW_KEY(piRow);
  STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
4099 4100 4101 4102
  if (pSchema == NULL) {
    return terrno;
  }

4103
  STSchema* piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
4104 4105 4106
  if (piSchema == NULL) {
    return terrno;
  }
4107

4108
  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
H
Haojun Liao 已提交
4109
    int32_t code = tsdbRowMergerAdd(&pReader->status.merger, piRow, piSchema);
H
Haojun Liao 已提交
4110 4111 4112 4113
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

H
Haojun Liao 已提交
4114
    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4115 4116 4117
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4118

H
Haojun Liao 已提交
4119
    tsdbRowMergerAdd(&pReader->status.merger,pRow, pSchema);
H
Haojun Liao 已提交
4120
    code =
H
Haojun Liao 已提交
4121
        doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4122 4123 4124 4125
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

4126
  } else {
H
Haojun Liao 已提交
4127 4128
    int32_t code = tsdbRowMergerAdd(&pReader->status.merger, pRow, pSchema);
    if (code != TSDB_CODE_SUCCESS || pMerger->pTSchema == NULL) {
H
Haojun Liao 已提交
4129 4130 4131
      return code;
    }

H
Haojun Liao 已提交
4132
    code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4133 4134 4135
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4136

H
Haojun Liao 已提交
4137 4138
    tsdbRowMergerAdd(&pReader->status.merger, piRow, piSchema);
    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4139 4140 4141
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4142
  }
4143

H
Haojun Liao 已提交
4144
  int32_t code = tsdbRowMergerGetRow(pMerger, pTSRow);
4145
  tsdbRowMergerClear(pMerger);
4146
  return code;
4147 4148
}

4149
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, TSDBROW* pResRow, int64_t endKey,
4150
                            bool* freeTSRow) {
4151 4152
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
dengyihao's avatar
dengyihao 已提交
4153
  SArray*  pDelList = pBlockScanInfo->delSkyline;
4154
  uint64_t uid = pBlockScanInfo->uid;
H
Haojun Liao 已提交
4155

4156 4157
  // todo refactor
  bool asc = ASCENDING_TRAVERSE(pReader->order);
4158
  if (pBlockScanInfo->iter.hasVal) {
4159 4160 4161 4162 4163 4164
    TSDBKEY k = TSDBROW_KEY(pRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      pRow = NULL;
    }
  }

4165
  if (pBlockScanInfo->iiter.hasVal) {
4166 4167 4168 4169 4170 4171
    TSDBKEY k = TSDBROW_KEY(piRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      piRow = NULL;
    }
  }

4172
  if (pBlockScanInfo->iter.hasVal && pBlockScanInfo->iiter.hasVal && pRow != NULL && piRow != NULL) {
4173
    TSDBKEY k = TSDBROW_KEY(pRow);
4174
    TSDBKEY ik = TSDBROW_KEY(piRow);
H
Haojun Liao 已提交
4175

4176
    int32_t code = TSDB_CODE_SUCCESS;
4177 4178
    if (ik.ts != k.ts) {
      if (((ik.ts < k.ts) && asc) || ((ik.ts > k.ts) && (!asc))) {  // ik.ts < k.ts
4179
        code = doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pResRow, pReader, freeTSRow);
4180
      } else if (((k.ts < ik.ts) && asc) || ((k.ts > ik.ts) && (!asc))) {
4181
        code = doMergeMemTableMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, pResRow, pReader, freeTSRow);
4182
      }
4183
    } else {  // ik.ts == k.ts
4184
      *freeTSRow = true;
4185 4186
      pResRow->type = TSDBROW_ROW_FMT;
      code = doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, &pResRow->pTSRow);
4187 4188 4189
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
H
Haojun Liao 已提交
4190
    }
4191

4192
    return code;
H
Haojun Liao 已提交
4193 4194
  }

4195
  if (pBlockScanInfo->iter.hasVal && pRow != NULL) {
4196
    return doMergeMemTableMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pResRow, pReader,
H
Hongze Cheng 已提交
4197
                                    freeTSRow);
H
Haojun Liao 已提交
4198 4199
  }

4200
  if (pBlockScanInfo->iiter.hasVal && piRow != NULL) {
4201
    return doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pResRow, pReader, freeTSRow);
H
Haojun Liao 已提交
4202 4203 4204 4205 4206
  }

  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
4207
/**
 * Append one row-format row (pTSRow) to the result block.
 *
 * The requested columns (pReader->suppInfo) and the columns of the row's schema are walked in
 * parallel by column id; requested columns that the schema does not contain are set to NULL.
 * On success the row count of the block is increased and pScanInfo->lastKey is updated.
 *
 * @return TSDB_CODE_SUCCESS, terrno when the schema cannot be obtained, or the error of doCopyColVal.
 */
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow, STableBlockScanInfo* pScanInfo) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  int32_t             dstRow = pBlock->info.rows;
  int64_t             uid = pScanInfo->uid;

  STSchema* pSchema = doGetSchemaForTSRow(pTSRow->sver, pReader, uid);
  if (pSchema == NULL) {
    return terrno;
  }

  SColVal colVal = {0};
  int32_t reqIdx = 0;     // position in the requested column list
  int32_t schemaIdx = 0;  // position in the schema column list

  // the primary timestamp column is filled from the row directly
  if (pSupInfo->colId[reqIdx] == PRIMARYKEY_TIMESTAMP_COL_ID) {
    SColumnInfoData* pTsCol = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[reqIdx]);
    ((int64_t*)pTsCol->pData)[dstRow] = pTSRow->ts;
    reqIdx += 1;
  }

  while (reqIdx < pSupInfo->numOfCols && schemaIdx < pSchema->numOfCols) {
    col_id_t colId = pSupInfo->colId[reqIdx];

    // schema column is not requested, skip it
    if (colId > pSchema->columns[schemaIdx].colId) {
      schemaIdx += 1;
      continue;
    }

    SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[reqIdx]);

    if (colId == pSchema->columns[schemaIdx].colId) {
      tRowGet(pTSRow, pSchema, schemaIdx, &colVal);
      int32_t code = doCopyColVal(pColInfoData, dstRow, reqIdx, &colVal, pSupInfo);
      if (code) {
        return code;
      }
      schemaIdx += 1;
    } else {
      // requested column is missing in the schema
      colDataSetNULL(pColInfoData, dstRow);
    }

    reqIdx += 1;
  }

  // set null value since current column does not exist in the "pSchema"
  for (; reqIdx < pSupInfo->numOfCols; reqIdx += 1) {
    SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[reqIdx]);
    colDataSetNULL(pColInfoData, dstRow);
  }

  pBlock->info.dataLoad = 1;
  pBlock->info.rows += 1;
  pScanInfo->lastKey = pTSRow->ts;
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
4263 4264
/**
 * Append row `rowIndex` of a column-format block (pBlockData) to the result block.
 *
 * The timestamp is written to pReader->status.pPrimaryTsCol; the remaining requested columns
 * are matched against the block columns by column id. Requested columns left over after the
 * block columns are exhausted are set to NULL.
 *
 * @return TSDB_CODE_SUCCESS or the error of doCopyColVal.
 */
int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
                                 int32_t rowIndex) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  int32_t             dstRow = pResBlock->info.rows;

  // slot 0 of the requested columns is the timestamp
  ((int64_t*)pReader->status.pPrimaryTsCol->pData)[dstRow] = pBlockData->aTSKEY[rowIndex];

  SColVal cv = {0};
  int32_t numOfInputCols = pBlockData->nColData;
  int32_t numOfOutputCols = pSupInfo->numOfCols;

  int32_t outIdx = 1;
  int32_t inIdx = 0;

  while (outIdx < numOfOutputCols && inIdx < numOfInputCols) {
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, inIdx);

    // block column is not requested, move on to the next block column
    if (pData->cid < pSupInfo->colId[outIdx]) {
      inIdx += 1;
      continue;
    }

    SColumnInfoData* pCol = TARRAY_GET_ELEM(pResBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    if (pData->cid == pSupInfo->colId[outIdx]) {
      tColDataGetValue(pData, rowIndex, &cv);
      int32_t code = doCopyColVal(pCol, dstRow, outIdx, &cv, pSupInfo);
      if (code) {
        return code;
      }
      inIdx += 1;
    } else if (pData->cid > pCol->info.colId) {
      // the specified column does not exist in file block, fill with null data
      colDataSetNULL(pCol, dstRow);
    }

    outIdx += 1;
  }

  for (; outIdx < numOfOutputCols; outIdx += 1) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    colDataSetNULL(pCol, dstRow);
  }

  pResBlock->info.dataLoad = 1;
  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

4311 4312
/**
 * Fill the reader's result block with rows taken from the mem/imem buffers of one table.
 *
 * Rows are fetched with tsdbGetNextRowInMem() until no row is delivered, both iterators are
 * exhausted, or the block holds `capacity` rows.
 *
 * @param pBlockScanInfo  scan state of the table
 * @param endKey          rows at or beyond this key (in traverse direction) are not read
 * @param capacity        maximum number of rows in the result block
 * @param pReader         reader owning the result block
 * @return TSDB_CODE_SUCCESS, or the first error reported while fetching or appending a row.
 */
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
  SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
  int32_t      code = TSDB_CODE_SUCCESS;

  do {
    TSDBROW row = {.type = -1};  // type stays -1 when no row is delivered
    bool    freeTSRow = false;

    // a failed fetch must not be followed by an append: the row may carry a type but no data
    code = tsdbGetNextRowInMem(pBlockScanInfo, pReader, &row, endKey, &freeTSRow);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (row.type == -1) {
      break;
    }

    if (row.type == TSDBROW_ROW_FMT) {
      code = doAppendRowFromTSRow(pBlock, pReader, row.pTSRow, pBlockScanInfo);

      // a merged row is owned by this function and released once it has been copied
      if (freeTSRow) {
        taosMemoryFree(row.pTSRow);
      }

      if (code) {
        return code;
      }
    } else {
      code = doAppendRowFromFileBlock(pBlock, pReader, row.pBlockData, row.iRow);
      if (code) {
        break;
      }
    }

    // no data in buffer, return immediately
    if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
      break;
    }

    if (pBlock->info.rows >= capacity) {
      break;
    }
  } while (1);

  return code;
}
H
Hongze Cheng 已提交
4354

4355 4356
// TODO refactor: with createDataBlockScanInfo
/**
 * Replace the set of tables scanned by the reader.
 *
 * The scan info of every table currently in pReader->status.pTableMap is cleared, the block
 * info buffer and the uid list are enlarged when `num` exceeds the current number of tables,
 * and the table map is rebuilt from pTableList (read as an array of STableKeyInfo).
 *
 * @param pReader     reader to update
 * @param pTableList  array of STableKeyInfo with `num` entries
 * @param num         number of tables in pTableList
 * @return TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY, or the error of a buffer/hash operation.
 */
int32_t tsdbSetTableList(STsdbReader* pReader, const void* pTableList, int32_t num) {
  int32_t size = tSimpleHashGetSize(pReader->status.pTableMap);

  STableBlockScanInfo** p = NULL;
  int32_t               iter = 0;

  while ((p = tSimpleHashIterate(pReader->status.pTableMap, p, &iter)) != NULL) {
    clearBlockScanInfo(*p);
  }

  if (size < num) {
    int32_t code = ensureBlockScanInfoBuf(&pReader->blockInfoBuf, num);
    if (code) {
      return code;
    }

    // keep the old list until the reallocation is known to have succeeded
    char* p1 = taosMemoryRealloc(pReader->status.uidList.tableUidList, sizeof(uint64_t) * num);
    if (p1 == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    pReader->status.uidList.tableUidList = (uint64_t*)p1;
  }

  tSimpleHashClear(pReader->status.pTableMap);
  STableUidList* pUidList = &pReader->status.uidList;
  pUidList->currentIndex = 0;

  STableKeyInfo* pList = (STableKeyInfo*)pTableList;
  for (int32_t i = 0; i < num; ++i) {
    STableBlockScanInfo* pInfo = getPosInBlockInfoBuf(&pReader->blockInfoBuf, i);
    pInfo->uid = pList[i].uid;
    pUidList->tableUidList[i] = pList[i].uid;

    // todo extract method
    // lastKey starts just outside of the query window, without overflowing at the int64 limits
    if (ASCENDING_TRAVERSE(pReader->order)) {
      int64_t skey = pReader->window.skey;
      pInfo->lastKey = (skey > INT64_MIN) ? (skey - 1) : skey;
      pInfo->lastKeyInStt = skey;
    } else {
      int64_t ekey = pReader->window.ekey;
      pInfo->lastKey = (ekey < INT64_MAX) ? (ekey + 1) : ekey;
      pInfo->lastKeyInStt = ekey;
    }

    // the map stores the pointer to the scan info, keyed by table uid
    int32_t code = tSimpleHashPut(pReader->status.pTableMap, &pInfo->uid, sizeof(uint64_t), &pInfo, POINTER_BYTES);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }

  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
4407 4408 4409 4410 4411 4412
// Return metaGetIdx(pMeta), or NULL when no meta handle is given.
void* tsdbGetIdx(SMeta* pMeta) {
  if (pMeta != NULL) {
    return metaGetIdx(pMeta);
  }

  return NULL;
}
dengyihao's avatar
dengyihao 已提交
4413

dengyihao's avatar
dengyihao 已提交
4414 4415 4416 4417 4418 4419
// Return metaGetIvtIdx(pMeta), or NULL when no meta handle is given.
void* tsdbGetIvtIdx(SMeta* pMeta) {
  if (pMeta != NULL) {
    return metaGetIvtIdx(pMeta);
  }

  return NULL;
}
L
Liu Jicong 已提交
4420

4421
// Return the upper bound of the reader's version range.
uint64_t tsdbGetReaderMaxVersion(STsdbReader* pReader) {
  uint64_t maxVer = pReader->verRange.maxVer;
  return maxVer;
}
4422

4423
/**
 * Prepare the file-set iterator and the data block iterator of the reader.
 *
 * Without any file, loadFromFile is switched off. In count-only read mode the first file block
 * is not initialized here; otherwise initForFirstBlockInFile() is called and its code returned.
 * The table list index is reset whenever data is not loaded from file.
 */
static int32_t doOpenReaderImpl(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;
  int32_t        code = TSDB_CODE_SUCCESS;

  initFilesetIterator(&pStatus->fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(&pStatus->blockIter, pReader->order);

  if (pStatus->fileIter.numOfFiles == 0) {
    pStatus->loadFromFile = false;
  } else if (READ_MODE_COUNT_ONLY != pReader->readMode) {
    code = initForFirstBlockInFile(pReader, &pStatus->blockIter);
  }

  if (!pStatus->loadFromFile) {
    resetTableListIndex(pStatus);
  }

  return code;
}

4446
// Free callback of the schema map: `param` points to the stored schema pointer.
static void freeSchemaFunc(void* param) {
  void** ppSchema = (void**)param;
  taosMemoryFreeClear(*ppSchema);
}

H
refact  
Hongze Cheng 已提交
4451
// ====================================== EXPOSED APIs ======================================
4452 4453
/**
 * Create a tsdb reader for the given query condition and table list.
 *
 * For TIMEWINDOW_RANGE_EXTERNAL two additional inner readers with a capacity of one row are
 * created: innerReader[0] covers the range in front of the query window (in traverse order),
 * innerReader[1] the range behind it. pCond->twindows is modified while doing so and is not
 * restored; pCond->order is restored.
 *
 * @param pVnode         vnode handle (SVnode*)
 * @param pCond          query condition
 * @param pTableList     array of STableKeyInfo
 * @param numOfTables    number of entries in pTableList
 * @param pResBlock      result block, may be NULL
 * @param ppReader       output: the created reader, NULL on failure
 * @param idstr          id string used in log messages
 * @param countOnly      switch the reader to READ_MODE_COUNT_ONLY
 * @param pIgnoreTables  stored in the reader as is
 * @return TSDB_CODE_SUCCESS or an error code; on failure the reader is closed.
 */
int32_t tsdbReaderOpen(void* pVnode, SQueryTableDataCond* pCond, void* pTableList, int32_t numOfTables,
                       SSDataBlock* pResBlock, void** ppReader, const char* idstr, bool countOnly,
                       SHashObj** pIgnoreTables) {
  STimeWindow window = pCond->twindows;
  SVnodeCfg*  pConf = &(((SVnode*)pVnode)->config);

  int32_t capacity = pConf->tsdbCfg.maxRows;
  if (pResBlock != NULL) {
    blockDataEnsureCapacity(pResBlock, capacity);
  }

  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, capacity, pResBlock, idstr);
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }

  // check for query time window
  STsdbReader* pReader = *ppReader;
  if (isEmptyQueryTimeWindow(&pReader->window) && pCond->type == TIMEWINDOW_RANGE_CONTAINED) {
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }

  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
    int32_t order = pCond->order;
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.ekey = window.skey - 1;
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
      pCond->twindows.skey = window.ekey + 1;
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

    // here we only need one more row, so the capacity is set to be ONE.
    code = tsdbReaderCreate(pVnode, pCond, (void**)&((STsdbReader*)pReader)->innerReader[0], 1, pResBlock, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.skey = window.ekey + 1;
      pCond->twindows.ekey = INT64_MAX;
    } else {
      // rows behind the window in descending order are the ones before window.skey;
      // ending at (window.ekey - 1) would overlap with the query window itself
      pCond->twindows.skey = INT64_MIN;
      pCond->twindows.ekey = window.skey - 1;
    }
    pCond->order = order;

    code = tsdbReaderCreate(pVnode, pCond, (void**)&((STsdbReader*)pReader)->innerReader[1], 1, pResBlock, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  // NOTE: the endVersion in pCond is the data version not schema version, so pCond->endVersion is not correct here.
  //  no valid error code set in metaGetTbTSchema, so let's set the error code here.
  //  we should proceed in case of tmq processing.
  if (pCond->suid != 0) {
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, -1, 1);
    if (pReader->pSchema == NULL) {
      tsdbError("failed to get table schema, suid:%" PRIu64 ", ver:-1, %s", pReader->suid, pReader->idStr);
    }
  } else if (numOfTables > 0) {
    STableKeyInfo* pKey = pTableList;
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, -1, 1);
    if (pReader->pSchema == NULL) {
      tsdbError("failed to get table schema, uid:%" PRIu64 ", ver:-1, %s", pKey->uid, pReader->idStr);
    }
  }

  if (pReader->pSchema != NULL) {
    tsdbRowMergerInit(&pReader->status.merger, pReader->pSchema);
  }

  pReader->pSchemaMap = tSimpleHashInit(8, taosFastHash);
  if (pReader->pSchemaMap == NULL) {
    tsdbError("failed init schema hash for reader %s", pReader->idStr);
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _err;
  }

  tSimpleHashSetFreeFp(pReader->pSchemaMap, freeSchemaFunc);
  if (pReader->pSchema != NULL) {
    code = updateBlockSMAInfo(pReader->pSchema, &pReader->suppInfo);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  // the scan info is built for the window of the first inner reader when one exists
  STsdbReader* p = (pReader->innerReader[0] != NULL) ? pReader->innerReader[0] : pReader;
  pReader->status.pTableMap =
      createDataBlockScanInfo(p, &pReader->blockInfoBuf, pTableList, &pReader->status.uidList, numOfTables);
  if (pReader->status.pTableMap == NULL) {
    // *ppReader is left intact here so that the reader is released in _err
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _err;
  }

  pReader->status.pLDataIter = taosMemoryCalloc(pConf->sttTrigger, sizeof(SLDataIter));
  if (pReader->status.pLDataIter == NULL) {
    // the failure has to be visible in the return value, not only in terrno
    code = TSDB_CODE_OUT_OF_MEMORY;
    terrno = code;
    goto _err;
  }

  pReader->flag = READER_STATUS_SUSPEND;

  if (countOnly) {
    pReader->readMode = READ_MODE_COUNT_ONLY;
  }

  pReader->pIgnoreTables = pIgnoreTables;

  tsdbDebug("%p total numOfTable:%d, window:%" PRId64 " - %" PRId64 ", verRange:%" PRId64 " - %" PRId64
            " in this query %s",
            pReader, numOfTables, pReader->window.skey, pReader->window.ekey, pReader->verRange.minVer,
            pReader->verRange.maxVer, pReader->idStr);

  return code;

_err:
  tsdbError("failed to create data reader, code:%s %s", tstrerror(code), idstr);
  tsdbReaderClose(*ppReader);
  *ppReader = NULL;  // reset the pointer value.
  return code;
}

4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595
// Detach a reader from the resources it borrows from another reader.
//
// The fields reset here are the same ones setSharedPtr() copies from a source
// reader. tsdbReaderClose() frees or destroys every one of them, so they are
// cleared before an inner reader is closed to keep the close from releasing
// objects the outer reader still uses.
//
// p may be NULL: tsdbReaderClose() invokes this on both inner reader slots as
// soon as either of them is set, so an empty slot must be tolerated.
static void clearSharedPtr(STsdbReader* p) {
  if (p == NULL) {
    return;
  }

  p->status.pLDataIter = NULL;
  p->status.pTableMap = NULL;
  p->status.uidList.tableUidList = NULL;
  p->pReadSnap = NULL;
  p->pSchema = NULL;
  p->pSchemaMap = NULL;
}

// Make pDst share pSrc's snapshot, schema objects, table map, uid list and
// last-file iterators (plain pointer/struct copies, nothing is duplicated).
// When a schema is available the row merger of pDst is initialised with it.
static void setSharedPtr(STsdbReader* pDst, const STsdbReader* pSrc) {
  // reader-level shared objects
  pDst->pReadSnap = pSrc->pReadSnap;
  pDst->pSchemaMap = pSrc->pSchemaMap;
  pDst->pSchema = pSrc->pSchema;

  // status-level shared objects
  pDst->status.uidList = pSrc->status.uidList;
  pDst->status.pLDataIter = pSrc->status.pLDataIter;
  pDst->status.pTableMap = pSrc->status.pTableMap;

  if (pDst->pSchema == NULL) {
    return;
  }

  tsdbRowMergerInit(&pDst->status.merger, pDst->pSchema);
}

H
refact  
Hongze Cheng 已提交
4602
// Release every resource held by a reader and free the reader itself.
//
// pReader may be NULL, in which case nothing happens. The reader lock is taken
// for the duration of the teardown and destroyed before the final frees.
// Inner readers are closed first (recursively); their borrowed pointers are
// cleared beforehand so that shared objects are released exactly once, by the
// outer reader.
void tsdbReaderClose(STsdbReader* pReader) {
  if (pReader == NULL) {
    return;
  }

  tsdbAcquireReader(pReader);

  // Handle each inner reader slot independently. The previous code entered this
  // path when either slot was set and then dereferenced both, which crashes if
  // only one inner reader exists.
  for (int32_t i = 0; i < 2; ++i) {
    STsdbReader* pInner = pReader->innerReader[i];
    if (pInner == NULL) {
      continue;
    }

    clearSharedPtr(pInner);
    tsdbReaderClose(pInner);
  }

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  taosArrayDestroy(pSupInfo->pColAgg);
  for (int32_t i = 0; i < pSupInfo->numOfCols; ++i) {
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }

  // the result block is destroyed here only when the freeBlock flag is set
  if (pReader->resBlockInfo.freeBlock) {
    pReader->resBlockInfo.pResBlock = blockDataDestroy(pReader->resBlockInfo.pResBlock);
  }

  taosMemoryFree(pSupInfo->colId);
  tBlockDataDestroy(&pReader->status.fileBlockData);
  cleanupDataBlockIterator(&pReader->status.blockIter);

  // remember the table count for the cost summary before the map is destroyed
  size_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  if (pReader->status.pTableMap != NULL) {
    destroyAllBlockScanInfo(pReader->status.pTableMap);
    clearBlockScanInfoBuf(&pReader->blockInfoBuf);
  }

  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }

  if (pReader->pDelFReader != NULL) {
    tsdbDelFReaderClose(&pReader->pDelFReader);
  }

  if (pReader->pDelIdx != NULL) {
    taosArrayDestroy(pReader->pDelIdx);
    pReader->pDelIdx = NULL;
  }

  qTrace("tsdb/reader-close: %p, untake snapshot", pReader);
  tsdbUntakeReadSnap(pReader, pReader->pReadSnap, true);
  pReader->pReadSnap = NULL;

  tsdbReleaseReader(pReader);

  tsdbUninitReaderLock(pReader);

  taosMemoryFreeClear(pReader->status.pLDataIter);
  taosMemoryFreeClear(pReader->status.uidList.tableUidList);
  SIOCostSummary* pCost = &pReader->cost;

  SFilesetIter* pFilesetIter = &pReader->status.fileIter;
  if (pFilesetIter->pLastBlockReader != NULL) {
    SLastBlockReader* pLReader = pFilesetIter->pLastBlockReader;
    tMergeTreeClose(&pLReader->mergeTree);

    // collect the last-block load statistics before the load info is destroyed
    getLastBlockLoadInfo(pLReader->pInfo, &pCost->lastBlockLoad, &pCost->lastBlockLoadTime);

    pLReader->pInfo = destroyLastBlockLoadInfo(pLReader->pInfo);
    taosMemoryFree(pLReader);
  }

  tsdbDebug(
      "%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
      " SMA-time:%.2f ms, fileBlocks:%" PRId64
      ", fileBlocks-load-time:%.2f ms, "
      "build in-memory-block-time:%.2f ms, lastBlocks:%" PRId64 ", lastBlocks-time:%.2f ms, composed-blocks:%" PRId64
      ", composed-blocks-time:%.2fms, STableBlockScanInfo size:%.2f Kb, createTime:%.2f ms,initDelSkylineIterTime:%.2f "
      "ms, %s",
      pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaDataLoad, pCost->smaLoadTime, pCost->numOfBlocks,
      pCost->blockLoadTime, pCost->buildmemBlock, pCost->lastBlockLoad, pCost->lastBlockLoadTime, pCost->composedBlocks,
      pCost->buildComposedBlockTime, numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pCost->createScanInfoList,
      pCost->initDelSkylineIterTime, pReader->idStr);

  // idStr is still needed by the summary above, so it is freed only now
  taosMemoryFree(pReader->idStr);

  tsdbRowMergerCleanup(&pReader->status.merger);
  taosMemoryFree(pReader->pSchema);

  tSimpleHashCleanup(pReader->pSchemaMap);
  taosMemoryFreeClear(pReader);
}

4702 4703 4704 4705 4706 4707 4708 4709 4710 4711
int32_t tsdbReaderSuspend(STsdbReader* pReader) {
  int32_t code = 0;

  // save reader's base state & reset top state to be reconstructed from base state
  SReaderStatus*       pStatus = &pReader->status;
  STableBlockScanInfo* pBlockScanInfo = NULL;

  if (pStatus->loadFromFile) {
    SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
    if (pBlockInfo != NULL) {
H
Haojun Liao 已提交
4712
      pBlockScanInfo = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, pReader->idStr);
4713 4714 4715 4716
      if (pBlockScanInfo == NULL) {
        goto _err;
      }
    } else {
4717
      pBlockScanInfo = *pStatus->pTableIter;
4718 4719 4720 4721 4722
    }

    tsdbDataFReaderClose(&pReader->pFileReader);

    // resetDataBlockScanInfo excluding lastKey
4723
    STableBlockScanInfo** p = NULL;
H
Haojun Liao 已提交
4724
    int32_t iter = 0;
4725

H
Haojun Liao 已提交
4726
    while ((p = tSimpleHashIterate(pStatus->pTableMap, p, &iter)) != NULL) {
4727 4728 4729 4730 4731 4732 4733 4734
      STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;

      pInfo->iterInit = false;
      pInfo->iter.hasVal = false;
      pInfo->iiter.hasVal = false;

      if (pInfo->iter.iter != NULL) {
        pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
4735 4736
      }

4737 4738 4739 4740 4741
      if (pInfo->iiter.iter != NULL) {
        pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
      }

      pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
4742 4743
    }
  } else {
4744 4745
    // resetDataBlockScanInfo excluding lastKey
    STableBlockScanInfo** p = NULL;
H
Haojun Liao 已提交
4746
    int32_t iter = 0;
4747

H
Haojun Liao 已提交
4748
    while ((p = tSimpleHashIterate(pStatus->pTableMap, p, &iter)) != NULL) {
4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765
      STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;

      pInfo->iterInit = false;
      pInfo->iter.hasVal = false;
      pInfo->iiter.hasVal = false;

      if (pInfo->iter.iter != NULL) {
        pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
      }

      if (pInfo->iiter.iter != NULL) {
        pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
      }

      pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
    }

4766
    pBlockScanInfo = pStatus->pTableIter == NULL ? NULL : *pStatus->pTableIter;
4767 4768
    if (pBlockScanInfo) {
      // save lastKey to restore memory iterator
4769
      STimeWindow w = pReader->resBlockInfo.pResBlock->info.window;
4770 4771 4772 4773
      pBlockScanInfo->lastKey = ASCENDING_TRAVERSE(pReader->order) ? w.ekey : w.skey;

      // reset current current table's data block scan info,
      pBlockScanInfo->iterInit = false;
4774 4775
      pBlockScanInfo->iter.hasVal = false;
      pBlockScanInfo->iiter.hasVal = false;
4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790
      if (pBlockScanInfo->iter.iter != NULL) {
        pBlockScanInfo->iter.iter = tsdbTbDataIterDestroy(pBlockScanInfo->iter.iter);
      }

      if (pBlockScanInfo->iiter.iter != NULL) {
        pBlockScanInfo->iiter.iter = tsdbTbDataIterDestroy(pBlockScanInfo->iiter.iter);
      }

      pBlockScanInfo->pBlockList = taosArrayDestroy(pBlockScanInfo->pBlockList);
      tMapDataClear(&pBlockScanInfo->mapData);
      // TODO: keep skyline for reuse
      pBlockScanInfo->delSkyline = taosArrayDestroy(pBlockScanInfo->delSkyline);
    }
  }

4791
  tsdbUntakeReadSnap(pReader, pReader->pReadSnap, false);
4792
  pReader->pReadSnap = NULL;
H
Haojun Liao 已提交
4793
  pReader->flag = READER_STATUS_SUSPEND;
4794

4795 4796
  tsdbDebug("reader: %p suspended uid %" PRIu64 " in this query %s", pReader, pBlockScanInfo ? pBlockScanInfo->uid : 0,
            pReader->idStr);
4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807
  return code;

_err:
  tsdbError("failed to suspend data reader, code:%s %s", tstrerror(code), pReader->idStr);
  return code;
}

static int32_t tsdbSetQueryReseek(void* pQHandle) {
  int32_t      code = 0;
  STsdbReader* pReader = pQHandle;

4808
  code = tsdbTryAcquireReader(pReader);
4809
  if (code == 0) {
H
Haojun Liao 已提交
4810
    if (pReader->flag == READER_STATUS_SUSPEND) {
4811
      tsdbReleaseReader(pReader);
4812 4813 4814 4815
      return code;
    }

    tsdbReaderSuspend(pReader);
4816

4817
    tsdbReleaseReader(pReader);
4818

4819
    return code;
4820 4821 4822
  } else if (code == EBUSY) {
    return TSDB_CODE_VND_QUERY_BUSY;
  } else {
4823 4824
    terrno = TAOS_SYSTEM_ERROR(code);
    return TSDB_CODE_FAILED;
4825 4826 4827 4828 4829 4830
  }
}

// Resume a suspended reader: take a fresh read snapshot and reopen the reader
// (or, for a non-contained time window type, re-share the outer reader's
// resources with both inner readers and reopen the "previous row" reader).
// Nothing is reopened when the table map is empty.
//
// On success the reader is marked READER_STATUS_NORMAL and 0 is returned.
// Every failure now leaves through _err so that it is logged; before, a failing
// doOpenReaderImpl() returned silently while a failing snapshot was logged.
int32_t tsdbReaderResume(STsdbReader* pReader) {
  int32_t code = 0;

  // captured up front; only used for the uid in the debug message below
  STableBlockScanInfo** pBlockScanInfo = pReader->status.pTableIter;

  //  restore reader's state
  //  task snapshot
  int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  if (numOfTables > 0) {
    qTrace("tsdb/reader: %p, take snapshot", pReader);
    code = tsdbTakeReadSnap(pReader, tsdbSetQueryReseek, &pReader->pReadSnap);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
      code = doOpenReaderImpl(pReader);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    } else {
      STsdbReader* pPrevReader = pReader->innerReader[0];
      STsdbReader* pNextReader = pReader->innerReader[1];

      // we need only one row
      pPrevReader->resBlockInfo.capacity = 1;
      setSharedPtr(pPrevReader, pReader);

      pNextReader->resBlockInfo.capacity = 1;
      setSharedPtr(pNextReader, pReader);

      code = doOpenReaderImpl(pPrevReader);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  }

  pReader->flag = READER_STATUS_NORMAL;
  tsdbDebug("reader: %p resumed uid %" PRIu64 ", numOfTable:%" PRId32 ", in this query %s", pReader,
            pBlockScanInfo ? (*pBlockScanInfo)->uid : 0, numOfTables, pReader->idStr);
  return code;

_err:
  tsdbError("failed to resume data reader, code:%s %s", tstrerror(code), pReader->idStr);
  return code;
}

D
dapan1121 已提交
4876
// Count-only read path: gathers the row count from files and then from memory
// and publishes it in the result block's info (rows, uid = 0, dataLoad = 0),
// resetting the reader's running counter afterwards.
//
// Returns true when the published row count is positive. Returns false when
// the reader is not loading from files or when either counting step reports a
// non-success code (the code itself is not passed on).
static bool tsdbReadRowsCountOnly(STsdbReader* pReader) {
  SSDataBlock* pResult = pReader->resBlockInfo.pResBlock;

  if (!pReader->status.loadFromFile) {
    return false;
  }

  if (readRowsCountFromFiles(pReader) != TSDB_CODE_SUCCESS) {
    return false;
  }

  if (readRowsCountFromMem(pReader) != TSDB_CODE_SUCCESS) {
    return false;
  }

  pResult->info.rows = pReader->rowsNum;
  pResult->info.id.uid = 0;
  pResult->info.dataLoad = 0;

  pReader->rowsNum = 0;

  return pResult->info.rows > 0;
}

4903
// Produce the next result block of one reader.
//
// The previous result block is cleaned first. *hasNext is set to true when the
// result block ends up with at least one row; it stays false when the table map
// is empty. The return value is a status code (TSDB_CODE_SUCCESS or the code
// reported by the block builders).
//
// In count-only mode the boolean "has rows" result of tsdbReadRowsCountOnly()
// is stored in *hasNext. It used to be returned as the status code, so a
// non-empty count looked like error code 1 to the caller and *hasNext was
// never set.
static int32_t doTsdbNextDataBlock(STsdbReader* pReader, bool* hasNext) {
  int32_t code = TSDB_CODE_SUCCESS;

  // cleanup the data that belongs to the previous data block
  SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
  blockDataCleanup(pBlock);

  *hasNext = false;

  SReaderStatus* pStatus = &pReader->status;
  if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
    return code;
  }

  if (READ_MODE_COUNT_ONLY == pReader->readMode) {
    *hasNext = tsdbReadRowsCountOnly(pReader);
    return code;
  }

  if (pStatus->loadFromFile) {
    code = buildBlockFromFiles(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // files yielded nothing: restart the table list and fall back to the buffer
    if (pBlock->info.rows <= 0) {
      resetTableListIndex(&pReader->status);
      code = buildBlockFromBufferSequentially(pReader);
    }
  } else {  // no data in files, let's try the buffer
    code = buildBlockFromBufferSequentially(pReader);
  }

  *hasNext = pBlock->info.rows > 0;

  return code;
}

4940
// Advance the reader to its next data block.
//
// *hasNext reports whether a block is available. Returns TSDB_CODE_SUCCESS, the
// reader's stored error code, or the code of the step that failed.
//
// The scan walks up to three stages, tracked in pReader->step: the "previous
// row" inner reader (innerReader[0]), the main scan, and the "next row" inner
// reader (innerReader[1]).
//
// Locking: the reader lock is taken here. It is released before returning in
// every case except one: when a block is found and it is not a composed block,
// the lock is kept and tsdbRetrieveDataBlock() releases it after loading the
// block. All error paths release the lock; two doOpenReaderImpl() failure paths
// used to return with the lock still held.
int32_t tsdbNextDataBlock(STsdbReader* pReader, bool* hasNext) {
  int32_t code = TSDB_CODE_SUCCESS;

  *hasNext = false;

  if (isEmptyQueryTimeWindow(&pReader->window) || pReader->step == EXTERNAL_ROWS_NEXT ||
      pReader->code != TSDB_CODE_SUCCESS) {
    return (pReader->code != TSDB_CODE_SUCCESS) ? pReader->code : code;
  }

  SReaderStatus* pStatus = &pReader->status;

  code = tsdbAcquireReader(pReader);
  qTrace("tsdb/read: %p, take read mutex, code: %d", pReader, code);

  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  if (pReader->innerReader[0] != NULL && pReader->step == 0) {
    code = doTsdbNextDataBlock(pReader->innerReader[0], hasNext);
    if (code) {
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_PREV;
    if (*hasNext) {
      pStatus = &pReader->innerReader[0]->status;
      if (pStatus->composedDataBlock) {
        qTrace("tsdb/read: %p, unlock read mutex", pReader);
        tsdbReleaseReader(pReader);
      }

      return code;
    }
  }

  if (pReader->step == EXTERNAL_ROWS_PREV) {
    // prepare for the main scan
    code = doOpenReaderImpl(pReader);
    int32_t step = 1;
    resetAllDataBlockScanInfo(pReader->status.pTableMap, pReader->innerReader[0]->window.ekey, step);

    if (code != TSDB_CODE_SUCCESS) {
      // do not leave the reader locked on failure
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_MAIN;
  }

  code = doTsdbNextDataBlock(pReader, hasNext);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbReleaseReader(pReader);
    return code;
  }

  if (*hasNext) {
    if (pStatus->composedDataBlock) {
      qTrace("tsdb/read: %p, unlock read mutex", pReader);
      tsdbReleaseReader(pReader);
    }

    return code;
  }

  if (pReader->step == EXTERNAL_ROWS_MAIN && pReader->innerReader[1] != NULL) {
    // prepare for the next row scan
    int32_t step = -1;
    code = doOpenReaderImpl(pReader->innerReader[1]);
    resetAllDataBlockScanInfo(pReader->innerReader[1]->status.pTableMap, pReader->window.ekey, step);
    if (code != TSDB_CODE_SUCCESS) {
      // do not leave the reader locked on failure
      tsdbReleaseReader(pReader);
      return code;
    }

    code = doTsdbNextDataBlock(pReader->innerReader[1], hasNext);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_NEXT;
    if (*hasNext) {
      pStatus = &pReader->innerReader[1]->status;
      if (pStatus->composedDataBlock) {
        qTrace("tsdb/read: %p, unlock read mutex", pReader);
        tsdbReleaseReader(pReader);
      }

      return code;
    }
  }

  qTrace("tsdb/read: %p, unlock read mutex", pReader);
  tsdbReleaseReader(pReader);

  return code;
}

G
Ganlin Zhao 已提交
5042 5043
// Merge the requested column ids (pSup->colId[0..numOfCols)) against the
// aggregate list pSup->pColAgg, after placing the timestamp aggregate pTsAgg at
// index 0. For every requested column, other than the primary timestamp, that
// has no aggregate entry, an entry with numOfNull = numOfRows is inserted at
// the current merge position.
//
// Returns true when at least one such entry was inserted.
static bool doFillNullColSMA(SBlockLoadSuppInfo* pSup, int32_t numOfRows, int32_t numOfCols, SColumnDataAgg* pTsAgg) {
  bool    filled = false;
  int32_t aggIdx = 0;
  int32_t colIdx = 0;

  // size of the list once the timestamp aggregate has been put in front
  int32_t numOfAgg = (int32_t)taosArrayGetSize(pSup->pColAgg) + 1;
  taosArrayInsert(pSup->pColAgg, 0, pTsAgg);

  while (colIdx < numOfCols && aggIdx < numOfAgg) {
    SColumnDataAgg* pCur = taosArrayGet(pSup->pColAgg, aggIdx);

    if (pCur->colId == pSup->colId[colIdx]) {
      aggIdx += 1;
      colIdx += 1;
      continue;
    }

    if (pCur->colId < pSup->colId[colIdx]) {
      aggIdx += 1;
      continue;
    }

    // the requested column id is smaller than the current aggregate's id
    if (pSup->colId[colIdx] != PRIMARYKEY_TIMESTAMP_COL_ID) {
      SColumnDataAgg nullColAgg = {.colId = pSup->colId[colIdx], .numOfNull = numOfRows};
      taosArrayInsert(pSup->pColAgg, aggIdx, &nullColAgg);
      aggIdx += 1;
      numOfAgg += 1;
      filled = true;
    }
    colIdx += 1;
  }

  // requested columns left over once the aggregate list is exhausted
  for (; colIdx < numOfCols; ++colIdx) {
    if (pSup->colId[colIdx] == PRIMARYKEY_TIMESTAMP_COL_ID) {
      continue;
    }

    SColumnDataAgg nullColAgg = {.colId = pSup->colId[colIdx], .numOfNull = numOfRows};
    taosArrayInsert(pSup->pColAgg, aggIdx, &nullColAgg);
    aggIdx += 1;
    filled = true;
  }

  return filled;
}

G
Ganlin Zhao 已提交
5082
// Load the block-level statistics (SMA) of the current file block and expose
// them through pDataBlock->pBlockAgg.
//
// Outputs:
//   pDataBlock->pBlockAgg  NULL unless statistics were loaded and mapped
//   *allHave               true when every requested column got an entry
//   *hasNullSMA            set to true (never to false) when doFillNullColSMA()
//                          had to insert an all-NULL entry; the function then
//                          returns early with pBlockAgg still NULL
//
// Returns TSDB_CODE_SUCCESS with no statistics for external-window readers,
// composed blocks, invalid SMA, a missing current block, a uid mismatch, or a
// block without SMA. Returns the tsdbReadBlockSma() error code on read failure
// and TSDB_CODE_OUT_OF_MEMORY when the pointer table cannot be allocated.
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SSDataBlock* pDataBlock, bool* allHave, bool *hasNullSMA) {
  SColumnDataAgg*** pBlockSMA = &pDataBlock->pBlockAgg;

  int32_t code = 0;
  *allHave = false;
  *pBlockSMA = NULL;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    return TSDB_CODE_SUCCESS;
  }

  // there is no statistics data for composed block
  if (pReader->status.composedDataBlock || (!pReader->suppInfo.smaValid)) {
    return TSDB_CODE_SUCCESS;
  }

  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

  // getCurrentBlockInfo() may yield NULL (tsdbReaderSuspend checks for it too)
  if (pFBlock == NULL || pReader->resBlockInfo.pResBlock->info.id.uid != pFBlock->uid) {
    return TSDB_CODE_SUCCESS;
  }

  int64_t st = taosGetTimestampUs();

  SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
  if (tDataBlkHasSma(pBlock)) {
    code = tsdbReadBlockSma(pReader->pFileReader, pBlock, pSup->pColAgg);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pFBlock->uid, tstrerror(code),
                pReader->idStr);
      return code;
    }
  } else {
    *pBlockSMA = NULL;
    return TSDB_CODE_SUCCESS;
  }

  *allHave = true;

  // always load the first primary timestamp column data
  SColumnDataAgg* pTsAgg = &pSup->tsColAgg;

  pTsAgg->numOfNull = 0;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  pTsAgg->min = pReader->resBlockInfo.pResBlock->info.window.skey;
  pTsAgg->max = pReader->resBlockInfo.pResBlock->info.window.ekey;

  // update the number of NULL data rows
  size_t numOfCols = pSup->numOfCols;

  // ensure capacity
  if (pDataBlock->pDataBlock) {
    size_t colsNum = taosArrayGetSize(pDataBlock->pDataBlock);
    taosArrayEnsureCap(pSup->pColAgg, colsNum);
  }

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
  if (pResBlock->pBlockAgg == NULL) {
    size_t num = taosArrayGetSize(pResBlock->pDataBlock);
    pResBlock->pBlockAgg = taosMemoryCalloc(num, POINTER_BYTES);

    // the table is written through below, so a failed allocation must stop here;
    // a zero-sized request is not treated as a failure
    if (pResBlock->pBlockAgg == NULL && num > 0) {
      *allHave = false;
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  // do fill all null column value SMA info
  if (doFillNullColSMA(pSup, pBlock->nRow, numOfCols, pTsAgg)) {
    *hasNullSMA = true;
    return TSDB_CODE_SUCCESS;
  }
  size_t size = taosArrayGetSize(pSup->pColAgg);

  // merge the requested column ids against the loaded aggregates and record
  // each match in the slot of the corresponding result column
  int32_t i = 0, j = 0;
  while (j < numOfCols && i < size) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, i);
    if (pAgg->colId == pSup->colId[j]) {
      pResBlock->pBlockAgg[pSup->slotId[j]] = pAgg;
      i += 1;
      j += 1;
    } else if (pAgg->colId < pSup->colId[j]) {
      i += 1;
    } else if (pSup->colId[j] < pAgg->colId) {
      pResBlock->pBlockAgg[pSup->slotId[j]] = NULL;
      *allHave = false;
      j += 1;
    }
  }

  *pBlockSMA = pResBlock->pBlockAgg;
  pReader->cost.smaDataLoad += 1;

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.smaLoadTime += elapsedTime;

  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", %s", 0, pFBlock->uid, pReader->idStr);
  return code;
}

H
Haojun Liao 已提交
5178 5179
// Look up the scan info registered for table uid in pTableMap.
//
// Returns the scan info, or NULL when the uid has no entry (or a NULL entry);
// in that case terrno is set to TSDB_CODE_INVALID_PARA and an error carrying
// the uid, the table count and the query id string is logged.
STableBlockScanInfo* getTableBlockScanInfo(SSHashObj* pTableMap, uint64_t uid, const char* id) {
  STableBlockScanInfo** ppInfo = tSimpleHashGet(pTableMap, &uid, sizeof(uid));
  if (ppInfo != NULL && *ppInfo != NULL) {
    return *ppInfo;
  }

  terrno = TSDB_CODE_INVALID_PARA;
  int32_t numOfTables = tSimpleHashGetSize(pTableMap);
  tsdbError("failed to locate the uid:%" PRIu64 " in query table uid list, total tables:%d, %s", uid, numOfTables, id);
  return NULL;
}

H
Haojun Liao 已提交
5190
// Load the current file block and copy it into the reader's result block.
//
// Returns the result block, or NULL when the reader already carries an error
// code, when the block's table is not in the table map, or when loading or
// copying fails. In the last two failure cases the file block data is
// destroyed and terrno receives the failing code.
static SSDataBlock* doRetrieveDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pStatus->blockIter);

  if (pReader->code != TSDB_CODE_SUCCESS) {
    return NULL;
  }

  STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, pReader->idStr);
  if (pScanInfo == NULL) {
    return NULL;
  }

  // the copy step runs only after a successful load
  int32_t code = doLoadFileBlockData(pReader, &pStatus->blockIter, &pStatus->fileBlockData, pScanInfo->uid);
  if (code == TSDB_CODE_SUCCESS) {
    code = copyBlockDataToSDataBlock(pReader);
  }

  if (code != TSDB_CODE_SUCCESS) {
    tBlockDataDestroy(&pStatus->fileBlockData);
    terrno = code;
    return NULL;
  }

  return pReader->resBlockInfo.pResBlock;
}

H
Haojun Liao 已提交
5221
// Return the data block announced by the preceding tsdbNextDataBlock() call.
//
// For an external-window reader the block comes from the inner reader that
// matches the current step (previous-row or next-row), otherwise from the
// reader itself. A composed block is returned as is. For any other block the
// file data is loaded first and the lock of the outer reader is released
// afterwards. pIdList is not used.
SSDataBlock* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
  STsdbReader* pActive = pReader;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    switch (pReader->step) {
      case EXTERNAL_ROWS_PREV:
        pActive = pReader->innerReader[0];
        break;
      case EXTERNAL_ROWS_NEXT:
        pActive = pReader->innerReader[1];
        break;
      default:
        break;
    }
  }

  if (pActive->status.composedDataBlock) {
    return pActive->resBlockInfo.pResBlock;
  }

  SSDataBlock* pResult = doRetrieveDataBlock(pActive);

  qTrace("tsdb/read-retrieve: %p, unlock read mutex", pReader);
  tsdbReleaseReader(pReader);

  return pResult;
}

H
Haojun Liao 已提交
5244
// Rewind a reader so it can scan again with the order and time window of pCond.
//
// A suspended reader is resumed first. If the reader's current window is empty
// or it holds no snapshot, nothing is reset and TSDB_CODE_SUCCESS is returned.
// Otherwise the reader becomes a TIMEWINDOW_RANGE_CONTAINED reader, the file
// set iterator, block iterator, table list index and all table scan infos are
// reset, and the first file block is located (or the reader switches to
// buffer-only mode when there are no files).
//
// The reader lock is held for the whole call and released on every exit path.
// Returns TSDB_CODE_SUCCESS or the code of the resume / first-block step.
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
  qTrace("tsdb/reader-reset: %p, take read mutex", pReader);
  tsdbAcquireReader(pReader);

  if (pReader->flag == READER_STATUS_SUSPEND) {
    int32_t resumeCode = tsdbReaderResume(pReader);
    if (resumeCode != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return resumeCode;
    }
  }

  if (isEmptyQueryTimeWindow(&pReader->window) || pReader->pReadSnap == NULL) {
    tsdbDebug("tsdb reader reset return %p, %s", pReader->pReadSnap, pReader->idStr);
    tsdbReleaseReader(pReader);
    return TSDB_CODE_SUCCESS;
  }

  SReaderStatus*  pState = &pReader->status;
  SDataBlockIter* pIter = &pState->blockIter;

  // adopt the new scan condition
  pReader->order = pCond->order;
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
  pState->loadFromFile = true;
  pState->pTableIter = NULL;
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);

  // start over with a clean timestamp aggregate and no open data file
  memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));
  pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  tsdbDataFReaderClose(&pReader->pFileReader);

  int32_t tableCount = tSimpleHashGetSize(pState->pTableMap);

  initFilesetIterator(&pState->fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(pIter, pReader->order);
  resetTableListIndex(&pReader->status);

  // the start key sits one tick outside the window on the side the scan begins
  int32_t direction;
  int64_t startTs;
  if (ASCENDING_TRAVERSE(pReader->order)) {
    direction = 1;
    startTs = pReader->window.skey - 1;
  } else {
    direction = -1;
    startTs = pReader->window.ekey + 1;
  }
  resetAllDataBlockScanInfo(pState->pTableMap, startTs, direction);

  int32_t code = TSDB_CODE_SUCCESS;
  if (pState->fileIter.numOfFiles != 0) {
    code = initForFirstBlockInFile(pReader, pIter);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                tableCount, pReader->window.skey, pReader->window.ekey, pReader->idStr);

      tsdbReleaseReader(pReader);
      return code;
    }
  } else {
    // no data in files, let's try buffer in memory
    pState->loadFromFile = false;
    resetTableListIndex(pState);
  }

  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, skey:%" PRId64 ", query range:%" PRId64 " - %" PRId64
            " in query %s",
            pReader, pReader->suid, tableCount, pCond->twindows.skey, pReader->window.skey, pReader->window.ekey,
            pReader->idStr);

  tsdbReleaseReader(pReader);

  return code;
}
H
Hongze Cheng 已提交
5314

5315
// Map a block's row count onto a histogram bucket.
//
// startRow    - row count at which bucket 0 begins; anything below it lands in bucket 0
// bucketRange - number of rows covered by one bucket
// numOfRows   - row count of the block being classified
// numOfBucket - total number of buckets in the histogram
//
// Returns an index that is always inside [0, numOfBucket - 1]. Row counts beyond the last
// bucket are folded into the last bucket, and a non-positive bucketRange/numOfBucket yields
// bucket 0 instead of dividing by zero or producing a negative index.
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows, int32_t numOfBucket) {
  if (numOfRows < startRow || bucketRange <= 0 || numOfBucket <= 0) {
    return 0;
  }

  // widen before subtracting so that a negative startRow cannot overflow int32_t
  int64_t bucketIndex = ((int64_t)numOfRows - startRow) / bucketRange;
  if (bucketIndex >= numOfBucket) {
    bucketIndex = numOfBucket - 1;
  }

  return (int32_t)bucketIndex;
}
H
Hongze Cheng 已提交
5325

5326 5327 5328 5329
// Walk every data block reachable through the reader's block iterator and accumulate
// distribution statistics (row totals, size, min/max rows, small-block count and a row-count
// histogram) into pTableBlockInfo.
//
// The reader is acquired for the duration of the scan and resumed first when it is suspended.
// Returns TSDB_CODE_SUCCESS, or the error code reported by tsdbReaderResume() /
// initForFirstBlockInFile().
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  const int32_t numOfBuckets = 20;
  const int     smallBlockRows = 4096;  // blocks with fewer rows are counted as "small"
  int32_t       code = TSDB_CODE_SUCCESS;

  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;
  pTableBlockInfo->numOfVgroups = 1;

  tsdbAcquireReader(pReader);
  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  SReaderStatus* pStatus = &pReader->status;
  STsdbCfg*      pCfg = &pReader->pTsdb->pVnode->config.tsdbCfg;

  pTableBlockInfo->defMinRows = pCfg->minRows;
  pTableBlockInfo->defMaxRows = pCfg->maxRows;

  // width (in rows) of one histogram bucket
  const double rowSpan = (double)(pCfg->maxRows - pCfg->minRows);
  int32_t      bucketRange = ceil(rowSpan / numOfBuckets);

  pTableBlockInfo->numOfFiles += 1;

  int32_t         tableCount = (int32_t)tSimpleHashGetSize(pStatus->pTableMap);
  SDataBlockIter* pBlockIter = &pStatus->blockIter;

  pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;
  if (pBlockIter->numOfBlocks > 0) {
    pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
  }
  pTableBlockInfo->numOfTables = tableCount;

  bool hasMore = (pBlockIter->numOfBlocks > 0);
  for (;;) {
    if (!hasMore) {
      // current block list is exhausted: move on to the next file set
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS || !pStatus->loadFromFile) {
        break;
      }

      pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
      hasMore = (pBlockIter->numOfBlocks > 0);
      continue;
    }

    SDataBlk* pBlk = getCurrentBlock(pBlockIter);
    int32_t   nRows = pBlk->nRow;

    pTableBlockInfo->totalRows += nRows;
    if (pTableBlockInfo->maxRows < nRows) {
      pTableBlockInfo->maxRows = nRows;
    }
    if (pTableBlockInfo->minRows > nRows) {
      pTableBlockInfo->minRows = nRows;
    }
    if (nRows < smallBlockRows) {
      pTableBlockInfo->numOfSmallBlocks += 1;
    }

    pTableBlockInfo->totalSize += pBlk->aSubBlock[0].szBlock;

    int32_t slot = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, nRows, numOfBuckets);
    pTableBlockInfo->blockRowsHisto[slot]++;

    hasMore = blockIteratorNext(&pStatus->blockIter, pReader->idStr);
  }

  tsdbReleaseReader(pReader);
  return code;
}
H
Hongze Cheng 已提交
5407

H
refact  
Hongze Cheng 已提交
5408
// Sum the number of rows held in the snapshot's memtables (pMem and pIMem) for every table
// registered in the reader's table map.
//
// The reader is acquired for the duration of the call and resumed first when it is suspended.
// Note that when tsdbReaderResume() fails, its error code is handed back through the same
// int64_t return value that otherwise carries the row count.
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
  int64_t        total = 0;
  SReaderStatus* pStatus = &pReader->status;

  tsdbAcquireReader(pReader);
  if (pReader->flag == READER_STATUS_SUSPEND) {
    int32_t code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  int32_t iter = 0;
  for (pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, NULL, &iter); pStatus->pTableIter != NULL;
       pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, pStatus->pTableIter, &iter)) {
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;

    // rows of this table in the mutable memtable, if the snapshot holds one
    if (pReader->pReadSnap->pMem != NULL) {
      STbData* pMemData = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pScanInfo->uid);
      if (pMemData != NULL) {
        total += tsdbGetNRowsInTbData(pMemData);
      }
    }

    // rows of this table in the immutable memtable, if the snapshot holds one
    if (pReader->pReadSnap->pIMem != NULL) {
      STbData* pIMemData = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pScanInfo->uid);
      if (pIMemData != NULL) {
        total += tsdbGetNRowsInTbData(pIMemData);
      }
    }
  }

  tsdbReleaseReader(pReader);
  return total;
}
D
dapan1121 已提交
5452

H
Haojun Liao 已提交
5453
// Fetch the newest schema of table `uid`.
//
// pVnode  - the owning vnode (an SVnode*, passed as void*)
// uid     - table to look up; must be a child table or a normal table
// pSchema - receives the schema produced by metaGetTbTSchemaEx()
// suid    - receives the super table uid for a child table, 0 for a normal table
//
// Returns the result of metaGetTbTSchemaEx() on success. On failure terrno is set and returned:
// TSDB_CODE_TDB_INVALID_TABLE_ID when a meta lookup fails, TSDB_CODE_INVALID_PARA when the
// table is neither a child table nor a normal table.
int32_t tsdbGetTableSchema(void* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
  SVnode*     pNode = (SVnode*)pVnode;
  SMetaReader reader = {0};

  metaReaderInit(&reader, pNode->pMeta, 0);

  if (metaReaderGetTableEntryByUidCache(&reader, uid) != TSDB_CODE_SUCCESS) {
    terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
    goto _error;
  }

  *suid = 0;

  // only child table and ordinary table is allowed, super table is not allowed.
  if (reader.me.type == TSDB_CHILD_TABLE) {
    tDecoderClear(&reader.coder);
    *suid = reader.me.ctbEntry.suid;

    // a child table also needs its super table entry to be resolvable
    if (metaReaderGetTableEntryByUidCache(&reader, *suid) != TSDB_CODE_SUCCESS) {
      terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
      goto _error;
    }
  } else if (reader.me.type != TSDB_NORMAL_TABLE) {
    terrno = TSDB_CODE_INVALID_PARA;
    goto _error;
  }

  metaReaderClear(&reader);

  // get the newest table schema version
  return metaGetTbTSchemaEx(pNode->pMeta, *suid, uid, -1, pSchema);

_error:
  metaReaderClear(&reader);
  return terrno;
}
H
Hongze Cheng 已提交
5488

H
Hongze Cheng 已提交
5489
// Take a read snapshot of the tsdb for pReader.
//
// Under the tsdb read lock this references the mutable and immutable memtables whose version
// range overlaps pReader->verRange (registering a query node carrying pReader and the `reseek`
// callback with each of them), and references the file system state.
//
// pReader - reader the snapshot is taken for
// reseek  - callback stored in the query nodes registered with the memtables
// ppSnap  - receives the snapshot on success, NULL on failure
//
// Returns 0 on success, TSDB_CODE_OUT_OF_MEMORY on allocation failure, or the error reported by
// tsdbFSRef(). On failure every memtable reference taken so far is released again before the
// query nodes are freed, so no memtable is left pointing at a freed node.
int32_t tsdbTakeReadSnap(STsdbReader* pReader, _query_reseek_func_t reseek, STsdbReadSnap** ppSnap) {
  int32_t        code = 0;
  STsdb*         pTsdb = pReader->pTsdb;
  SVersionRange* pRange = &pReader->verRange;

  // alloc
  STsdbReadSnap* pSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(*pSnap));
  if (pSnap == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _exit;
  }

  // lock
  taosThreadRwlockRdlock(&pTsdb->rwLock);

  // take snapshot
  if (pTsdb->mem && (pRange->minVer <= pTsdb->mem->maxVer && pRange->maxVer >= pTsdb->mem->minVer)) {
    pSnap->pMem = pTsdb->mem;
    pSnap->pNode = taosMemoryMalloc(sizeof(*pSnap->pNode));
    if (pSnap->pNode == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _unlock;
    }
    pSnap->pNode->pQHandle = pReader;
    pSnap->pNode->reseek = reseek;

    // from here on a non-NULL pNode means the reference on pMem is held
    tsdbRefMemTable(pTsdb->mem, pSnap->pNode);
  }

  if (pTsdb->imem && (pRange->minVer <= pTsdb->imem->maxVer && pRange->maxVer >= pTsdb->imem->minVer)) {
    pSnap->pIMem = pTsdb->imem;
    pSnap->pINode = taosMemoryMalloc(sizeof(*pSnap->pINode));
    if (pSnap->pINode == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _unlock;
    }
    pSnap->pINode->pQHandle = pReader;
    pSnap->pINode->reseek = reseek;

    // from here on a non-NULL pINode means the reference on pIMem is held
    tsdbRefMemTable(pTsdb->imem, pSnap->pINode);
  }

  // fs
  code = tsdbFSRef(pTsdb, &pSnap->fs);

_unlock:
  if (code) {
    // roll back the memtable references taken above; the nodes are freed in _exit and must not
    // stay registered. NOTE(review): proactive=true is used because the release is initiated by
    // this reader itself - confirm against tsdbUnrefMemTable().
    if (pSnap->pINode) tsdbUnrefMemTable(pSnap->pIMem, pSnap->pINode, true);
    if (pSnap->pNode) tsdbUnrefMemTable(pSnap->pMem, pSnap->pNode, true);
  }

  // unlock
  taosThreadRwlockUnlock(&pTsdb->rwLock);

  if (code == 0) {
    tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode));
  }

_exit:
  if (code) {
    *ppSnap = NULL;
    if (pSnap) {
      if (pSnap->pNode) taosMemoryFree(pSnap->pNode);
      if (pSnap->pINode) taosMemoryFree(pSnap->pINode);
      taosMemoryFree(pSnap);
    }
  } else {
    *ppSnap = pSnap;
  }
  return code;
}

5559
// Release a read snapshot: drop the memtable references it holds, unreference the file system
// state, and free the query nodes together with the snapshot itself.
//
// pReader   - reader that owns the snapshot; only its pTsdb is used here
// pSnap     - snapshot to release; NULL is accepted and only produces the trace line
// proactive - forwarded unchanged to tsdbUnrefMemTable()
void tsdbUntakeReadSnap(STsdbReader* pReader, STsdbReadSnap* pSnap, bool proactive) {
  STsdb* pTsdb = pReader->pTsdb;

  if (pSnap != NULL) {
    // memtable references go first, while their query nodes are still alive
    if (pSnap->pMem != NULL) {
      tsdbUnrefMemTable(pSnap->pMem, pSnap->pNode, proactive);
    }
    if (pSnap->pIMem != NULL) {
      tsdbUnrefMemTable(pSnap->pIMem, pSnap->pINode, proactive);
    }

    tsdbFSUnref(pTsdb, &pSnap->fs);

    if (pSnap->pNode != NULL) {
      taosMemoryFree(pSnap->pNode);
    }
    if (pSnap->pINode != NULL) {
      taosMemoryFree(pSnap->pINode);
    }
    taosMemoryFree(pSnap);
  }

  tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
}
5578 5579 5580 5581 5582

// Replace the reader's id string with a copy of `idstr`.
//
// If duplicating the string fails, nothing is changed: the reader keeps its previous id.
// The copy is made before the old string is released, so passing the reader's current
// idStr as `idstr` is safe as well.
void tsdbReaderSetId(STsdbReader* pReader, const char* idstr) {
  char* newId = taosStrdup(idstr);
  if (newId == NULL) {
    return;
  }

  taosMemoryFreeClear(pReader->idStr);
  pReader->idStr = newId;
}
H
Haojun Liao 已提交
5584

H
Haojun Liao 已提交
5585
// Mark the reader as cancelled by storing TSDB_CODE_TSC_QUERY_CANCELLED in its code field.
void tsdbReaderSetCloseFlag(STsdbReader* pReader) {
  pReader->code = TSDB_CODE_TSC_QUERY_CANCELLED;
}
5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617

/*-------------todo:refactor the implementation of those APIs in this file to separate the API into two files------*/
// opt perf, do NOT create so many readers
// Open a descending, timestamp-only reader on each table of pTableList in turn and look at the
// first timestamp of the first block it produces, tracking the largest one seen.
//
// pVnode      - vnode to read from
// pTableList  - array of STableKeyInfo, one entry per table
// numOfTables - number of entries in pTableList
// pIdStr      - id string forwarded to tsdbReaderOpen()
//
// Returns 0 when all tables were visited, the error code of tsdbReaderOpen() when a reader
// cannot be opened, or TSDB_CODE_OUT_OF_MEMORY when the query condition / result block cannot be
// allocated. Every reader that was opened is closed again, and the temporary column list, slot
// list and result block are released on all paths.
//
// NOTE(review): the largest timestamp is computed in `key` but the function has always returned
// 0 on success; the return value is left untouched here because callers are not visible -
// confirm whether `key` is what should be returned.
int64_t tsdbGetLastTimestamp(SVnode* pVnode, void* pTableList, int32_t numOfTables, const char* pIdStr) {
  int64_t      result = 0;
  SSDataBlock* pBlock = NULL;

  SQueryTableDataCond cond = {.type = TIMEWINDOW_RANGE_CONTAINED, .numOfCols = 1, .order = TSDB_ORDER_DESC,
                              .startVersion = -1, .endVersion = -1};
  cond.twindows.skey = INT64_MIN;
  cond.twindows.ekey = INT64_MAX;

  cond.colList = taosMemoryCalloc(1, sizeof(SColumnInfo));
  cond.pSlotList = taosMemoryMalloc(sizeof(int32_t) * cond.numOfCols);
  pBlock = createDataBlock();
  if (cond.colList == NULL || cond.pSlotList == NULL || pBlock == NULL) {
    result = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }

  // the only column requested is the primary timestamp column
  cond.colList[0].colId = 1;
  cond.colList[0].slotId = 0;
  cond.colList[0].type = TSDB_DATA_TYPE_TIMESTAMP;

  cond.pSlotList[0] = 0;

  SColumnInfoData data = {0};
  data.info = (SColumnInfo) {.type = TSDB_DATA_TYPE_TIMESTAMP, .colId = 1, .bytes = TSDB_KEYSIZE};
  blockDataAppendColInfo(pBlock, &data);

  STableKeyInfo* pTableKeyInfo = pTableList;
  int64_t        key = INT64_MIN;

  for (int32_t i = 0; i < numOfTables; ++i) {
    STsdbReader* pReader = NULL;

    int32_t code = tsdbReaderOpen(pVnode, &cond, &pTableKeyInfo[i], 1, pBlock, (void**)&pReader, pIdStr, false, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      result = code;
      goto _end;
    }

    bool hasData = false;
    code = tsdbNextDataBlock(pReader, &hasData);
    if (code == TSDB_CODE_SUCCESS && hasData) {
      SColumnInfoData* pCol = taosArrayGet(pBlock->pDataBlock, 0);
      int64_t          k = *(int64_t*)pCol->pData;

      if (key < k) {
        key = k;
      }
    }

    // close the reader whether or not this table produced data
    tsdbReaderClose(pReader);
  }

_end:
  if (pBlock != NULL) {
    blockDataDestroy(pBlock);
  }
  if (cond.colList != NULL) {
    taosMemoryFree(cond.colList);
  }
  if (cond.pSlotList != NULL) {
    taosMemoryFree(cond.pSlotList);
  }
  return result;
}