tsdbRead.c 183.1 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Haojun Liao 已提交
16
#include "osDef.h"
H
Hongze Cheng 已提交
17
#include "tsdb.h"
18
#include "tsimplehash.h"
19

H
Hongze Cheng 已提交
20
// True when the scan order `o` equals TSDB_ORDER_ASC. The argument is parenthesized so that a
// compound expression (e.g. `a | b`) is compared as a whole instead of being split by `==`.
#define ASCENDING_TRAVERSE(o) ((o) == TSDB_ORDER_ASC)
21
// Read the `currentKey` member through the given reader pointer.
#define getCurrentKeyInLastBlock(pLastReader) ((pLastReader)->currentKey)
H
Hongze Cheng 已提交
22

H
Haojun Liao 已提交
23
// Reader state flag; this is the type of STsdbReader::flag.
typedef enum {
  READER_STATUS_SUSPEND = 0x1,
  READER_STATUS_NORMAL = 0x2,
} EReaderStatus;
H
Hongze Cheng 已提交
27

28 29 30 31 32 33
// Type of STsdbReader::step.
// NOTE(review): presumably selects which row set (previous / main / next) is being produced -- confirm
// against the code that reads `step`.
typedef enum {
  EXTERNAL_ROWS_PREV = 1,
  EXTERNAL_ROWS_MAIN = 2,
  EXTERNAL_ROWS_NEXT = 3,
} EContentData;

D
dapan1121 已提交
34 35 36 37 38
// Type of the `readMode` field in STsdbReader and STsdbReaderAttr.
typedef enum {
  READ_MODE_COUNT_ONLY = 1,
  READ_MODE_ALL = 2,
} EReadMode;

39
typedef struct {
dengyihao's avatar
dengyihao 已提交
40
  STbDataIter* iter;
41 42 43 44
  int32_t      index;
  bool         hasVal;
} SIterInfo;

45 46
// Pair of counters: number of blocks and number of last files.
typedef struct {
  int32_t numOfBlocks;
  int32_t numOfLastFiles;
} SBlockNumber;

50
typedef struct SBlockIndex {
51 52
  int32_t     ordinalIndex;
  int64_t     inFileOffset;
H
Haojun Liao 已提交
53
  STimeWindow window;  // todo replace it with overlap flag.
54 55
} SBlockIndex;

H
Haojun Liao 已提交
56
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
57 58
  uint64_t  uid;
  TSKEY     lastKey;
59
  TSKEY     lastKeyInStt;       // last accessed key in stt
H
Hongze Cheng 已提交
60
  SMapData  mapData;            // block info (compressed)
61
  SArray*   pBlockList;         // block data index list, SArray<SBlockIndex>
H
Hongze Cheng 已提交
62 63 64 65 66 67
  SIterInfo iter;               // mem buffer skip list iterator
  SIterInfo iiter;              // imem buffer skip list iterator
  SArray*   delSkyline;         // delete info for this table
  int32_t   fileDelIndex;       // file block delete index
  int32_t   lastBlockDelIndex;  // delete index for last block
  bool      iterInit;           // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
68 69 70
} STableBlockScanInfo;

// (uid, offset) pair; SBlockOrderSupporter keeps arrays of these.
typedef struct SBlockOrderWrapper {
  int64_t uid;
  int64_t offset;
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
74 75

typedef struct SBlockOrderSupporter {
76 77 78 79
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
80 81 82
} SBlockOrderSupporter;

// Counters and timings collected by the reader. createDataBlockScanInfo() stores createScanInfoList in
// milliseconds; NOTE(review): the unit of the other time fields is not visible here -- confirm.
typedef struct SIOCostSummary {
  int64_t numOfBlocks;
  double  blockLoadTime;
  double  buildmemBlock;
  int64_t headFileLoad;
  double  headFileLoadTime;
  int64_t smaDataLoad;
  double  smaLoadTime;
  int64_t lastBlockLoad;
  double  lastBlockLoadTime;
  int64_t composedBlocks;
  double  buildComposedBlockTime;
  double  createScanInfoList;
  //  double  getTbFromMemTime;
  //  double  getTbFromIMemTime;
  double initDelSkylineIterTime;
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
101 102 103 104 105 106 107
  SArray*        pColAgg;
  SColumnDataAgg tsColAgg;
  int16_t*       colId;
  int16_t*       slotId;
  int32_t        numOfCols;
  char**         buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
  bool           smaValid;  // the sma on all queried columns are activated
H
Hongze Cheng 已提交
108 109
} SBlockLoadSuppInfo;

110
typedef struct SLastBlockReader {
H
Hongze Cheng 已提交
111 112 113 114 115
  STimeWindow        window;
  SVersionRange      verRange;
  int32_t            order;
  uint64_t           uid;
  SMergeTree         mergeTree;
116
  SSttBlockLoadInfo* pInfo;
117
  int64_t            currentKey;
118 119
} SLastBlockReader;

120
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
121 122 123
  int32_t           numOfFiles;  // number of total files
  int32_t           index;       // current accessed index in the list
  SArray*           pFileList;   // data file list
124
  int32_t           order;
H
Hongze Cheng 已提交
125
  SLastBlockReader* pLastBlockReader;  // last file block reader
126
} SFilesetIter;
H
Haojun Liao 已提交
127 128

// Identifies one file data block by its table uid and its index within that table.
typedef struct SFileDataBlockInfo {
  // index position in STableBlockScanInfo in order to check whether neighbor block overlaps with it
  uint64_t uid;
  int32_t  tbBlockIdx;
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
H
Haojun Liao 已提交
135 136 137 138 139 140
  int32_t    numOfBlocks;
  int32_t    index;
  SArray*    blockList;  // SArray<SFileDataBlockInfo>
  int32_t    order;
  SDataBlk   block;  // current SDataBlk data
  SSHashObj* pTableMap;
H
Haojun Liao 已提交
141 142 143
} SDataBlockIter;

// Progress of dumping rows out of the current file block.
typedef struct SFileBlockDumpInfo {
  int32_t totalRows;
  int32_t rowIndex;
  int64_t lastKey;
  bool    allDumped;
} SFileBlockDumpInfo;

150
// Table uids of the reader, sorted ascending by createDataBlockScanInfo(), plus a cursor into the list.
typedef struct STableUidList {
  uint64_t* tableUidList;  // access table uid list in uid ascending order list
  int32_t   currentIndex;  // index in table uid list
} STableUidList;
154

H
Haojun Liao 已提交
155
typedef struct SReaderStatus {
H
Hongze Cheng 已提交
156 157
  bool                  loadFromFile;       // check file stage
  bool                  composedDataBlock;  // the returned data block is a composed block or not
158
  bool                  mapDataCleaned;     // mapData has been cleaned up alreay or not
H
Haojun Liao 已提交
159
  SSHashObj*            pTableMap;          // SHash<STableBlockScanInfo>
160
  STableBlockScanInfo** pTableIter;         // table iterator used in building in-memory buffer data blocks.
161
  STableUidList         uidList;            // check tables in uid order, to avoid the repeatly load of blocks in STT.
H
Hongze Cheng 已提交
162 163 164 165 166
  SFileBlockDumpInfo    fBlockDumpInfo;
  SDFileSet*            pCurrentFileset;  // current opened file set
  SBlockData            fileBlockData;
  SFilesetIter          fileIter;
  SDataBlockIter        blockIter;
167
  SLDataIter*           pLDataIter;
H
Haojun Liao 已提交
168
  SRowMerger            merger;
169
  SColumnInfoData*      pPrimaryTsCol;      // primary time stamp output col info data
H
Haojun Liao 已提交
170 171
} SReaderStatus;

172
typedef struct SBlockInfoBuf {
H
Hongze Cheng 已提交
173 174 175
  int32_t currentIndex;
  SArray* pData;
  int32_t numPerBucket;
D
dapan1121 已提交
176
  int32_t numOfTables;
177 178
} SBlockInfoBuf;

H
Haojun Liao 已提交
179 180 181 182 183 184 185
// Bundle of reader attributes; the field names mirror fields found in STsdbReader and SResultBlockInfo.
typedef struct STsdbReaderAttr {
  STSchema*     pSchema;
  EReadMode     readMode;
  uint64_t      rowsNum;
  STimeWindow   window;
  bool          freeBlock;
  SVersionRange verRange;
  int16_t       order;
} STsdbReaderAttr;

189 190 191 192 193 194
// Result block handed back by the reader, together with its ownership flag and row capacity.
typedef struct SResultBlockInfo {
  SSDataBlock* pResBlock;  // caller-supplied block, or one created by createResBlock()
  bool         freeBlock;  // set to true by initResBlockInfo() when the block was created there
  int64_t      capacity;   // capacity requested for pResBlock
} SResultBlockInfo;

H
Hongze Cheng 已提交
195
struct STsdbReader {
H
Haojun Liao 已提交
196
  STsdb*             pTsdb;
197 198
  SVersionRange      verRange;
  TdThreadMutex      readerMutex;
H
Haojun Liao 已提交
199 200
  EReaderStatus      flag;
  int32_t            code;
H
Haojun Liao 已提交
201 202
  uint64_t           suid;
  int16_t            order;
D
dapan1121 已提交
203 204
  EReadMode          readMode;
  uint64_t           rowsNum;
H
Haojun Liao 已提交
205
  STimeWindow        window;  // the primary query time window that applies to all queries
206
  SResultBlockInfo   resBlockInfo;
H
Haojun Liao 已提交
207
  SReaderStatus      status;
208 209
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
210
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
211
  STsdbReadSnap*     pReadSnap;
212
  SIOCostSummary     cost;
213
  SHashObj**         pIgnoreTables;
H
Haojun Liao 已提交
214 215 216 217 218 219
  STSchema*          pSchema;      // the newest version schema
  SSHashObj*         pSchemaMap;   // keep the retrieved schema info, to avoid the overhead by repeatly load schema
  SDataFReader*      pFileReader;  // the file reader
  SDelFReader*       pDelFReader;  // the del file reader
  SArray*            pDelIdx;      // del file block index;
  SBlockInfoBuf      blockInfoBuf;
220
  EContentData       step;
H
Haojun Liao 已提交
221
  STsdbReader*       innerReader[2];
H
Hongze Cheng 已提交
222
};
H
Hongze Cheng 已提交
223

H
Haojun Liao 已提交
224
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
225 226
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
227
static TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
H
Haojun Liao 已提交
228
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
H
Hongze Cheng 已提交
229
static int32_t  doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
230
                                       SRowMerger* pMerger, SVersionRange* pVerRange, const char* id);
H
Haojun Liao 已提交
231
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, STsdbReader* pReader);
H
Hongze Cheng 已提交
232
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow,
H
Haojun Liao 已提交
233
                                     STableBlockScanInfo* pScanInfo);
234
static int32_t  doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
H
Hongze Cheng 已提交
235
                                         int32_t rowIndex);
236
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
237
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
H
Hongze Cheng 已提交
238
                               SVersionRange* pVerRange);
239

H
Hongze Cheng 已提交
240
static int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
H
Haojun Liao 已提交
241
                                        TSDBROW* pResRow, STsdbReader* pReader, bool* freeTSRow);
H
Hongze Cheng 已提交
242
static int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo,
H
Hongze Cheng 已提交
243
                                  STsdbReader* pReader, SRow** pTSRow);
244 245
static int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                                     STsdbReader* pReader);
246

dengyihao's avatar
dengyihao 已提交
247 248 249 250
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
251
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Hongze Cheng 已提交
252 253
static bool          hasDataInLastBlock(SLastBlockReader* pLastBlockReader);
static int32_t       doBuildDataBlock(STsdbReader* pReader);
C
Cary Xu 已提交
254
static TSDBKEY       getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
255
static bool          hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo);
256
static void          initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter);
257
static int32_t       getInitialDelIndex(const SArray* pDelSkyline, int32_t order);
C
Cary Xu 已提交
258

H
Haojun Liao 已提交
259
static STableBlockScanInfo* getTableBlockScanInfo(SSHashObj* pTableMap, uint64_t uid, const char* id);
260

C
Cary Xu 已提交
261
// Returns true when `ts` lies outside the closed interval [pWindow->skey, pWindow->ekey].
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) {
  const bool inside = (ts >= pWindow->skey) && (ts <= pWindow->ekey);
  return !inside;
}
H
Haojun Liao 已提交
262

263 264
/*
 * Record the queried column ids and their output slot ids in pSupInfo and prepare the per-column
 * string build buffers.
 *
 * One allocation backs three consecutive arrays of numOfCols entries each: colId (int16_t),
 * slotId (int16_t) and buildBuf (char*). colId is the base address of that allocation; slotId and
 * buildBuf point into it. buildBuf[i] is a separately allocated buffer of pCols[i].bytes bytes for
 * variable-length column types and NULL for every other column.
 *
 * Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when an allocation fails. When the array
 * allocation itself fails, pSupInfo->numOfCols is left untouched; when a build buffer fails, every
 * buildBuf entry is either a valid buffer or NULL, so the arrays can be released uniformly.
 *
 * NOTE(review): buildBuf starts 4 * numOfCols bytes into the allocation, which is not pointer-aligned
 * for an odd numOfCols on 64-bit targets -- confirm this is acceptable on all supported platforms.
 */
static int32_t setColumnIdSlotList(SBlockLoadSuppInfo* pSupInfo, SColumnInfo* pCols, const int32_t* pSlotIdList,
                                   int32_t numOfCols) {
  pSupInfo->smaValid = true;

  int16_t* pIds = taosMemoryMalloc(numOfCols * (sizeof(int16_t) * 2 + POINTER_BYTES));
  if (pIds == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  pSupInfo->colId = pIds;
  pSupInfo->slotId = (int16_t*)((char*)pSupInfo->colId + (sizeof(int16_t) * numOfCols));
  pSupInfo->buildBuf = (char**)((char*)pSupInfo->slotId + (sizeof(int16_t) * numOfCols));
  pSupInfo->numOfCols = numOfCols;

  // Clear all build buffers first so that an early failure below never leaves garbage pointers behind.
  for (int32_t i = 0; i < numOfCols; ++i) {
    pSupInfo->buildBuf[i] = NULL;
  }

  for (int32_t i = 0; i < numOfCols; ++i) {
    pSupInfo->colId[i] = pCols[i].colId;
    pSupInfo->slotId[i] = pSlotIdList[i];

    if (IS_VAR_DATA_TYPE(pCols[i].type)) {
      pSupInfo->buildBuf[i] = taosMemoryMalloc(pCols[i].bytes);
      if (pSupInfo->buildBuf[i] == NULL) {
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
288

H
Haojun Liao 已提交
289
/*
 * Decide whether block SMA can be used for the queried columns.
 *
 * Walks the schema columns and pSupInfo->colId side by side. As soon as a queried column is found
 * whose IS_BSMA_ON() check fails, smaValid is cleared and the walk stops with success. Schema columns
 * with a smaller id than the current queried id are skipped. A schema column id larger than the
 * current queried id yields TSDB_CODE_INVALID_PARA.
 *
 * NOTE(review): the merge-style walk presumably relies on both lists being sorted by ascending
 * column id -- confirm with the callers.
 */
static int32_t updateBlockSMAInfo(STSchema* pSchema, SBlockLoadSuppInfo* pSupInfo) {
  int32_t schemaIdx = 0;
  int32_t queryIdx = 0;

  while (schemaIdx < pSchema->numOfCols && queryIdx < pSupInfo->numOfCols) {
    STColumn* pTCol = &pSchema->columns[schemaIdx];

    if (pTCol->colId > pSupInfo->colId[queryIdx]) {
      return TSDB_CODE_INVALID_PARA;
    }

    if (pTCol->colId < pSupInfo->colId[queryIdx]) {
      // schema column is not queried, skip it
      schemaIdx += 1;
      continue;
    }

    // ids match: the queried column must have block SMA switched on
    if (!IS_BSMA_ON(pTCol)) {
      pSupInfo->smaValid = false;
      return TSDB_CODE_SUCCESS;
    }

    schemaIdx += 1;
    queryIdx += 1;
  }

  return TSDB_CODE_SUCCESS;
}

313
/*
 * Size the scan-info pool for numOfTables entries.
 *
 * Appends numOfTables / numPerBucket full buckets and, when the division leaves a remainder, one
 * smaller bucket holding the remaining entries. Buckets are zero-initialized. pBuf->pData is created
 * here when it is still NULL. pBuf->numOfTables is updated only on success.
 *
 * Returns TSDB_CODE_SUCCESS or TSDB_CODE_OUT_OF_MEMORY. Buckets already appended before a failure
 * stay in pBuf->pData.
 */
static int32_t initBlockScanInfoBuf(SBlockInfoBuf* pBuf, int32_t numOfTables) {
  int32_t num = numOfTables / pBuf->numPerBucket;
  int32_t remainder = numOfTables % pBuf->numPerBucket;

  if (pBuf->pData == NULL) {
    pBuf->pData = taosArrayInit(num + 1, POINTER_BYTES);
    if (pBuf->pData == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  for (int32_t i = 0; i < num; ++i) {
    char* p = taosMemoryCalloc(pBuf->numPerBucket, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);  // the array did not take the bucket, do not leak it
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  if (remainder > 0) {
    char* p = taosMemoryCalloc(remainder, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  pBuf->numOfTables = numOfTables;

  return TSDB_CODE_SUCCESS;
}

/*
 * Grow the scan-info pool so that it can hold at least numOfTables entries. Does nothing when the
 * pool is already large enough.
 *
 * The last bucket may be a partial one, so it is dropped (its content is discarded) and the pool is
 * re-extended from the remaining buckets. After the drop, numOfTables is recomputed as
 * (remaining buckets * numPerBucket). The previous code divided the table count by numPerBucket,
 * which mixes units and under-allocates when the old count was exactly one full bucket.
 *
 * Returns TSDB_CODE_SUCCESS or TSDB_CODE_OUT_OF_MEMORY. On failure pBuf->numOfTables reflects the
 * entries that are still guaranteed to be backed by full buckets.
 */
static int32_t ensureBlockScanInfoBuf(SBlockInfoBuf* pBuf, int32_t numOfTables) {
  if (numOfTables <= pBuf->numOfTables) {
    return TSDB_CODE_SUCCESS;
  }

  if (pBuf->numOfTables > 0) {
    STableBlockScanInfo** p = (STableBlockScanInfo**)taosArrayPop(pBuf->pData);
    if (p != NULL) {
      taosMemoryFree(*p);
    }

    // every bucket that is left is a full one
    pBuf->numOfTables = (int32_t)taosArrayGetSize(pBuf->pData) * pBuf->numPerBucket;
  }

  int32_t num = (numOfTables - pBuf->numOfTables) / pBuf->numPerBucket;
  int32_t remainder = (numOfTables - pBuf->numOfTables) % pBuf->numPerBucket;

  if (pBuf->pData == NULL) {
    pBuf->pData = taosArrayInit(num + 1, POINTER_BYTES);
    if (pBuf->pData == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  for (int32_t i = 0; i < num; ++i) {
    char* p = taosMemoryCalloc(pBuf->numPerBucket, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    // keep the accounting in step with the buckets that are really available
    pBuf->numOfTables += pBuf->numPerBucket;
  }

  if (remainder > 0) {
    char* p = taosMemoryCalloc(remainder, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    if (taosArrayPush(pBuf->pData, &p) == NULL) {
      taosMemoryFree(p);
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  pBuf->numOfTables = numOfTables;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
380

381 382
/*
 * Release every bucket of the scan-info pool and the bucket array itself.
 *
 * pData is reset to the value returned by taosArrayDestroy() and numOfTables to 0, so the pool does
 * not keep a dangling pointer or a stale size after the call.
 */
static void clearBlockScanInfoBuf(SBlockInfoBuf* pBuf) {
  size_t num = taosArrayGetSize(pBuf->pData);
  for (size_t i = 0; i < num; ++i) {
    char** p = taosArrayGet(pBuf->pData, i);
    taosMemoryFree(*p);
  }

  pBuf->pData = taosArrayDestroy(pBuf->pData);
  pBuf->numOfTables = 0;
}

/*
 * Return the address of entry `index` in the scan-info pool: bucket (index / numPerBucket),
 * slot (index % numPerBucket). No bounds checking is done; the caller must have sized the pool for
 * at least index + 1 entries.
 */
static void* getPosInBlockInfoBuf(SBlockInfoBuf* pBuf, int32_t index) {
  int32_t bucketIndex = index / pBuf->numPerBucket;
  int32_t slotIndex = index % pBuf->numPerBucket;

  char** pBucket = taosArrayGet(pBuf->pData, bucketIndex);
  return (*pBucket) + slotIndex * sizeof(STableBlockScanInfo);
}

H
Haojun Liao 已提交
397 398 399 400 401 402 403 404 405 406
// Three-way comparison of two uint64_t values referenced by p1 and p2; used as the comparator when
// sorting the table uid list. Returns -1, 0 or 1.
static int32_t uidComparFunc(const void* p1, const void* p2) {
  const uint64_t lhs = *(uint64_t*)p1;
  const uint64_t rhs = *(uint64_t*)p2;

  if (lhs < rhs) {
    return -1;
  }

  return (lhs > rhs) ? 1 : 0;
}

407
// NOTE: speedup the whole processing by preparing the buffer for STableBlockScanInfo in batch model
H
Haojun Liao 已提交
408
static SSHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, SBlockInfoBuf* pBuf, const STableKeyInfo* idList,
X
Xiaoyu Wang 已提交
409
                                         STableUidList* pUidList, int32_t numOfTables) {
H
Haojun Liao 已提交
410
  // allocate buffer in order to load data blocks from file
411
  // todo use simple hash instead, optimize the memory consumption
H
Haojun Liao 已提交
412
  SSHashObj* pTableMap = tSimpleHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT));
413
  if (pTableMap == NULL) {
H
Haojun Liao 已提交
414 415 416
    return NULL;
  }

H
Haojun Liao 已提交
417
  int64_t st = taosGetTimestampUs();
H
Haojun Liao 已提交
418
  initBlockScanInfoBuf(pBuf, numOfTables);
H
Haojun Liao 已提交
419

H
Haojun Liao 已提交
420 421
  pUidList->tableUidList = taosMemoryMalloc(numOfTables * sizeof(uint64_t));
  if (pUidList->tableUidList == NULL) {
H
Haojun Liao 已提交
422
    tSimpleHashCleanup(pTableMap);
H
Haojun Liao 已提交
423 424
    return NULL;
  }
H
Haojun Liao 已提交
425

H
Haojun Liao 已提交
426
  pUidList->currentIndex = 0;
H
Haojun Liao 已提交
427

428
  for (int32_t j = 0; j < numOfTables; ++j) {
H
Haojun Liao 已提交
429
    STableBlockScanInfo* pScanInfo = getPosInBlockInfoBuf(pBuf, j);
H
Haojun Liao 已提交
430

431
    pScanInfo->uid = idList[j].uid;
H
Haojun Liao 已提交
432
    pUidList->tableUidList[j] = idList[j].uid;
H
Haojun Liao 已提交
433

434
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
H
Haojun Liao 已提交
435
      int64_t skey = pTsdbReader->window.skey;
436
      pScanInfo->lastKey = (skey > INT64_MIN) ? (skey - 1) : skey;
H
Haojun Liao 已提交
437
      pScanInfo->lastKeyInStt = skey;
wmmhello's avatar
wmmhello 已提交
438
    } else {
H
Haojun Liao 已提交
439
      int64_t ekey = pTsdbReader->window.ekey;
440
      pScanInfo->lastKey = (ekey < INT64_MAX) ? (ekey + 1) : ekey;
H
Haojun Liao 已提交
441
      pScanInfo->lastKeyInStt = ekey;
H
Haojun Liao 已提交
442
    }
wmmhello's avatar
wmmhello 已提交
443

H
Haojun Liao 已提交
444
    tSimpleHashPut(pTableMap, &pScanInfo->uid, sizeof(uint64_t), &pScanInfo, POINTER_BYTES);
H
Hongze Cheng 已提交
445 446
    tsdbTrace("%p check table uid:%" PRId64 " from lastKey:%" PRId64 " %s", pTsdbReader, pScanInfo->uid,
              pScanInfo->lastKey, pTsdbReader->idStr);
H
Haojun Liao 已提交
447 448
  }

H
Haojun Liao 已提交
449
  taosSort(pUidList->tableUidList, numOfTables, sizeof(uint64_t), uidComparFunc);
H
Haojun Liao 已提交
450

H
Haojun Liao 已提交
451 452 453 454
  pTsdbReader->cost.createScanInfoList = (taosGetTimestampUs() - st) / 1000.0;
  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, elapsed time:%.2f ms, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->cost.createScanInfoList,
            pTsdbReader->idStr);
455

456
  return pTableMap;
H
Hongze Cheng 已提交
457
}
H
Hongze Cheng 已提交
458

H
Haojun Liao 已提交
459 460 461 462 463
/*
 * Reset the scan position of every table in pTableMap: destroy both in-memory iterators and the
 * delete skyline, mark the iterators as uninitialized, and restart from lastKey = ts and
 * lastKeyInStt = ts + step.
 */
static void resetAllDataBlockScanInfo(SSHashObj* pTableMap, int64_t ts, int32_t step) {
  void*   pEntry = NULL;
  int32_t iter = 0;

  while ((pEntry = tSimpleHashIterate(pTableMap, pEntry, &iter)) != NULL) {
    STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)pEntry;

    pInfo->iterInit = false;
    pInfo->iter.hasVal = false;
    pInfo->iiter.hasVal = false;

    if (pInfo->iter.iter != NULL) {
      pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
    }

    if (pInfo->iiter.iter != NULL) {
      pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
    }

    pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
    pInfo->lastKey = ts;
    pInfo->lastKeyInStt = ts + step;
  }
}

484 485
/*
 * Release everything a single table scan info owns: both in-memory iterators, the delete skyline,
 * the block index list and the map data. The entry itself is not freed.
 */
static void clearBlockScanInfo(STableBlockScanInfo* p) {
  p->iterInit = false;

  p->iter.hasVal = false;
  p->iiter.hasVal = false;

  if (p->iter.iter != NULL) {
    p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
  }

  if (p->iiter.iter != NULL) {
    p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
  }

  p->delSkyline = taosArrayDestroy(p->delSkyline);
  p->pBlockList = taosArrayDestroy(p->pBlockList);
  tMapDataClear(&p->mapData);
}
502

H
Haojun Liao 已提交
503
/*
 * Clear every scan info referenced by pTableMap, then destroy the map. The scan info entries
 * themselves are not freed here.
 */
static void destroyAllBlockScanInfo(SSHashObj* pTableMap) {
  void*   pEntry = NULL;
  int32_t iter = 0;

  while ((pEntry = tSimpleHashIterate(pTableMap, pEntry, &iter)) != NULL) {
    clearBlockScanInfo(*(STableBlockScanInfo**)pEntry);
  }

  tSimpleHashCleanup(pTableMap);
}

514
// A window is empty when its start key is greater than its end key.
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) {
  return !(pWindow->skey <= pWindow->ekey);
}
H
Hongze Cheng 已提交
515

516 517 518
// Update the query time window according to the data retention (keep) configuration, in order to avoid
// returning expired data to the client even if it is still stored.
//
// Returns a copy of *pWindow whose skey is raised to (now - keep2 minutes + 1 tick) when it lies before
// that point; ekey is left unchanged. *pWindow itself is not modified.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
  STsdbKeepCfg* pCfg = &pTsdb->keepCfg;

  int64_t now = taosGetTimestamp(pCfg->precision);
  int64_t earliestTs = now - (tsTickPerMin[pCfg->precision] * pCfg->keep2) + 1;  // needs to add one tick

  STimeWindow win = *pWindow;
  if (win.skey < earliestTs) {
    win.skey = earliestTs;
  }

  return win;
}
H
Hongze Cheng 已提交
531

H
Haojun Liao 已提交
532
// init file iterator
533
/*
 * Prepare pIter to walk the file sets in aDFileSet in the reader's scan order.
 *
 * The cursor starts one position before the first file set: -1 for an ascending scan, the number of
 * file sets for a descending scan. The last-block reader and its load info are created on first use
 * and reused on later calls; order, window and version range are refreshed from the reader each time
 * and the merge tree is closed.
 *
 * Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY when the last-block reader cannot be allocated,
 * or terrno when the load info cannot be created.
 */
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet, STsdbReader* pReader) {
  int32_t numOfFileset = (int32_t)taosArrayGetSize(aDFileSet);

  pIter->index = ASCENDING_TRAVERSE(pReader->order) ? -1 : numOfFileset;
  pIter->order = pReader->order;
  pIter->pFileList = aDFileSet;
  pIter->numOfFiles = numOfFileset;

  if (pIter->pLastBlockReader == NULL) {
    pIter->pLastBlockReader = taosMemoryCalloc(1, sizeof(struct SLastBlockReader));
    if (pIter->pLastBlockReader == NULL) {
      int32_t code = TSDB_CODE_OUT_OF_MEMORY;
      tsdbError("failed to prepare the last block iterator, since:%s %s", tstrerror(code), pReader->idStr);
      return code;
    }
  }

  SLastBlockReader* pLReader = pIter->pLastBlockReader;
  pLReader->order = pReader->order;
  pLReader->window = pReader->window;
  pLReader->verRange = pReader->verRange;

  pLReader->uid = 0;
  tMergeTreeClose(&pLReader->mergeTree);

  if (pLReader->pInfo == NULL) {
    // here we ignore the first column, which is always the primary timestamp column
    SBlockLoadSuppInfo* pInfo = &pReader->suppInfo;

    int32_t numOfStt = pReader->pTsdb->pVnode->config.sttTrigger;
    pLReader->pInfo = tCreateLastBlockLoadInfo(pReader->pSchema, &pInfo->colId[1], pInfo->numOfCols - 1, numOfStt);
    if (pLReader->pInfo == NULL) {
      tsdbError("init fileset iterator failed, code:%s %s", tstrerror(terrno), pReader->idStr);
      return terrno;
    }
  }

  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
574
/*
 * Advance to the next file set that overlaps the query time window and open its file reader.
 *
 * *hasNext is set to true only when such a file set has been opened. It is false when the list is
 * exhausted, when the remaining file sets lie entirely past the query window in scan direction, or
 * when opening a file set fails (the open error is then returned). File sets that lie entirely
 * before the window in scan direction are skipped.
 *
 * Before moving on, the last-block load statistics are folded into pReader->cost and the last-block
 * reader is reset (uid cleared, merge tree closed, load info reset).
 */
static int32_t filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader, bool* hasNext) {
  bool    asc = ASCENDING_TRAVERSE(pIter->order);
  int32_t step = asc ? 1 : -1;
  int32_t code = 0;

  // every exit path except the successful one at the bottom of the loop reports "no next file set"
  *hasNext = false;

  pIter->index += step;
  if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
    return TSDB_CODE_SUCCESS;
  }

  SIOCostSummary* pSum = &pReader->cost;
  getLastBlockLoadInfo(pIter->pLastBlockReader->pInfo, &pSum->lastBlockLoad, &pReader->cost.lastBlockLoadTime);

  pIter->pLastBlockReader->uid = 0;
  tMergeTreeClose(&pIter->pLastBlockReader->mergeTree);
  resetLastBlockLoadInfo(pIter->pLastBlockReader->pInfo);

  // check file the time range of coverage
  STimeWindow win = {0};

  while (1) {
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }

    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);

    code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    pReader->cost.headFileLoad += 1;

    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);

    // current file are no longer overlapped with query time window, ignore remain files
    if ((asc && win.skey > pReader->window.ekey) || (!asc && win.ekey < pReader->window.skey)) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return TSDB_CODE_SUCCESS;
    }

    // the file set ends before the window starts (in scan direction): try the next one
    if ((asc && (win.ekey < pReader->window.skey)) || ((!asc) && (win.skey > pReader->window.ekey))) {
      pIter->index += step;
      if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
        return TSDB_CODE_SUCCESS;
      }
      continue;
    }

    tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
              pReader->window.ekey, pReader->idStr);
    *hasNext = true;
    return TSDB_CODE_SUCCESS;
  }
}

640
/*
 * Put the block iterator back to its initial state for the given order: no blocks, cursor at -1.
 * The block list is created on first use and merely emptied on later calls.
 */
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
  pIter->order = order;
  pIter->index = -1;
  pIter->numOfBlocks = 0;

  if (pIter->blockList != NULL) {
    taosArrayClear(pIter->blockList);
    return;
  }

  pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
}

L
Liu Jicong 已提交
651
// Destroy the iterator's block list. The pointer stored in pIter is left as it was.
static void cleanupDataBlockIterator(SDataBlockIter* pIter) {
  taosArrayDestroy(pIter->blockList);
}
H
Haojun Liao 已提交
652

H
Haojun Liao 已提交
653
// Initial reader status: no table iterator yet, and loadFromFile set to true.
static void initReaderStatus(SReaderStatus* pStatus) {
  pStatus->pTableIter = NULL;
  pStatus->loadFromFile = true;
}

658 659 660 661 662 663 664 665
/*
 * Create a result block with one column per entry of pCond->colList and room for `capacity` rows.
 *
 * Returns the new block, or NULL with terrno set (TSDB_CODE_OUT_OF_MEMORY when the block cannot be
 * created, otherwise the blockDataEnsureCapacity() error). On the capacity failure the block is
 * released with blockDataDestroy(), so the column descriptors appended above are released too;
 * freeing only the block header would leak them.
 */
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pResBlock = createDataBlock();
  if (pResBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    SColumnInfoData colInfo = {0};
    colInfo.info = pCond->colList[i];
    blockDataAppendColInfo(pResBlock, &colInfo);
  }

  int32_t code = blockDataEnsureCapacity(pResBlock, capacity);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    blockDataDestroy(pResBlock);
    return NULL;
  }

  return pResBlock;
}

680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
// Initialize the reader mutex, tracing before and after the call. Returns the
// taosThreadMutexInit() result; the pre-call trace always reports the placeholder code -1.
static int32_t tsdbInitReaderLock(STsdbReader* pReader) {
  qTrace("tsdb/read: %p, pre-init read mutex: %p, code: %d", pReader, &pReader->readerMutex, -1);

  const int32_t rc = taosThreadMutexInit(&pReader->readerMutex, NULL);
  qTrace("tsdb/read: %p, post-init read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Destroy the reader mutex, tracing before and after the call. Returns the
// taosThreadMutexDestroy() result; the pre-call trace always reports the placeholder code -1.
static int32_t tsdbUninitReaderLock(STsdbReader* pReader) {
  qTrace("tsdb/read: %p, pre-uninit read mutex: %p, code: %d", pReader, &pReader->readerMutex, -1);

  const int32_t rc = taosThreadMutexDestroy(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-uninit read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Lock the reader mutex, tracing before and after the call. Returns the taosThreadMutexLock()
// result; the pre-call trace always reports the placeholder code -1.
static int32_t tsdbAcquireReader(STsdbReader* pReader) {
  qTrace("tsdb/read: %p, pre-take read mutex: %p, code: %d", pReader, &pReader->readerMutex, -1);

  const int32_t rc = taosThreadMutexLock(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-take read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Try to lock the reader mutex, tracing before and after the call. Returns the
// taosThreadMutexTryLock() result; the pre-call trace always reports the placeholder code -1.
static int32_t tsdbTryAcquireReader(STsdbReader* pReader) {
  qTrace("tsdb/read: %p, pre-trytake read mutex: %p, code: %d", pReader, &pReader->readerMutex, -1);

  const int32_t rc = taosThreadMutexTryLock(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-trytake read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

// Unlock the reader mutex, tracing before and after the call. Returns the
// taosThreadMutexUnlock() result; the pre-call trace always reports the placeholder code -1.
static int32_t tsdbReleaseReader(STsdbReader* pReader) {
  qTrace("tsdb/read: %p, pre-untake read mutex: %p, code: %d", pReader, &pReader->readerMutex, -1);

  const int32_t rc = taosThreadMutexUnlock(&pReader->readerMutex);
  qTrace("tsdb/read: %p, post-untake read mutex: %p, code: %d", pReader, &pReader->readerMutex, rc);

  return rc;
}

735 736 737 738 739 740
// Release the reader mutex, except when status.composedDataBlock is set, in which case
// nothing is done.
void tsdbReleaseDataBlock(STsdbReader* pReader) {
  if (pReader->status.composedDataBlock) {
    return;
  }

  tsdbReleaseReader(pReader);
}
741

742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
// Set up the result-block descriptor of a reader.
//
// pResBlockInfo: descriptor to fill in.
// capacity:      row capacity recorded in the descriptor and passed to createResBlock().
// pResBlock:     caller-supplied result block, or NULL to have one created here.
// pCond:         query condition handed to createResBlock() when a block must be created.
//
// freeBlock records whether the block was created here (true) or supplied by the caller (false).
// terrno is reset to 0 on entry; the function returns terrno, so a failure reported by
// createResBlock() through terrno is propagated. If createResBlock() returns NULL without
// setting terrno, TSDB_CODE_OUT_OF_MEMORY is reported instead of success, because callers
// dereference the result block right after a successful return.
static int32_t initResBlockInfo(SResultBlockInfo* pResBlockInfo, int64_t capacity, SSDataBlock* pResBlock, SQueryTableDataCond* pCond) {
  pResBlockInfo->capacity = capacity;
  pResBlockInfo->pResBlock = pResBlock;
  terrno = 0;

  if (pResBlockInfo->pResBlock == NULL) {
    pResBlockInfo->freeBlock = true;
    pResBlockInfo->pResBlock = createResBlock(pCond, pResBlockInfo->capacity);

    // never report success together with a missing result block
    if (pResBlockInfo->pResBlock == NULL && terrno == 0) {
      terrno = TSDB_CODE_OUT_OF_MEMORY;
    }
  } else {
    pResBlockInfo->freeBlock = false;
  }

  return terrno;
}

757
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, void** ppReader, int32_t capacity,
H
Haojun Liao 已提交
758
                                SSDataBlock* pResBlock, const char* idstr) {
H
Haojun Liao 已提交
759
  int32_t      code = 0;
760
  int8_t       level = 0;
H
Haojun Liao 已提交
761
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
H
Hongze Cheng 已提交
762 763
  if (pReader == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
764
    goto _end;
H
Hongze Cheng 已提交
765 766
  }

C
Cary Xu 已提交
767
  if (VND_IS_TSMA(pVnode)) {
H
Haojun Liao 已提交
768
    tsdbDebug("vgId:%d, tsma is selected to query, %s", TD_VID(pVnode), idstr);
C
Cary Xu 已提交
769 770
  }

H
Haojun Liao 已提交
771
  initReaderStatus(&pReader->status);
772

L
Liu Jicong 已提交
773
  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
dengyihao's avatar
dengyihao 已提交
774 775
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
776

777
  pReader->idStr = (idstr != NULL) ? taosStrdup(idstr) : NULL;
dengyihao's avatar
dengyihao 已提交
778
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
779
  pReader->type = pCond->type;
780
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
H
Hongze Cheng 已提交
781
  pReader->blockInfoBuf.numPerBucket = 1000;  // 1000 tables per bucket
H
Hongze Cheng 已提交
782

783 784 785
  code = initResBlockInfo(&pReader->resBlockInfo, capacity, pResBlock, pCond);
  if (code != TSDB_CODE_SUCCESS) {
    goto _end;
H
Haojun Liao 已提交
786
  }
787

H
Haojun Liao 已提交
788 789 790 791 792
  if (pCond->numOfCols <= 0) {
    tsdbError("vgId:%d, invalid column number %d in query cond, %s", TD_VID(pVnode), pCond->numOfCols, idstr);
    code = TSDB_CODE_INVALID_PARA;
    goto _end;
  }
H
Hongze Cheng 已提交
793

794 795
  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
796
  pSup->pColAgg = taosArrayInit(pCond->numOfCols, sizeof(SColumnDataAgg));
H
Haojun Liao 已提交
797
  if (pSup->pColAgg == NULL) {
798 799 800
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }
H
Haojun Liao 已提交
801

802
  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
803
  setColumnIdSlotList(pSup, pCond->colList, pCond->pSlotList, pCond->numOfCols);
804

H
Hongze Cheng 已提交
805
  code = tBlockDataCreate(&pReader->status.fileBlockData);
H
Haojun Liao 已提交
806 807 808 809 810
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

811
  if (pReader->suppInfo.colId[0] != PRIMARYKEY_TIMESTAMP_COL_ID) {
812
    tsdbError("the first column isn't primary timestamp, %d, %s", pReader->suppInfo.colId[0], pReader->idStr);
K
kailixu 已提交
813
    code = TSDB_CODE_INVALID_PARA;
814 815 816
    goto _end;
  }

817
  pReader->status.pPrimaryTsCol = taosArrayGet(pReader->resBlockInfo.pResBlock->pDataBlock, pSup->slotId[0]);
818 819 820 821
  int32_t type = pReader->status.pPrimaryTsCol->info.type;
  if (type != TSDB_DATA_TYPE_TIMESTAMP) {
    tsdbError("the first column isn't primary timestamp in result block, actual: %s, %s", tDataTypes[type].name,
              pReader->idStr);
K
kailixu 已提交
822
    code = TSDB_CODE_INVALID_PARA;
823 824
    goto _end;
  }
825

826
  tsdbInitReaderLock(pReader);
827

H
Hongze Cheng 已提交
828 829
  *ppReader = pReader;
  return code;
H
Hongze Cheng 已提交
830

H
Haojun Liao 已提交
831 832
_end:
  tsdbReaderClose(pReader);
H
Hongze Cheng 已提交
833 834 835
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
836

H
Haojun Liao 已提交
837
// Collect, from the cached block index of the current data file, the SBlockIdx entries that
// belong to the tables being queried.
//
// pReader:     reader whose status.pTableMap / status.uidList describe the queried tables.
// pFileReader: reader of the data file; its block index is fetched through the biCache.
// pIndexList:  output array; each matching SBlockIdx is appended to it.
//
// Both the cached block index and pList->tableUidList are walked with a two-cursor merge.
// NOTE(review): this relies on both sequences being ordered by uid - confirm with the producers.
//
// Returns TSDB_CODE_SUCCESS, the error from tsdbCacheGetBlockIdx(), terrno when the scan info
// of a matching table cannot be found, or TSDB_CODE_OUT_OF_MEMORY when an array allocation
// fails. The cache handle is released on every path.
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
  int64_t    st = taosGetTimestampUs();
  LRUHandle* handle = NULL;
  int32_t    code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
  if (code != TSDB_CODE_SUCCESS || handle == NULL) {
    goto _end;
  }

  int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);

  SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
  size_t  num = taosArrayGetSize(aBlockIdx);
  if (num == 0) {
    // nothing in this file; code is TSDB_CODE_SUCCESS here
    goto _end;
  }

  // todo binary search to the start position
  int64_t et1 = taosGetTimestampUs();

  SBlockIdx*     pBlockIdx = NULL;
  STableUidList* pList = &pReader->status.uidList;

  int32_t i = 0, j = 0;
  while (i < num && j < numOfTables) {
    pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);

    // entry of another super table, or of a table with a smaller uid than the current candidate
    if (pBlockIdx->suid != pReader->suid || pBlockIdx->uid < pList->tableUidList[j]) {
      i += 1;
      continue;
    }

    // the current candidate table has no entry in this file
    if (pBlockIdx->uid > pList->tableUidList[j]) {
      j += 1;
      continue;
    }

    // uid matches: the entry belongs to a queried table
    STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockIdx->uid, pReader->idStr);
    if (pScanInfo == NULL) {
      code = terrno;
      goto _end;
    }

    if (pScanInfo->pBlockList == NULL) {
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(SBlockIndex));
      if (pScanInfo->pBlockList == NULL) {
        code = TSDB_CODE_OUT_OF_MEMORY;
        goto _end;
      }
    }

    // a failed push would silently drop the blocks of this table from the result
    if (taosArrayPush(pIndexList, pBlockIdx) == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _end;
    }

    i += 1;
    j += 1;
  }

  int64_t et2 = taosGetTimestampUs();
  tsdbDebug("load block index for %d/%d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
            numOfTables, (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0,
            pReader->idStr);

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

_end:
  if (handle != NULL) {
    tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
  }
  return code;
}
H
Hongze Cheng 已提交
908

909 910 911 912 913 914 915 916 917 918 919 920
// Drop the per-file state of one table: empty its block list and clear its map data, so that
// the scan info can be refilled when a new file is handled.
static void doCleanupTableScanInfo(STableBlockScanInfo* pScanInfo) {
  taosArrayClear(pScanInfo->pBlockList);
  tMapDataClear(&pScanInfo->mapData);
}

// Run doCleanupTableScanInfo() on every entry of pStatus->pTableMap and set mapDataCleaned.
// Does nothing when mapDataCleaned is already set.
static void cleanupTableScanInfo(SReaderStatus* pStatus) {
  if (pStatus->mapDataCleaned) {
    return;
  }

  int32_t cursor = 0;
  for (STableBlockScanInfo** ppInfo = tSimpleHashIterate(pStatus->pTableMap, NULL, &cursor); ppInfo != NULL;
       ppInfo = tSimpleHashIterate(pStatus->pTableMap, ppInfo, &cursor)) {
    doCleanupTableScanInfo(*ppInfo);
  }

  pStatus->mapDataCleaned = true;
}

936
// Load the data-block descriptors of every table listed in pIndexList and record, per table,
// the blocks that overlap the remaining query time window and the query version range.
//
// pReader:            reader; its window, verRange, order and pFileReader are used.
// pIndexList:         SBlockIdx entries of the tables to load.
// pBlockNum:          numOfBlocks is incremented per qualified block; numOfLastFiles is set from
//                     the file set's nSttF.
// pTableScanInfoList: receives a pointer to the scan info of every table that ends up with at
//                     least one block in its block list.
//
// Returns TSDB_CODE_SUCCESS, terrno when a table's scan info cannot be found, the error of
// tsdbReadDataBlk(), or TSDB_CODE_OUT_OF_MEMORY when an array push fails.
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, SBlockNumber* pBlockNum, SArray* pTableScanInfoList) {
  size_t  sizeInDisk = 0;
  size_t  numOfTables = taosArrayGetSize(pIndexList);

  int64_t st = taosGetTimestampUs();
  cleanupTableScanInfo(&pReader->status);

  // set the flag for the new file
  pReader->status.mapDataCleaned = false;

  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;

  for (size_t i = 0; i < numOfTables; ++i) {
    SBlockIdx*           pBlockIdx = taosArrayGet(pIndexList, i);
    STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockIdx->uid, pReader->idStr);
    if (pScanInfo == NULL) {
      return terrno;
    }

    tMapDataReset(&pScanInfo->mapData);

    // a failed read must not be mistaken for "this table has no blocks in the file"
    int32_t code = tsdbReadDataBlk(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("%p failed to read data block index, code:%s %s", pReader, tstrerror(code), pReader->idStr);
      return code;
    }

    taosArrayEnsureCap(pScanInfo->pBlockList, pScanInfo->mapData.nItem);

    sizeInDisk += pScanInfo->mapData.nData;

    // narrow the query window to the part after the last key already returned for this table
    STimeWindow w = pReader->window;
    if (asc) {
      w.skey = pScanInfo->lastKey + step;
    } else {
      w.ekey = pScanInfo->lastKey + step;
    }

    if (isEmptyQueryTimeWindow(&w)) {
      continue;
    }

    SDataBlk block = {0};
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      tGetDataBlk(pScanInfo->mapData.pData + pScanInfo->mapData.aOffset[j], &block);

      // 1. time range check
      if (block.minKey.ts > w.ekey || block.maxKey.ts < w.skey) {
        continue;
      }

      // 2. version range check
      if (block.minVer > pReader->verRange.maxVer || block.maxVer < pReader->verRange.minVer) {
        continue;
      }

      SBlockIndex bIndex = {.ordinalIndex = j, .inFileOffset = block.aSubBlock->offset};
      bIndex.window = (STimeWindow){.skey = block.minKey.ts, .ekey = block.maxKey.ts};

      void* p1 = taosArrayPush(pScanInfo->pBlockList, &bIndex);
      if (p1 == NULL) {
        tMapDataClear(&pScanInfo->mapData);
        return TSDB_CODE_OUT_OF_MEMORY;
      }

      pBlockNum->numOfBlocks += 1;
    }

    if (taosArrayGetSize(pScanInfo->pBlockList) > 0) {
      // losing this entry would hide all blocks of the table from the caller
      if (taosArrayPush(pTableScanInfoList, &pScanInfo) == NULL) {
        tMapDataClear(&pScanInfo->mapData);
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }
  }

  pBlockNum->numOfLastFiles = pReader->pFileReader->pSet->nSttF;
  int32_t total = pBlockNum->numOfLastFiles + pBlockNum->numOfBlocks;

  double el = (taosGetTimestampUs() - st) / 1000.0;
  tsdbDebug(
      "load block of %" PRId64 " tables completed, blocks:%d in %d tables, last-files:%d, block-info-size:%.2f Kb, "
      "elapsed time:%.2f ms %s",
      (int64_t)numOfTables, pBlockNum->numOfBlocks, (int32_t)taosArrayGetSize(pTableScanInfoList),
      pBlockNum->numOfLastFiles, sizeInDisk / 1000.0, el, pReader->idStr);

  pReader->cost.numOfBlocks += total;
  pReader->cost.headFileLoadTime += el;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1017

1018
// Mark the current file block as completely dumped and record the key right after maxKey in
// scan direction (maxKey + 1 for ascending order, maxKey - 1 otherwise) as the last key.
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, int64_t maxKey, int32_t order) {
  pDumpInfo->allDumped = true;
  pDumpInfo->lastKey = ASCENDING_TRAVERSE(order) ? (maxKey + 1) : (maxKey - 1);
}

D
dapan1121 已提交
1024
static int32_t doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
1025
                            SBlockLoadSuppInfo* pSup) {
H
Haojun Liao 已提交
1026
  if (IS_VAR_DATA_TYPE(pColVal->type)) {
H
Hongze Cheng 已提交
1027
    if (!COL_VAL_IS_VALUE(pColVal)) {
1028
      colDataSetNULL(pColInfoData, rowIndex);
H
Haojun Liao 已提交
1029 1030
    } else {
      varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
D
dapan1121 已提交
1031
      if (pColVal->value.nData > pColInfoData->info.bytes) {
1032 1033
        tsdbWarn("column cid:%d actual data len %d is bigger than schema len %d", pColVal->cid, pColVal->value.nData,
                 pColInfoData->info.bytes);
D
dapan1121 已提交
1034 1035
        return TSDB_CODE_TDB_INVALID_TABLE_SCHEMA_VER;
      }
1036 1037 1038 1039
      if (pColVal->value.nData > 0) {  // pData may be null, if nData is 0
        memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
      }

1040
      colDataSetVal(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
H
Haojun Liao 已提交
1041 1042
    }
  } else {
1043
    colDataSetVal(pColInfoData, rowIndex, (const char*)&pColVal->value, !COL_VAL_IS_VALUE(pColVal));
H
Haojun Liao 已提交
1044
  }
D
dapan1121 已提交
1045 1046

  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
1047 1048
}

1049
// Return the block info at the iterator's current index, or NULL when the iterator's block
// list is empty.
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
  size_t total = taosArrayGetSize(pBlockIter->blockList);
  if (total != 0) {
    return (SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, pBlockIter->index);
  }

  // an empty list must agree with the iterator's own block counter
  ASSERT(pBlockIter->numOfBlocks == total);
  return NULL;
}

H
Hongze Cheng 已提交
1060
// Return the address of the data block descriptor embedded in the iterator.
static SDataBlk* getCurrentBlock(SDataBlockIter* pBlockIter) {
  SDataBlk* pCurrent = &pBlockIter->block;
  return pCurrent;
}
1061

C
Cary Xu 已提交
1062 1063 1064 1065 1066 1067
// Binary search for a boundary position in a key list (assumed to be sorted ascending).
//
// keyList: array of num keys.
// num:     number of keys in keyList.
// pos:     anchor position; the search range is [pos, num - 1] for TSDB_ORDER_ASC and
//          [0, pos] for any other order.
// key:     key to locate.
// order:   TSDB_ORDER_ASC or descending.
//
// TSDB_ORDER_ASC: returns the last position in [pos, num - 1] whose key is <= key, or -1 when
//                 key is smaller than keyList[pos].
// otherwise:      returns the first position in [0, pos] whose key is >= key, or -1 when key is
//                 larger than keyList[pos].
//
// Invalid arguments (NULL list, empty list, pos outside [0, num)) yield -1 instead of reading
// outside the array; callers already treat -1 as "no position found".
static int doBinarySearchKey(TSKEY* keyList, int num, int pos, TSKEY key, int order) {
  if (keyList == NULL || num <= 0 || pos < 0 || pos >= num) {
    return -1;
  }

  // start end position
  int s = pos;
  int e;

  if (order == TSDB_ORDER_ASC) {
    e = num - 1;
    if (key < keyList[pos]) return -1;

    while (1) {
      // check can return
      if (key >= keyList[e]) return e;
      if (key <= keyList[s]) return s;
      if (e - s <= 1) return s;

      // change start or end position
      int mid = s + (e - s + 1) / 2;
      if (keyList[mid] > key) {
        e = mid;
      } else if (keyList[mid] < key) {
        s = mid;
      } else {
        return mid;
      }
    }
  } else {  // DESC
    e = 0;
    if (key > keyList[pos]) return -1;

    while (1) {
      // check can return
      if (key <= keyList[e]) return e;
      if (key >= keyList[s]) return s;
      if (s - e <= 1) return s;

      // change start or end position
      int mid = s - (s - e + 1) / 2;
      if (keyList[mid] < key) {
        e = mid;
      } else if (keyList[mid] > key) {
        s = mid;
      } else {
        return mid;
      }
    }
  }
}

H
Haojun Liao 已提交
1110
// Find the last row (in scan direction) of the current data block that may be returned,
// starting from row pos.
//
// The end position is first bounded by the query time window: the whole block when the window
// covers the block's far edge, otherwise the position located by doBinarySearchKey(). When the
// query version range only partially overlaps the block's version range, the position is then
// moved back towards the start until a row inside the version bound is found.
//
// Returns the row index of the end position, or -1 when no row qualifies. For descending order
// the version scan may also return pBlock->nRow when no row satisfies the minimum version.
static int32_t getEndPosInDataBlock(STsdbReader* pReader, SBlockData* pBlockData, SDataBlk* pBlock, int32_t pos) {
  // NOTE: reverse the order to find the end position in data block
  int32_t endPos = -1;
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

  if (asc && pReader->window.ekey >= pBlock->maxKey.ts) {
    endPos = pBlock->nRow - 1;
  } else if (!asc && pReader->window.skey <= pBlock->minKey.ts) {
    endPos = 0;
  } else {
    int64_t key = asc ? pReader->window.ekey : pReader->window.skey;
    endPos = doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, key, pReader->order);
  }

  // nothing qualifies by time; the version scan below must not start from a negative index
  // (for descending order it would read aVersion[-1])
  if (endPos < 0) {
    return -1;
  }

  if ((pReader->verRange.maxVer >= pBlock->minVer && pReader->verRange.maxVer < pBlock->maxVer) ||
      (pReader->verRange.minVer <= pBlock->maxVer && pReader->verRange.minVer > pBlock->minVer)) {
    int32_t i = endPos;

    if (asc) {
      for (; i >= 0; --i) {
        if (pBlockData->aVersion[i] <= pReader->verRange.maxVer) {
          break;
        }
      }
    } else {
      for (; i < pBlock->nRow; ++i) {
        if (pBlockData->aVersion[i] >= pReader->verRange.minVer) {
          break;
        }
      }
    }

    endPos = i;
  }

  return endPos;
}

H
Haojun Liao 已提交
1148
// Copy dumpedRows timestamps of the file block into the result column.
//
// Ascending: the rows [rowIndex, rowIndex + dumpedRows) are copied as they are.
// Descending: the rows [rowIndex - dumpedRows + 1, rowIndex] are copied and then reversed in
// place, so the result column holds them in descending order.
static void copyPrimaryTsCol(const SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
                             int32_t dumpedRows, bool asc) {
  if (asc) {
    memcpy(pColData->pData, &pBlockData->aTSKEY[pDumpInfo->rowIndex], dumpedRows * sizeof(int64_t));
    return;
  }

  int32_t firstRow = pDumpInfo->rowIndex - dumpedRows + 1;
  memcpy(pColData->pData, &pBlockData->aTSKEY[firstRow], dumpedRows * sizeof(int64_t));

  // todo: opt perf by extract the loop
  // reverse the copied timestamps with two cursors moving towards each other
  int64_t* pDst = (int64_t*)pColData->pData;
  for (int32_t lo = 0, hi = dumpedRows - 1; lo < hi; ++lo, --hi) {
    int64_t tmp = pDst[lo];
    pDst[lo] = pDst[hi];
    pDst[hi] = tmp;
  }
}

H
Haojun Liao 已提交
1168 1169
// A faster copy procedure for fixed-length columns.
//
// 1. The dumpedRows values are copied in one batch from the file column into the result column.
// 2. For a descending scan the copied values are reversed in place, using the element width
//    that matches the result column type.
// 3. When the file column is not value-only, each row's bit value is inspected and rows with
//    bit value 0 or 1 are flagged as NULL in the result column's null bitmap.
static void copyNumericCols(const SColData* pData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
                            int32_t dumpedRows, bool asc) {
  // lowest source row of the range to copy
  int32_t  firstRow = asc ? pDumpInfo->rowIndex : (pDumpInfo->rowIndex - dumpedRows + 1);
  uint8_t* pSrc = pData->pData + tDataTypes[pData->type].bytes * firstRow;

  // 1. copy data in a batch model
  memcpy(pColData->pData, pSrc, dumpedRows * tDataTypes[pData->type].bytes);

  // 2. reverse the array list in case of descending order scan data block
  if (!asc) {
    int32_t tail = dumpedRows - 1;

    switch (pColData->info.type) {
      case TSDB_DATA_TYPE_TIMESTAMP:
      case TSDB_DATA_TYPE_DOUBLE:
      case TSDB_DATA_TYPE_BIGINT:
      case TSDB_DATA_TYPE_UBIGINT: {
        int64_t* pVals = (int64_t*)pColData->pData;
        for (int32_t lo = 0, hi = tail; lo < hi; ++lo, --hi) {
          int64_t tmp = pVals[lo];
          pVals[lo] = pVals[hi];
          pVals[hi] = tmp;
        }
        break;
      }

      case TSDB_DATA_TYPE_BOOL:
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_UTINYINT: {
        int8_t* pVals = (int8_t*)pColData->pData;
        for (int32_t lo = 0, hi = tail; lo < hi; ++lo, --hi) {
          int8_t tmp = pVals[lo];
          pVals[lo] = pVals[hi];
          pVals[hi] = tmp;
        }
        break;
      }

      case TSDB_DATA_TYPE_SMALLINT:
      case TSDB_DATA_TYPE_USMALLINT: {
        int16_t* pVals = (int16_t*)pColData->pData;
        for (int32_t lo = 0, hi = tail; lo < hi; ++lo, --hi) {
          int16_t tmp = pVals[lo];
          pVals[lo] = pVals[hi];
          pVals[hi] = tmp;
        }
        break;
      }

      case TSDB_DATA_TYPE_FLOAT:
      case TSDB_DATA_TYPE_INT:
      case TSDB_DATA_TYPE_UINT: {
        int32_t* pVals = (int32_t*)pColData->pData;
        for (int32_t lo = 0, hi = tail; lo < hi; ++lo, --hi) {
          int32_t tmp = pVals[lo];
          pVals[lo] = pVals[hi];
          pVals[hi] = tmp;
        }
        break;
      }
    }
  }

  // 3. if the null value exists, check items one-by-one
  if (pData->flag != HAS_VALUE) {
    int32_t step = asc ? 1 : -1;

    for (int32_t dstRow = 0; dstRow < dumpedRows; ++dstRow) {
      // source rows are visited in scan direction, starting at the current dump position
      int32_t srcRow = pDumpInfo->rowIndex + dstRow * step;
      uint8_t v = tColDataGetBitValue(pData, srcRow);
      if (v == 0 || v == 1) {
        colDataSetNull_f(pColData->nullbitmap, dstRow);
        pColData->hasNull = true;
      }
    }
  }
}

1258
// Copy the qualified rows of the currently loaded file block into the reader's result block.
//
// Steps:
//   1. when the dump position is still at its initial place, locate the first row that
//      satisfies the query time window and version range;
//   2. locate the end row with getEndPosInDataBlock() and clip the row count to the capacity of
//      the result block;
//   3. copy the primary timestamp column, then every requested column; requested columns that
//      are missing in the file block are filled with NULL;
//   4. advance the dump position and mark the block as all dumped when nothing is left.
//
// Returns TSDB_CODE_SUCCESS (also when no row qualifies), TSDB_CODE_INVALID_PARA when the start
// position cannot be located, or the error reported by doCopyColVal().
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  SBlockData*         pBlockData = &pStatus->fileBlockData;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SSDataBlock*        pResBlock = pReader->resBlockInfo.pResBlock;
  int32_t             numOfOutputCols = pSupInfo->numOfCols;
  int32_t             code = TSDB_CODE_SUCCESS;

  SColVal cv = {0};
  int64_t st = taosGetTimestampUs();
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;

  // no data exists, return directly.
  if (pBlockData->nRow == 0 || pBlockData->aTSKEY == NULL) {
    tsdbWarn("%p no need to copy since no data in blockData, table uid:%" PRIu64 " has been dropped, %s", pReader,
             pBlockInfo->uid, pReader->idStr);
    pResBlock->info.rows = 0;
    return 0;
  }

  // row index of dump info remain the initial position, let's find the appropriate start position.
  if ((pDumpInfo->rowIndex == 0 && asc) || (pDumpInfo->rowIndex == pBlock->nRow - 1 && (!asc))) {
    if (asc && pReader->window.skey <= pBlock->minKey.ts && pReader->verRange.minVer <= pBlock->minVer) {
      // the whole block qualifies from its first row: keep rowIndex at 0
    } else if (!asc && pReader->window.ekey >= pBlock->maxKey.ts && pReader->verRange.maxVer >= pBlock->maxVer) {
      // the whole block qualifies from its last row: keep rowIndex at nRow - 1
    } else {  // find the appropriate the start position in current block, and set it to be the current rowIndex
      // the search runs against the scan direction, anchored at the far end of the block
      int32_t pos = asc ? pBlock->nRow - 1 : 0;
      int32_t order = asc ? TSDB_ORDER_DESC : TSDB_ORDER_ASC;
      int64_t key = asc ? pReader->window.skey : pReader->window.ekey;
      pDumpInfo->rowIndex = doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, key, order);

      if (pDumpInfo->rowIndex < 0) {
        tsdbError(
            "%p failed to locate the start position in current block, global index:%d, table index:%d, brange:%" PRId64
            "-%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 " %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->minVer,
            pBlock->maxVer, pReader->idStr);
        return TSDB_CODE_INVALID_PARA;
      }

      ASSERT(pReader->verRange.minVer <= pBlock->maxVer && pReader->verRange.maxVer >= pBlock->minVer);

      // find the appropriate start position that satisfies the version requirement.
      if ((pReader->verRange.maxVer >= pBlock->minVer && pReader->verRange.maxVer < pBlock->maxVer) ||
          (pReader->verRange.minVer <= pBlock->maxVer && pReader->verRange.minVer > pBlock->minVer)) {
        int32_t i = pDumpInfo->rowIndex;
        if (asc) {
          for (; i < pBlock->nRow; ++i) {
            if (pBlockData->aVersion[i] >= pReader->verRange.minVer) {
              break;
            }
          }
        } else {
          for (; i >= 0; --i) {
            if (pBlockData->aVersion[i] <= pReader->verRange.maxVer) {
              break;
            }
          }
        }

        pDumpInfo->rowIndex = i;

        // the scan ran off the block: no remaining row satisfies the version range. Stop here
        // instead of handing an out-of-range position to the end-position search.
        if (i < 0 || i >= pBlock->nRow) {
          setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
          return TSDB_CODE_SUCCESS;
        }
      }
    }
  }

  // time window check
  int32_t endIndex = getEndPosInDataBlock(pReader, pBlockData, pBlock, pDumpInfo->rowIndex);
  if (endIndex == -1) {
    setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
    return TSDB_CODE_SUCCESS;
  }

  // make endIndex exclusive in scan direction
  endIndex += step;
  int32_t dumpedRows = asc ? (endIndex - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex - endIndex);
  if (dumpedRows > pReader->resBlockInfo.capacity) {  // output buffer check
    dumpedRows = pReader->resBlockInfo.capacity;
  } else if (dumpedRows <= 0) {  // no qualified rows in current data block, abort directly.
    setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
    return TSDB_CODE_SUCCESS;
  }

  int32_t i = 0;
  int32_t rowIndex = 0;

  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
  if (pSupInfo->colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
    copyPrimaryTsCol(pBlockData, pDumpInfo, pColData, dumpedRows, asc);
    i += 1;
  }

  // merge-walk the requested columns (i) against the columns present in the file block (colIndex)
  int32_t colIndex = 0;
  int32_t num = pBlockData->nColData;
  while (i < numOfOutputCols && colIndex < num) {
    rowIndex = 0;

    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);
    if (pData->cid < pSupInfo->colId[i]) {
      colIndex += 1;
    } else if (pData->cid == pSupInfo->colId[i]) {
      pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);

      if (pData->flag == HAS_NONE || pData->flag == HAS_NULL || pData->flag == (HAS_NULL | HAS_NONE)) {
        colDataSetNNULL(pColData, 0, dumpedRows);
      } else {
        if (IS_MATHABLE_TYPE(pColData->info.type)) {
          copyNumericCols(pData, pDumpInfo, pColData, dumpedRows, asc);
        } else {  // varchar/nchar type
          for (int32_t j = pDumpInfo->rowIndex; rowIndex < dumpedRows; j += step) {
            tColDataGetValue(pData, j, &cv);
            code = doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
            if (code) {
              return code;
            }
          }
        }
      }

      colIndex += 1;
      i += 1;
    } else {  // the specified column does not exist in file block, fill with null data
      pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
      colDataSetNNULL(pColData, 0, dumpedRows);
      i += 1;
    }
  }

  // fill the mis-matched columns with null value
  while (i < numOfOutputCols) {
    pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
    colDataSetNNULL(pColData, 0, dumpedRows);
    i += 1;
  }

  pResBlock->info.dataLoad = 1;
  pResBlock->info.rows = dumpedRows;
  pDumpInfo->rowIndex += step * dumpedRows;

  // check if current block are all handled
  if (pDumpInfo->rowIndex >= 0 && pDumpInfo->rowIndex < pBlock->nRow) {
    int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
    if (outOfTimeWindow(ts, &pReader->window)) {  // the remain data has out of query time window, ignore current block
      setBlockAllDumped(pDumpInfo, ts, pReader->order);
    }
  } else {
    int64_t ts = asc ? pBlock->maxKey.ts : pBlock->minKey.ts;
    setBlockAllDumped(pDumpInfo, ts, pReader->order);
  }

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.blockLoadTime += elapsedTime;

  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
  tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", uid:%" PRIu64 " elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, dumpedRows,
            unDumpedRows, pBlock->minVer, pBlock->maxVer, pBlockInfo->uid, elapsedTime, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}

1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444
// Fetch the schema of table uid (requested with version -1) into pReader->pSchema and
// initialize the reader's row merger with it. Expects pReader->pSchema to be NULL on entry.
//
// Returns the schema, or NULL on failure; in that case terrno is set to the code returned by
// the failing call and an error is logged.
static FORCE_INLINE STSchema* getTableSchemaImpl(STsdbReader* pReader, uint64_t uid) {
  ASSERT(pReader->pSchema == NULL);

  int32_t ret = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, -1, &pReader->pSchema);
  if (ret == TSDB_CODE_SUCCESS && pReader->pSchema != NULL) {
    ret = tsdbRowMergerInit(&pReader->status.merger, pReader->pSchema);
    if (ret == TSDB_CODE_SUCCESS) {
      return pReader->pSchema;
    }

    terrno = ret;
    tsdbError("failed to init merger, code:%s, %s", tstrerror(ret), pReader->idStr);
    return NULL;
  }

  terrno = ret;
  tsdbError("failed to get table schema, uid:%" PRIu64 ", it may have been dropped, ver:-1, %s", uid, pReader->idStr);
  return NULL;
}

1445 1446
/*
 * Load the file data block that pBlockIter currently points at into pBlockData.
 *
 * pBlockData is reset first. If the reader has no schema yet it is fetched for table `uid`; when
 * that fails the function logs it and returns 0 without loading anything. The block data is then
 * initialized with the requested column ids, skipping the first entry of suppInfo.colId.
 *
 * On success the elapsed time is added to pReader->cost.blockLoadTime and the dump state is
 * marked as "not all dumped". Returns the status of the failing call otherwise.
 */
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter, SBlockData* pBlockData,
                                   uint64_t uid) {
  int32_t code = 0;
  int64_t startUs = taosGetTimestampUs();

  tBlockDataReset(pBlockData);

  STSchema* pTableSchema = pReader->pSchema;
  if (pTableSchema == NULL) {
    pTableSchema = getTableSchemaImpl(pReader, uid);
    if (pTableSchema == NULL) {
      tsdbDebug("%p table uid:%" PRIu64 " has been dropped, no data existed, %s", pReader, uid, pReader->idStr);
      return code;
    }
  }

  SBlockLoadSuppInfo* pSuppInfo = &pReader->suppInfo;
  TABLEID             tableId = {.suid = pReader->suid, .uid = uid};

  code = tBlockDataInit(pBlockData, &tableId, pTableSchema, &pSuppInfo->colId[1], pSuppInfo->numOfCols - 1);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);

  code = tsdbReadDataBlock(pReader->pFileReader, pBlock, pBlockData);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
              ", rows:%d, code:%s %s",
              pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
              tstrerror(code), pReader->idStr);
    return code;
  }

  double elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;

  tsdbDebug("%p load file block into buffer, global index:%d, index in table block list:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
            pBlock->minVer, pBlock->maxVer, elapsedMs, pReader->idStr);

  pReader->cost.blockLoadTime += elapsedMs;
  pDumpInfo->allDumped = false;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1493

H
Haojun Liao 已提交
1494 1495 1496
/*
 * Release every buffer owned by the block order supporter.
 *
 * Safe to call on a partially initialized supporter: initBlockOrderSupporter() invokes this after
 * an allocation failure, at which point pDataBlockInfo itself may be NULL, so the per-table
 * buffers are only visited when the pointer array exists. Each slot is cleared in place so no
 * dangling pointer survives in the array before the array is released.
 */
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);

  if (pSup->pDataBlockInfo != NULL) {
    for (int32_t i = 0; i < pSup->numOfTables; ++i) {
      taosMemoryFreeClear(pSup->pDataBlockInfo[i]);
    }
  }

  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
1505

H
Haojun Liao 已提交
1506 1507
/*
 * Allocate the three zero-filled per-table arrays of the block order supporter, each with room
 * for numOfTables entries.
 *
 * Returns TSDB_CODE_SUCCESS when all allocations succeed. If any of them fails, everything that
 * was allocated is released again and TSDB_CODE_OUT_OF_MEMORY is returned.
 */
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);

  bool allocated = (pSup->numOfBlocksPerTable != NULL) && (pSup->indexPerTable != NULL) &&
                   (pSup->pDataBlockInfo != NULL);
  if (allocated) {
    return TSDB_CODE_SUCCESS;
  }

  cleanupBlockOrderSupporter(pSup);
  return TSDB_CODE_OUT_OF_MEMORY;
}
H
Hongze Cheng 已提交
1518

H
Haojun Liao 已提交
1519
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
1520
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
1521
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
1522

H
Haojun Liao 已提交
1523
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
1524

H
Haojun Liao 已提交
1525 1526
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
1527

H
Haojun Liao 已提交
1528 1529 1530 1531 1532 1533 1534
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
1535

1536
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
1537
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
1538

1539 1540 1541
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

H
Haojun Liao 已提交
1542
/*
 * Decode the SDataBlk that the iterator currently points at into pBlockIter->block.
 *
 * Does nothing (and reports success) when the iterator has no current block info. Returns terrno
 * when the scan info of the owning table cannot be found in the table map.
 */
static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter, const char* idStr) {
  SFileDataBlockInfo* pCurrent = getCurrentBlockInfo(pBlockIter);
  if (pCurrent == NULL) {
    return TSDB_CODE_SUCCESS;
  }

  STableBlockScanInfo* pTableInfo = getTableBlockScanInfo(pBlockIter->pTableMap, pCurrent->uid, idStr);
  if (pTableInfo == NULL) {
    return terrno;
  }

  // the table-local block index selects the entry holding the ordinal position inside the map data
  SBlockIndex* pBlockIdx = taosArrayGet(pTableInfo->pBlockList, pCurrent->tbBlockIdx);
  tMapDataGetItemByIdx(&pTableInfo->mapData, pBlockIdx->ordinalIndex, &pBlockIter->block, tGetDataBlk);

#if 0
  qDebug("check file block, table uid:%"PRIu64" index:%d offset:%"PRId64", ", pScanInfo->uid, *mapDataIndex, pBlockIter->block.aSubBlock[0].offset);
#endif

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1560

1561
/*
 * Build the global block access order for the current file set.
 *
 * pTableList holds the scan info of every table that owns at least one block; numOfBlocks is the
 * total number of blocks the caller counted. The blocks of all tables are merged by their file
 * offset (multiway merge) into pBlockIter->blockList; with a single table the blocks are taken in
 * list order without sorting. Afterwards the iterator is positioned on the first block (ascending
 * scan) or the last block (descending scan) and that block is decoded.
 *
 * Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY on allocation failure, TSDB_CODE_INVALID_PARA
 * if the block/table counts are inconsistent, or the status of positioning on the current block.
 */
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks, SArray* pTableList) {
  bool asc = ASCENDING_TRAVERSE(pReader->order);

  SBlockOrderSupporter sup = {0};
  pBlockIter->numOfBlocks = numOfBlocks;
  taosArrayClear(pBlockIter->blockList);
  pBlockIter->pTableMap = pReader->status.pTableMap;

  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = taosArrayGetSize(pTableList);

  int64_t st = taosGetTimestampUs();
  int32_t code = initBlockOrderSupporter(&sup, numOfTables);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  int32_t cnt = 0;

  for (int32_t i = 0; i < numOfTables; ++i) {
    STableBlockScanInfo* pTableScanInfo = taosArrayGetP(pTableList, i);
    ASSERT(pTableScanInfo->pBlockList != NULL && taosArrayGetSize(pTableScanInfo->pBlockList) > 0);

    size_t num = taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = num;

    char* buf = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * num);
    if (buf == NULL) {
      cleanupBlockOrderSupporter(&sup);
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    sup.pDataBlockInfo[sup.numOfTables] = (SBlockOrderWrapper*)buf;

    for (int32_t k = 0; k < num; ++k) {
      SBlockIndex* pIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
      sup.pDataBlockInfo[sup.numOfTables][k] =
          (SBlockOrderWrapper){.uid = pTableScanInfo->uid, .offset = pIndex->inFileOffset};
      cnt++;
    }

    sup.numOfTables += 1;
  }

  // Either mismatch makes the indexes used below unreliable. The original test joined the two
  // conditions with &&, and since sup.numOfTables always equals numOfTables here, it never fired.
  if (numOfBlocks != cnt || sup.numOfTables != numOfTables) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_INVALID_PARA;
  }

  // since there is only one table qualified, blocks are not sorted
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      taosArrayPush(pBlockIter->blockList, &blockInfo);
    }

    int64_t et = taosGetTimestampUs();
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
              pReader, numOfBlocks, (et - st) / 1000.0, pReader->idStr);

    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
    cleanupBlockOrderSupporter(&sup);
    // report a failure to locate the current block instead of silently dropping it
    return doSetCurrentBlock(pBlockIter, pReader->idStr);
  }

  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);

  SMultiwayMergeTreeInfo* pTree = NULL;

  // keep the full-width status: narrowing it to uint8_t could hide a non-zero error code
  int32_t ret = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
  if (ret != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;

    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    taosArrayPush(pBlockIter->blockList, &blockInfo);

    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }

    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
  }

  int64_t et = taosGetTimestampUs();
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, numOfBlocks,
            (et - st) / 1000.0, pReader->idStr);
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);

  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
  // report a failure to locate the current block instead of silently dropping it
  return doSetCurrentBlock(pBlockIter, pReader->idStr);
}
H
Hongze Cheng 已提交
1666

H
Haojun Liao 已提交
1667
/*
 * Advance the block iterator by one block in its scan direction and decode the new current
 * block. Returns false, leaving the iterator untouched, when it already stands on the last block
 * (ascending) or the first block (descending); returns true otherwise.
 */
static bool blockIteratorNext(SDataBlockIter* pBlockIter, const char* idStr) {
  bool asc = ASCENDING_TRAVERSE(pBlockIter->order);

  if (asc) {
    if (pBlockIter->index >= pBlockIter->numOfBlocks - 1) {
      return false;
    }
  } else if (pBlockIter->index <= 0) {
    return false;
  }

  int32_t delta = asc ? 1 : -1;
  pBlockIter->index += delta;
  doSetCurrentBlock(pBlockIter, idStr);

  return true;
}

1681 1682 1683
/**
 * This is an two rectangles overlap cases.
 */
H
Hongze Cheng 已提交
1684
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SDataBlk* pBlock) {
1685 1686
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
H
Hongze Cheng 已提交
1687 1688
         (pVerRange->minVer > pBlock->minVer && pVerRange->minVer <= pBlock->maxVer) ||
         (pVerRange->maxVer < pBlock->maxVer && pVerRange->maxVer >= pBlock->minVer);
H
Haojun Liao 已提交
1689
}
H
Hongze Cheng 已提交
1690

1691
/*
 * Look up the block that follows pBlockInfo in scan order within the same table's block list.
 *
 * On success stores the neighbor's list position in *nextIndex, copies its SBlockIndex into
 * *pBlockIndex and returns true. Returns false, leaving both outputs untouched, when the current
 * block is the last one in the list (ascending) or the first one (descending).
 */
static bool getNeighborBlockOfSameTable(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                        int32_t* nextIndex, int32_t order, SBlockIndex* pBlockIndex) {
  bool asc = ASCENDING_TRAVERSE(order);

  if (asc) {
    if (pBlockInfo->tbBlockIdx >= taosArrayGetSize(pTableBlockScanInfo->pBlockList) - 1) {
      return false;
    }
  } else if (pBlockInfo->tbBlockIdx == 0) {
    return false;
  }

  *nextIndex = pBlockInfo->tbBlockIdx + (asc ? 1 : -1);

  SBlockIndex* pNeighbor = (SBlockIndex*)taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex);
  *pBlockIndex = *pNeighbor;
  return true;
}

/*
 * Search the global block list, starting at the iterator's current position and moving in scan
 * direction, for the entry with the same uid and table-local block index as pFBlockInfo.
 * Returns its position in the list, or -1 if no such entry is found.
 */
static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
  int32_t delta = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;

  for (int32_t pos = pBlockIter->index; pos < pBlockIter->numOfBlocks && pos >= 0; pos += delta) {
    SFileDataBlockInfo* pCandidate = taosArrayGet(pBlockIter->blockList, pos);

    bool sameTable = (pCandidate->uid == pFBlockInfo->uid);
    if (sameTable && pCandidate->tbBlockIdx == pFBlockInfo->tbBlockIdx) {
      return pos;
    }
  }

  return -1;
}

1725
/*
 * Make the block stored at position `index` of the global block list the iterator's current
 * block.
 *
 * The iterator is advanced by `step`; if the requested block is not already at the new position
 * it is removed from `index` and re-inserted there. The new current block is then decoded.
 * Returns -1 when `index` is outside the block list, TSDB_CODE_SUCCESS otherwise.
 */
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
  bool inRange = (index >= 0) && (index < pBlockIter->numOfBlocks);
  if (!inRange) {
    return -1;
  }

  // copy the entry by value: the slot it lives in may be shifted by the remove/insert below
  SFileDataBlockInfo moved = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);

  pBlockIter->index += step;

  if (pBlockIter->index != index) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, pBlockIter->index, &moved);

    SFileDataBlockInfo* pPlaced = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
    ASSERT(pPlaced->uid == moved.uid && pPlaced->tbBlockIdx == moved.tbBlockIdx);
  }

  doSetCurrentBlock(pBlockIter, "");
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
1745
// todo: this attribute could be acquired during extractin the global ordered block list.
1746
static bool overlapWithNeighborBlock(SDataBlk* pBlock, SBlockIndex* pNeighborBlockIndex, int32_t order) {
1747 1748
  // it is the last block in current file, no chance to overlap with neighbor blocks.
  if (ASCENDING_TRAVERSE(order)) {
1749
    return pBlock->maxKey.ts == pNeighborBlockIndex->window.skey;
1750
  } else {
1751
    return pBlock->minKey.ts == pNeighborBlockIndex->window.ekey;
1752
  }
H
Haojun Liao 已提交
1753
}
H
Hongze Cheng 已提交
1754

H
Hongze Cheng 已提交
1755
/*
 * Tell whether the buffered key lies at or before the block's first key (ascending scan), or at
 * or after the block's last key (descending scan). A key whose timestamp is TSKEY_INITIAL_VAL
 * never qualifies.
 */
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SDataBlk* pBlock) {
  if (key.ts == TSKEY_INITIAL_VAL) {
    return false;
  }

  if (ASCENDING_TRAVERSE(order)) {
    return key.ts <= pBlock->minKey.ts;
  }

  return key.ts >= pBlock->maxKey.ts;
}
H
Hongze Cheng 已提交
1761

H
Hongze Cheng 已提交
1762
/*
 * Tell whether the key's timestamp falls inside the block's key range while the block's version
 * range intersects the queried version range.
 */
static bool keyOverlapFileBlock(TSDBKEY key, SDataBlk* pBlock, SVersionRange* pVerRange) {
  bool tsInside = (key.ts >= pBlock->minKey.ts && key.ts <= pBlock->maxKey.ts);
  if (!tsInside) {
    return false;
  }

  if (!(pBlock->maxVer >= pVerRange->minVer)) {
    return false;
  }

  return (pBlock->minVer <= pVerRange->maxVer);
}

H
Hongze Cheng 已提交
1767 1768
/*
 * Walk the delete skyline from startIndex and decide whether any delete point affects the block.
 *
 * A point whose version is at least the block's minimum version counts when its timestamp lies
 * inside the block's key range, or when it lies before the block and the following skyline point
 * is at or after the block's first key. The walk stops with "no overlap" at the first point that
 * is neither before nor inside the block's key range.
 */
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock,
                                       int32_t startIndex) {
  size_t num = taosArrayGetSize(pBlockScanInfo->delSkyline);

  for (int32_t i = startIndex; i < num; i += 1) {
    TSDBKEY* pPoint = taosArrayGet(pBlockScanInfo->delSkyline, i);
    bool     beforeBlock = (pPoint->ts < pBlock->minKey.ts);

    // beyond the block: nothing further in the skyline can affect it
    if (!beforeBlock && pPoint->ts > pBlock->maxKey.ts) {
      return false;
    }

    // delete points older than every row of the block are irrelevant
    if (!(pPoint->version >= pBlock->minVer)) {
      continue;
    }

    if (!beforeBlock) {
      return true;
    }

    if (i < num - 1) {
      TSDBKEY* pFollowing = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
      if (pFollowing->ts >= pBlock->minKey.ts) {
        return true;
      }
    } else {  // it must be the last point
      ASSERT(pPoint->version == 0);
    }
  }

  return false;
}

H
Hongze Cheng 已提交
1796
/*
 * Tell whether the table's delete skyline affects the given block.
 *
 * Returns false when there is no skyline or when the block's key range lies entirely outside the
 * span between the first and last skyline points. Otherwise the detailed check starts at
 * fileDelIndex for ascending scans; for descending scans the start position is first moved back
 * towards the skyline point at or before the block's first key.
 */
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock, int32_t order) {
  if (pBlockScanInfo->delSkyline == NULL) {
    return false;
  }

  // ts is not overlap
  TSDBKEY* pHead = taosArrayGet(pBlockScanInfo->delSkyline, 0);
  TSDBKEY* pTail = taosArrayGetLast(pBlockScanInfo->delSkyline);
  if (pBlock->minKey.ts > pTail->ts || pBlock->maxKey.ts < pHead->ts) {
    return false;
  }

  // version is not overlap
  if (ASCENDING_TRAVERSE(order)) {
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, pBlockScanInfo->fileDelIndex);
  }

  int32_t  start = pBlockScanInfo->fileDelIndex;
  TSDBKEY* pPoint = taosArrayGet(pBlockScanInfo->delSkyline, start);

  // step back while the point is still after the block's first key
  while (pPoint->ts > pBlock->minKey.ts && start > 0) {
    start -= 1;
    pPoint = taosArrayGet(pBlockScanInfo->delSkyline, start);
  }

  // a point exactly on the first key with a version below the block's max version: go one further back
  if (pPoint->ts == pBlock->minKey.ts && pPoint->version < pBlock->maxVer && start > 0) {
    start -= 1;
  }

  return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, start);
}

C
Cary Xu 已提交
1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841
// Flags filled in by getBlockToLoadInfo() describing why a file data block may have to be loaded.
typedef struct {
  bool overlapWithNeighborBlock;  // block touches its neighbor block of the same table in scan direction
  bool hasDupTs;                  // block's hasDup flag, or always true when it has more than one sub-block
  bool overlapWithDelInfo;        // the table's delete skyline affects the block
  bool overlapWithLastBlock;      // current key of the last-block reader lies inside the block's key range
  bool overlapWithKeyInBuf;       // the buffered key lies inside the block and the version ranges intersect
  bool partiallyRequired;         // a query window/version boundary cuts through the block
  bool moreThanCapcity;           // block holds more rows than the result block capacity
} SDataBlockToLoadInfo;

/*
 * Evaluate every "must load" criterion for a file data block and record the results in pInfo.
 *
 * overlapWithNeighborBlock and overlapWithLastBlock are only written when a neighbor block exists
 * or the last-block reader holds data, respectively; callers pass a zero-initialized pInfo.
 */
static void getBlockToLoadInfo(SDataBlockToLoadInfo* pInfo, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                               STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader,
                               STsdbReader* pReader) {
  int32_t     neighborPos = 0;
  SBlockIndex neighbor = {0};

  // overlap with neighbor
  if (getNeighborBlockOfSameTable(pBlockInfo, pScanInfo, &neighborPos, pReader->order, &neighbor)) {
    pInfo->overlapWithNeighborBlock = overlapWithNeighborBlock(pBlock, &neighbor, pReader->order);
  }

  // has duplicated ts of different version in this block
  if (pBlock->nSubBlock == 1) {
    pInfo->hasDupTs = pBlock->hasDup;
  } else {
    pInfo->hasDupTs = true;
  }

  pInfo->overlapWithDelInfo = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);

  if (hasDataInLastBlock(pLastBlockReader)) {
    int64_t lastBlockKey = getCurrentKeyInLastBlock(pLastBlockReader);
    pInfo->overlapWithLastBlock = !(pBlock->maxKey.ts < lastBlockKey || pBlock->minKey.ts > lastBlockKey);
  }

  pInfo->moreThanCapcity = pBlock->nRow > pReader->resBlockInfo.capacity;
  pInfo->partiallyRequired = dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock);
  pInfo->overlapWithKeyInBuf = keyOverlapFileBlock(keyInBuf, pBlock, &pReader->verRange);
}

// A file block has to be loaded when any of the following holds:
// 1. it touches its neighbor block of the same table
// 2. it contains duplicated timestamps
// 3. a query window or version boundary cuts through it
// 4. the key currently held in the buffer falls inside it
// 5. it holds more rows than the output buffer can take
// 6. delete info overlaps with it
// 7. the current key of the last block falls inside it
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo reasons = {0};
  getBlockToLoadInfo(&reasons, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);

  bool mustLoad = reasons.overlapWithNeighborBlock || reasons.hasDupTs || reasons.partiallyRequired ||
                  reasons.overlapWithKeyInBuf || reasons.moreThanCapcity || reasons.overlapWithDelInfo ||
                  reasons.overlapWithLastBlock;
  if (!mustLoad) {
    return false;
  }

  // log the reason why load the datablock for profile
  tsdbDebug("%p uid:%" PRIu64
            " need to load the datablock, overlapneighbor:%d, hasDup:%d, partiallyRequired:%d, "
            "overlapWithKey:%d, greaterThanBuf:%d, overlapWithDel:%d, overlapWithlastBlock:%d, %s",
            pReader, pBlockInfo->uid, reasons.overlapWithNeighborBlock, reasons.hasDupTs, reasons.partiallyRequired,
            reasons.overlapWithKeyInBuf, reasons.moreThanCapcity, reasons.overlapWithDelInfo,
            reasons.overlapWithLastBlock, pReader->idStr);

  return true;
}

C
Cary Xu 已提交
1894 1895 1896 1897 1898 1899 1900 1901 1902
/*
 * A file block is "clean" when it neither touches its neighbor block, nor contains duplicated
 * timestamps, nor overlaps the buffered key, the delete info or the last block. Unlike
 * fileBlockShouldLoad(), capacity and partial-window criteria are not considered.
 */
static bool isCleanFileDataBlock(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                 STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo flags = {0};
  getBlockToLoadInfo(&flags, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);

  return !flags.overlapWithNeighborBlock && !flags.hasDupTs && !flags.overlapWithKeyInBuf &&
         !flags.overlapWithDelInfo && !flags.overlapWithLastBlock;
}

1903
/*
 * Fill the reader's result block from the table's two buffer iterators, up to endKey and the
 * result block capacity.
 *
 * Returns immediately with success when neither iterator has a value. Otherwise the result
 * block's time window and uid are updated, the block is flagged as composed, and the elapsed time
 * is added to pReader->cost.buildmemBlock. Returns the status of buildDataBlockFromBufImpl().
 */
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
  if (!pBlockScanInfo->iiter.hasVal && !pBlockScanInfo->iter.hasVal) {
    return TSDB_CODE_SUCCESS;
  }

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
  int64_t      startUs = taosGetTimestampUs();

  int32_t code = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->resBlockInfo.capacity, pReader);

  blockDataUpdateTsWindow(pResBlock, pReader->suppInfo.slotId[0]);
  pResBlock->info.id.uid = pBlockScanInfo->uid;

  setComposedBlockFlag(pReader, true);

  double elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%" PRId64 ", brange:%" PRId64
            " - %" PRId64 ", uid:%" PRIu64 ",  %s",
            pReader, elapsedMs, pResBlock->info.rows, pResBlock->info.window.skey, pResBlock->info.window.ekey,
            pBlockScanInfo->uid, pReader->idStr);

  pReader->cost.buildmemBlock += elapsedMs;
  return code;
}

1928
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, int64_t key,
1929
                                            SFileBlockDumpInfo* pDumpInfo, bool* copied) {
1930 1931 1932
  // opt version
  // 1. it is not a border point
  // 2. the direct next point is not an duplicated timestamp
D
dapan1121 已提交
1933 1934 1935
  int32_t code = TSDB_CODE_SUCCESS;

  *copied = false;
1936 1937
  bool asc = (pReader->order == TSDB_ORDER_ASC);
  if ((pDumpInfo->rowIndex < pDumpInfo->totalRows - 1 && asc) || (pDumpInfo->rowIndex > 0 && (!asc))) {
1938
    int32_t step = pReader->order == TSDB_ORDER_ASC ? 1 : -1;
1939 1940

    int64_t nextKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
1941
    if (nextKey != key) {  // merge is not needed
1942
      code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
D
dapan1121 已提交
1943 1944 1945
      if (code) {
        return code;
      }
1946
      pDumpInfo->rowIndex += step;
D
dapan1121 已提交
1947
      *copied = true;
1948 1949 1950
    }
  }

D
dapan1121 已提交
1951
  return code;
1952 1953
}

1954
static bool nextRowFromLastBlocks(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo,
1955
                                  SVersionRange* pVerRange) {
X
Xiaoyu Wang 已提交
1956
  int32_t step = ASCENDING_TRAVERSE(pLastBlockReader->order) ? 1 : -1;
H
Haojun Liao 已提交
1957

1958 1959
  while (1) {
    bool hasVal = tMergeTreeNext(&pLastBlockReader->mergeTree);
1960
    if (!hasVal) {  // the next value will be the accessed key in stt
1961
      pScanInfo->lastKeyInStt += step;
1962 1963 1964
      return false;
    }

1965
    TSDBROW* pRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
1966
    int64_t key = pRow->pBlockData->aTSKEY[pRow->iRow];
1967
    int64_t ver = pRow->pBlockData->aVersion[pRow->iRow];
1968

1969
    pLastBlockReader->currentKey = key;
1970
    pScanInfo->lastKeyInStt = key;
1971 1972

    if (!hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->lastBlockDelIndex, key, ver, pLastBlockReader->order, pVerRange)) {
1973 1974 1975 1976 1977 1978
      return true;
    }
  }
}

static bool tryCopyDistinctRowFromSttBlock(TSDBROW* fRow, SLastBlockReader* pLastBlockReader,
1979 1980
                                           STableBlockScanInfo* pScanInfo, int64_t ts, STsdbReader* pReader,
                                           bool* copied) {
D
dapan1121 已提交
1981 1982 1983 1984
  int32_t code = TSDB_CODE_SUCCESS;

  *copied = false;

1985
  bool hasVal = nextRowFromLastBlocks(pLastBlockReader, pScanInfo, &pReader->verRange);
1986 1987 1988
  if (hasVal) {
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 != ts) {
1989
      code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
D
dapan1121 已提交
1990 1991 1992
      if (code) {
        return code;
      }
1993

D
dapan1121 已提交
1994 1995
      *copied = true;
      return code;
1996 1997
    }
  } else {
1998
    code = doAppendRowFromFileBlock(pReader->resBlockInfo.pResBlock, pReader, fRow->pBlockData, fRow->iRow);
D
dapan1121 已提交
1999 2000 2001
    if (code) {
      return code;
    }
2002

D
dapan1121 已提交
2003 2004
    *copied = true;
    return code;
2005 2006
  }

D
dapan1121 已提交
2007
  return code;
2008 2009
}

H
Haojun Liao 已提交
2010 2011 2012
/*
 * Return the schema with version `sversion` for table `uid`.
 *
 * The reader's own schema is loaded first if missing and returned when its version matches.
 * Otherwise the reader's schema map is consulted, and on a miss the schema is fetched from the
 * meta store and inserted into the map. Returns NULL on failure; for a failed fetch or insert
 * terrno holds the status of the failing call.
 *
 * NOTE(review): when the map insert fails the freshly fetched schema is not released here -
 * confirm who owns it.
 */
static FORCE_INLINE STSchema* doGetSchemaForTSRow(int32_t sversion, STsdbReader* pReader, uint64_t uid) {
  // always set the newest schema version in pReader->pSchema
  if (pReader->pSchema == NULL && getTableSchemaImpl(pReader, uid) == NULL) {
    return NULL;
  }

  if (pReader->pSchema && sversion == pReader->pSchema->version) {
    return pReader->pSchema;
  }

  void** ppCached = tSimpleHashGet(pReader->pSchemaMap, &sversion, sizeof(sversion));
  if (ppCached != NULL) {
    return *(STSchema**)ppCached;
  }

  STSchema* pFetched = NULL;
  int32_t   rc = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pFetched);
  if (rc == TSDB_CODE_SUCCESS) {
    rc = tSimpleHashPut(pReader->pSchemaMap, &sversion, sizeof(sversion), &pFetched, POINTER_BYTES);
  }

  if (rc != TSDB_CODE_SUCCESS) {
    terrno = rc;
    return NULL;
  }

  return pFetched;
}

2043
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
2044
                                     SIterInfo* pIter, int64_t key, SLastBlockReader* pLastBlockReader) {
2045
  SRowMerger*         pMerger = &pReader->status.merger;
H
Hongze Cheng 已提交
2046
  SRow*               pTSRow = NULL;
2047 2048 2049
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

2050
  int64_t tsLast = INT64_MIN;
2051
  if (hasDataInLastBlock(pLastBlockReader)) {
2052 2053
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
2054

H
Hongze Cheng 已提交
2055 2056
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
2057

2058 2059 2060 2061 2062 2063 2064 2065 2066
  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    STSchema* ps = getTableSchemaImpl(pReader, pBlockScanInfo->uid);
    if (ps == NULL) {
      return terrno;
    }
  }

2067 2068
  int64_t minKey = 0;
  if (pReader->order == TSDB_ORDER_ASC) {
H
Hongze Cheng 已提交
2069
    minKey = INT64_MAX;  // chosen the minimum value
2070
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
2071 2072
      minKey = tsLast;
    }
2073

2074 2075 2076
    if (minKey > k.ts) {
      minKey = k.ts;
    }
2077

2078
    if (minKey > key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2079 2080 2081 2082
      minKey = key;
    }
  } else {
    minKey = INT64_MIN;
2083
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
2084 2085 2086 2087 2088 2089 2090
      minKey = tsLast;
    }

    if (minKey < k.ts) {
      minKey = k.ts;
    }

2091
    if (minKey < key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2092 2093
      minKey = key;
    }
2094 2095
  }

2096
  // todo remove init
2097 2098
  bool init = false;

2099
  // ASC: file block ---> last block -----> imem -----> mem
H
Hongze Cheng 已提交
2100
  // DESC: mem -----> imem -----> last block -----> file block
2101 2102
  if (pReader->order == TSDB_ORDER_ASC) {
    if (minKey == key) {
2103
      init = true;
2104
      int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2105 2106 2107
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
H
Haojun Liao 已提交
2108
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
2109 2110
    }

2111
    if (minKey == tsLast) {
2112
      TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2113
      if (init) {
2114
        tsdbRowMergerAdd(pMerger, fRow1, NULL);
H
Haojun Liao 已提交
2115
      } else {
2116
        init = true;
2117
        int32_t code = tsdbRowMergerAdd(pMerger, fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2118 2119 2120
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2121
      }
2122
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
2123
    }
2124

2125
    if (minKey == k.ts) {
K
kailixu 已提交
2126 2127 2128 2129
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      if (pSchema == NULL) {
        return terrno;
      }
H
Haojun Liao 已提交
2130
      if (init) {
2131
        tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2132
      } else {
2133
        init = true;
2134
        int32_t code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2135 2136 2137 2138
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
      }
H
Haojun Liao 已提交
2139
      int32_t code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
2140 2141
      if (code != TSDB_CODE_SUCCESS) {
        return code;
2142 2143 2144 2145 2146
      }
    }
  } else {
    if (minKey == k.ts) {
      init = true;
2147
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
2148 2149 2150 2151
      if (pSchema == NULL) {
        return terrno;
      }

2152
      int32_t   code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
H
Haojun Liao 已提交
2153 2154 2155 2156
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

H
Haojun Liao 已提交
2157
      code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
2158
      if (code != TSDB_CODE_SUCCESS || pMerger->pTSchema == NULL) {
H
Haojun Liao 已提交
2159 2160
        return code;
      }
2161 2162
    }

2163
    if (minKey == tsLast) {
2164
      TSDBROW* fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2165
      if (init) {
2166
        tsdbRowMergerAdd(pMerger, fRow1, NULL);
H
Haojun Liao 已提交
2167
      } else {
2168
        init = true;
2169
        int32_t code = tsdbRowMergerAdd(pMerger, fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2170 2171 2172
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2173
      }
2174
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
2175 2176 2177
    }

    if (minKey == key) {
H
Haojun Liao 已提交
2178
      if (init) {
2179
        tsdbRowMergerAdd(pMerger, &fRow, NULL);
H
Haojun Liao 已提交
2180
      } else {
2181
        init = true;
2182
        int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2183 2184 2185
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2186
      }
H
Haojun Liao 已提交
2187
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
2188
    }
2189 2190
  }

2191
  int32_t code = tsdbRowMergerGetRow(pMerger, &pTSRow);
2192 2193 2194 2195
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

2196
  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);
2197 2198

  taosMemoryFree(pTSRow);
2199
  tsdbRowMergerClear(pMerger);
D
dapan1121 已提交
2200 2201

  return code;
2202 2203
}

2204 2205 2206
// Build one result row for the timestamp currently exposed by the last-block (stt) reader and
// append it to pReader->resBlockInfo.pResBlock.
//
// pLastBlockReader  reader positioned on the row to emit
// pReader           owning reader; its row merger and result block are used
// pBlockScanInfo    per-table scan state; lastKey is updated when the row is copied directly
// pBlockData        loaded file block; only dereferenced when mergeBlockData is true
// mergeBlockData    true when the current file-block row may share the timestamp and must be merged
//
// Returns TSDB_CODE_SUCCESS or the first error reported by a callee.
static int32_t doMergeFileBlockAndLastBlock(SLastBlockReader* pLastBlockReader, STsdbReader* pReader,
                                            STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                            bool mergeBlockData) {
  SRowMerger*         pMerger = &pReader->status.merger;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  int64_t  tsLastBlock = getCurrentKeyInLastBlock(pLastBlockReader);
  bool     copied = false;
  int32_t  code = TSDB_CODE_SUCCESS;
  SRow*    pTSRow = NULL;
  TSDBROW* pRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);

  // create local variable to hold the row value
  TSDBROW fRow = {.iRow = pRow->iRow, .type = TSDBROW_COL_FMT, .pBlockData = pRow->pBlockData};

  tsdbTrace("fRow ptr:%p, %d, uid:%" PRIu64 ", %s", pRow->pBlockData, pRow->iRow, pLastBlockReader->uid, pReader->idStr);

  if ((!mergeBlockData) || (tsLastBlock != pBlockData->aTSKEY[pDumpInfo->rowIndex])) {
    // the file block does not contribute to this timestamp: only the last block is involved
    code = tryCopyDistinctRowFromSttBlock(&fRow, pLastBlockReader, pBlockScanInfo, tsLastBlock, pReader, &copied);
    if (code) {
      return code;
    }

    if (copied) {
      pBlockScanInfo->lastKey = tsLastBlock;
      return TSDB_CODE_SUCCESS;
    }

    code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // the merger is initialized by now, so no schema is passed for the following row
    TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
    code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, pMerger, &pReader->verRange, pReader->idStr);
  } else {
    // the current file-block row carries the same timestamp: merge last block rows with it
    code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, pMerger, &pReader->verRange, pReader->idStr);

    // merge with block data if ts == key
    if (tsLastBlock == pBlockData->aTSKEY[pDumpInfo->rowIndex]) {
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }
  }

  // both merge paths finish the same way: materialize the merged row and append it
  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);

  return code;
}

2286 2287
// Produce the next result row when only file data is involved (file data block and/or last block).
//
// pReader           owning reader; its row merger and result block are used
// pLastBlockReader  reader over the last (stt) blocks
// key               timestamp of the current row in the file data block
// pBlockScanInfo    per-table scan state
// pBlockData        currently loaded file data block
//
// Returns TSDB_CODE_SUCCESS, terrno when the table schema cannot be loaded, or the first error
// reported by a callee.
static int32_t mergeFileBlockAndLastBlock(STsdbReader* pReader, SLastBlockReader* pLastBlockReader, int64_t key,
                                          STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SRowMerger*         pMerger = &pReader->status.merger;

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    STSchema* ps = getTableSchemaImpl(pReader, pBlockScanInfo->uid);
    if (ps == NULL) {
      return terrno;
    }
  }

  // only last block exists
  if (!hasDataInFileBlock(pBlockData, pDumpInfo)) {
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, NULL, false);
  }

  // no last block available, only data block exists
  if (!hasDataInLastBlock(pLastBlockReader)) {
    return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
  }

  // current row of the file data block and current key of the last block
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
  int64_t ts = getCurrentKeyInLastBlock(pLastBlockReader);
  ASSERT(ts >= key);

  // desc order
  if (!ASCENDING_TRAVERSE(pReader->order)) {
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, pBlockData, true);
  }

  if (key < ts) {  // the file data block row comes first
    return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
  }

  if (key > ts) {  // excluded by the assertion above; kept as a no-op like the original
    return TSDB_CODE_SUCCESS;
  }

  // key == ts: rows from the data block and the last block share the timestamp and are merged
  SRow*   pTSRow = NULL;
  int32_t code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);

  TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
  code = tsdbRowMergerAdd(pMerger, pRow1, NULL);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, ts, pMerger, &pReader->verRange, pReader->idStr);

  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);
  return code;
}

2349 2350
// Merge the rows that share the next timestamp across all four sources (mem, imem, last block,
// file data block) into one row and append it to pReader->resBlockInfo.pResBlock.
//
// The next timestamp is the smallest candidate for ascending scans and the largest for descending
// scans. Sources are fed to the merger in a fixed order:
//   ASC:  file block -----> last block -----> imem -----> mem
//   DESC: mem -----> imem -----> last block -----> file block
//
// Returns TSDB_CODE_SUCCESS, terrno when a schema cannot be resolved, or the first error reported
// by a callee.
static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                     SLastBlockReader* pLastBlockReader) {
  SRowMerger*         pMerger = &pReader->status.merger;
  SRow*               pTSRow = NULL;
  int32_t             code = TSDB_CODE_SUCCESS;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SArray*             pDelList = pBlockScanInfo->delSkyline;

  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);

  bool hasLastData = hasDataInLastBlock(pLastBlockReader);
  bool hasFileData = hasDataInFileBlock(pBlockData, pDumpInfo);

  int64_t tsLast = hasLastData ? getCurrentKeyInLastBlock(pLastBlockReader) : INT64_MIN;
  int64_t key = hasFileData ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN;

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

  // a failed schema lookup is an error: report terrno instead of the still-successful code
  STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
  if (pSchema == NULL) {
    return terrno;
  }

  STSchema* piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
  if (piSchema == NULL) {
    return terrno;
  }

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    STSchema* ps = getTableSchemaImpl(pReader, pBlockScanInfo->uid);
    if (ps == NULL) {
      return terrno;
    }
  }

  bool asc = ASCENDING_TRAVERSE(pReader->order);

  int64_t minKey = 0;
  if (asc) {
    minKey = INT64_MAX;  // let's find the minimum
    if (minKey > k.ts) {
      minKey = k.ts;
    }

    if (minKey > ik.ts) {
      minKey = ik.ts;
    }

    if (minKey > key && hasFileData) {
      minKey = key;
    }

    if (minKey > tsLast && hasLastData) {
      minKey = tsLast;
    }
  } else {
    minKey = INT64_MIN;  // let's find the maximum ts value
    if (minKey < k.ts) {
      minKey = k.ts;
    }

    if (minKey < ik.ts) {
      minKey = ik.ts;
    }

    if (minKey < key && hasFileData) {
      minKey = key;
    }

    if (minKey < tsLast && hasLastData) {
      minKey = tsLast;
    }
  }

  // set once the first row has been added; later file/last-block rows are then added without a schema
  bool init = false;

  if (asc) {
    if (minKey == key) {
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      code = tsdbRowMergerAdd(pMerger, &fRow, pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }

    if (minKey == tsLast) {
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      code = tsdbRowMergerAdd(pMerger, pRow1, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
    }

    if (minKey == ik.ts) {
      code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == k.ts) {
      code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
  } else {
    if (minKey == k.ts) {
      code = tsdbRowMergerAdd(pMerger, pRow, pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == ik.ts) {
      code = tsdbRowMergerAdd(pMerger, piRow, piSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }

    if (minKey == tsLast) {
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      code = tsdbRowMergerAdd(pMerger, pRow1, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      init = true;
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, pMerger, &pReader->verRange, pReader->idStr);
    }

    if (minKey == key) {
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      code = tsdbRowMergerAdd(pMerger, &fRow, init ? NULL : pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);
    }
  }

  code = tsdbRowMergerGetRow(pMerger, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pTSRow, pBlockScanInfo);

  taosMemoryFree(pTSRow);
  tsdbRowMergerClear(pMerger);
  return code;
}

2558 2559 2560 2561 2562 2563 2564 2565 2566
// Create the mem and imem iterators of one table and initialize its delete skyline.
// Does nothing when pBlockScanInfo->iterInit is already set.
//
// The iterators start right after pBlockScanInfo->lastKey in scan direction (lastKey + 1 for
// ascending, lastKey - 1 for descending scans).
// NOTE(review): lastKey +/- 1 overflows if lastKey sits at the int64 limits - confirm callers
// never store such a value.
//
// Returns TSDB_CODE_SUCCESS, or the error of tsdbTbDataIterCreate.
// NOTE(review): the result of initDelSkylineIterator is not checked here - confirm that is intended.
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  TSDBKEY startKey = {0};
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = (TSDBKEY){.ts = pBlockScanInfo->lastKey + 1, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pBlockScanInfo->lastKey - 1, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!ASCENDING_TRAVERSE(pReader->order));
  int64_t st = 0;

  // iterator over the mem table
  STbData* d = NULL;
  if (pReader->pReadSnap->pMem != NULL) {
    d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
    if (d != NULL) {
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);

        tsdbDebug("%p uid:%" PRIu64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  // iterator over the imem table
  STbData* di = NULL;
  if (pReader->pReadSnap->pIMem != NULL) {
    di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
    if (di != NULL) {
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);

        tsdbDebug("%p uid:%" PRIu64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  // build the delete skyline and account the time spent (microseconds converted to milliseconds)
  st = taosGetTimestampUs();
  initDelSkylineIterator(pBlockScanInfo, pReader, d, di);
  pReader->cost.initDelSkylineIterTime += (taosGetTimestampUs() - st) / 1000.0;

  pBlockScanInfo->iterInit = true;
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
2627 2628
// Decide whether the file-block row at pDumpInfo->rowIndex should be returned for this table.
// A row is rejected when it belongs to another table (multi-table block), when its version or
// timestamp is outside the reader's ranges, or when hasBeenDropped() reports it as deleted.
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  // a non-NULL aUid marks a multi-table data block: the row must belong to the scanned table
  if (pBlockData->aUid != NULL) {
    uint64_t rowUid = pBlockData->aUid[pDumpInfo->rowIndex];
    if (rowUid != pBlockScanInfo->uid) {
      return false;  // move to next row
    }
  }

  // version range check
  int64_t rowVer = pBlockData->aVersion[pDumpInfo->rowIndex];
  bool    verInRange = (rowVer <= pReader->verRange.maxVer) && (rowVer >= pReader->verRange.minVer);
  if (!verInRange) {
    return false;
  }

  // time window check
  int64_t rowTs = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  bool    tsInWindow = (rowTs <= pReader->window.ekey) && (rowTs >= pReader->window.skey);
  if (!tsInWindow) {
    return false;
  }

  // finally consult the delete skyline
  return !hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, rowTs, rowVer, pReader->order,
                         &pReader->verRange);
}

2656
// Bind the last-block reader to the table described by pScanInfo and position it on the first row.
// Returns true when a row is available, false when there is none or the merge tree cannot be opened.
static bool initLastBlockReader(SLastBlockReader* pLBlockReader, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  // already bound to this table: just report whether data is left
  if (pLBlockReader->uid == pScanInfo->uid) {
    return hasDataInLastBlock(pLBlockReader);
  }

  // bound to another table before: release the merge tree opened for it
  if (pLBlockReader->uid != 0) {
    tMergeTreeClose(&pLBlockReader->mergeTree);
  }

  initMemDataIterator(pScanInfo, pReader);
  pLBlockReader->uid = pScanInfo->uid;

  // continue from the last key already consumed from the stt files, in scan direction
  STimeWindow range = pLBlockReader->window;
  if (ASCENDING_TRAVERSE(pLBlockReader->order)) {
    range.skey = pScanInfo->lastKeyInStt;
  } else {
    range.ekey = pScanInfo->lastKeyInStt;
  }

  tsdbDebug("init last block reader, window:%" PRId64 "-%" PRId64 ", uid:%" PRIu64 ", %s", range.skey, range.ekey,
            pScanInfo->uid, pReader->idStr);

  if (tMergeTreeOpen(&pLBlockReader->mergeTree, (pLBlockReader->order == TSDB_ORDER_DESC), pReader->pFileReader,
                     pReader->suid, pScanInfo->uid, &range, &pLBlockReader->verRange, pLBlockReader->pInfo, false,
                     pReader->idStr, false, pReader->status.pLDataIter) != TSDB_CODE_SUCCESS) {
    return false;
  }

  return nextRowFromLastBlocks(pLBlockReader, pScanInfo, &pReader->verRange);
}

H
Hongze Cheng 已提交
2688
// True when the merge tree of the last-block reader currently has an iterator (pIter is non-NULL).
static bool hasDataInLastBlock(SLastBlockReader* pLastBlockReader) {
  if (pLastBlockReader->mergeTree.pIter == NULL) {
    return false;
  }

  return true;
}
2689

2690
bool hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo) {
H
Haojun Liao 已提交
2691
  if ((pBlockData->nRow > 0) && (pBlockData->nRow != pDumpInfo->totalRows)) {
2692
    return false;  // this is an invalid result.
2693
  }
2694
  return pBlockData->nRow > 0 && (!pDumpInfo->allDumped);
2695
}
2696

2697 2698
// Emit the result row for timestamp `key` from the file data block only. The row is copied directly
// when tryCopyDistinctRowFromFileBlock reports it as copied; otherwise the rows are merged through
// the reader's row merger and the merged row is appended to the result block.
int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                              STsdbReader* pReader) {
  SFileBlockDumpInfo* pDump = &pReader->status.fBlockDumpInfo;
  SRowMerger*         pRowMerger = &pReader->status.merger;

  bool    rowCopied = false;
  int32_t code = tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDump, &rowCopied);
  if (code != 0) {
    return code;
  }

  // merge is not initialized yet, due to the fact that the pReader->pSchema is not initialized
  if (pRowMerger->pArray == NULL) {
    ASSERT(pReader->pSchema == NULL);
    if (getTableSchemaImpl(pReader, pBlockScanInfo->uid) == NULL) {
      return terrno;
    }
  }

  // fast path: the row has already been placed into the result block
  if (rowCopied) {
    pBlockScanInfo->lastKey = key;
    return TSDB_CODE_SUCCESS;
  }

  // slow path: merge all rows of this timestamp and append the merged row
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDump->rowIndex);
  SRow*   pMerged = NULL;

  code = tsdbRowMergerAdd(pRowMerger, &fRow, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader);

  code = tsdbRowMergerGetRow(pRowMerger, &pMerged);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  code = doAppendRowFromTSRow(pReader->resBlockInfo.pResBlock, pReader, pMerged, pBlockScanInfo);

  taosMemoryFree(pMerged);
  tsdbRowMergerClear(pRowMerger);
  return code;
}

H
Haojun Liao 已提交
2742 2743
// Append one composed row to the result block by dispatching to the merge routine that matches the
// sources which currently hold a valid row: mem + imem, imem only, mem only, or files only.
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo,
                                          SBlockData* pBlockData, SLastBlockReader* pLastBlockReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SIterInfo*          pMemIter = &pBlockScanInfo->iter;
  SIterInfo*          pIMemIter = &pBlockScanInfo->iiter;

  // timestamp of the current file-block row, INT64_MIN when the block has nothing left
  int64_t key = INT64_MIN;
  if (pBlockData->nRow > 0 && (!pDumpInfo->allDumped)) {
    key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  }

  TSDBROW* pRow = NULL;
  if (pMemIter->hasVal) {
    pRow = getValidMemRow(pMemIter, pBlockScanInfo->delSkyline, pReader);
  }

  TSDBROW* piRow = NULL;
  if (pIMemIter->hasVal) {
    piRow = getValidMemRow(pIMemIter, pBlockScanInfo->delSkyline, pReader);
  }

  // two levels of mem-table does contain the valid rows
  if (pRow != NULL && piRow != NULL) {
    return doMergeMultiLevelRows(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
  }

  // imem + file + last block
  if (pIMemIter->hasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, pIMemIter, key, pLastBlockReader);
  }

  // mem + file + last block
  if (pMemIter->hasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, pMemIter, key, pLastBlockReader);
  }

  // files data blocks + last block
  return mergeFileBlockAndLastBlock(pReader, pLastBlockReader, key, pBlockScanInfo, pBlockData);
}

H
Haojun Liao 已提交
2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814
// When the current file block overlaps its neighbor block of the same table (in scan direction),
// make that neighbor the active block and load its data. *loadNeighbor reports whether a neighbor
// was loaded; the return value is TSDB_CODE_SUCCESS or the error of doLoadFileBlockData.
static int32_t loadNeighborIfOverlap(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pBlockScanInfo,
                                     STsdbReader* pReader, bool* loadNeighbor) {
  int32_t     nextIndex = -1;
  SBlockIndex nxtBIndex = {0};

  *loadNeighbor = false;
  SDataBlk* pCurBlock = getCurrentBlock(&pReader->status.blockIter);

  // no neighbor block: do nothing
  if (!getNeighborBlockOfSameTable(pBlockInfo, pBlockScanInfo, &nextIndex, pReader->order, &nxtBIndex)) {
    return TSDB_CODE_SUCCESS;
  }

  // neighbor exists but does not overlap: do nothing
  if (!overlapWithNeighborBlock(pCurBlock, &nxtBIndex, pReader->order)) {
    return TSDB_CODE_SUCCESS;
  }

  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  int32_t         step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;

  // 1. find the next neighbor block in the scan block list
  SFileDataBlockInfo fb = {.uid = pBlockInfo->uid, .tbBlockIdx = nextIndex};
  int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &fb);

  // 2. remove it from the scan block list
  setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);

  // 3. load the neighbor block, and set it to be the currently accessed file data block
  int32_t code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pBlockInfo->uid);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // 4. check the data values
  initBlockDumpInfo(pReader, pBlockIter);
  *loadNeighbor = true;

  return code;
}

2815
// Finalize the result block after a composed build: set its table uid (0 when pBlockScanInfo is
// NULL), mark the data as loaded, refresh its timestamp window, flag the block as composed and
// account the build in the reader's cost counters. `el` is the elapsed build time to add.
static void updateComposedBlockInfo(STsdbReader* pReader, double el, STableBlockScanInfo* pBlockScanInfo) {
  SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;

  if (pBlockScanInfo != NULL) {
    pBlock->info.id.uid = pBlockScanInfo->uid;
  } else {
    pBlock->info.id.uid = 0;
  }

  pBlock->info.dataLoad = 1;
  blockDataUpdateTsWindow(pBlock, pReader->suppInfo.slotId[0]);

  setComposedBlockFlag(pReader, true);

  pReader->cost.composedBlocks += 1;
  pReader->cost.buildComposedBlockTime += el;
}

2828
// Fill the reader's result block with composed rows for the current table. A clean file block is
// copied as a whole when possible; otherwise rows are built one at a time until the loaded file
// block is consumed, no qualified file-block row is left, or the result block reaches its capacity.
// Tables listed in pReader->pIgnoreTables are skipped by marking the current block as dumped.
static int32_t buildComposedDataBlock(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;

  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t startUs = taosGetTimestampUs();
  int32_t step = asc ? 1 : -1;
  double  elapsedMs = 0;

  SDataBlk*           pBlock = getCurrentBlock(&pReader->status.blockIter);
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;

  STableBlockScanInfo* pBlockScanInfo = NULL;
  if (pBlockInfo != NULL) {
    if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      return code;
    }

    pBlockScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr);
    if (pBlockScanInfo == NULL) {
      goto _end;
    }

    TSDBKEY keyInBuf = getCurrentKeyInBuf(pBlockScanInfo, pReader);

    // it is a clean block, load it directly
    bool cleanAndFits = isCleanFileDataBlock(pReader, pBlockInfo, pBlock, pBlockScanInfo, keyInBuf, pLastBlockReader) &&
                        pBlock->nRow <= pReader->resBlockInfo.capacity;
    if (cleanAndFits && (asc || (!hasDataInLastBlock(pLastBlockReader) && (pBlock->maxKey.ts > keyInBuf.ts)))) {
      code = copyBlockDataToSDataBlock(pReader);
      if (code) {
        goto _end;
      }

      // record the last key value
      if (asc) {
        pBlockScanInfo->lastKey = pBlock->maxKey.ts;
      } else {
        pBlockScanInfo->lastKey = pBlock->minKey.ts;
      }
      goto _end;
    }
  } else {  // file blocks not exist
    pBlockScanInfo = *pReader->status.pTableIter;
    if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockScanInfo->uid, sizeof(pBlockScanInfo->uid))) {
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      return code;
    }
  }

  for (;;) {
    bool hasBlockData = false;

    // find the first qualified row in data block
    while (pBlockData->nRow > 0 && pBlockData->uid == pBlockScanInfo->uid) {
      if (isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
        hasBlockData = true;
        break;
      }

      pDumpInfo->rowIndex += step;

      pBlock = getCurrentBlock(&pReader->status.blockIter);
      if (pDumpInfo->rowIndex < pBlock->nRow && pDumpInfo->rowIndex >= 0) {
        continue;  // still inside the current block
      }

      pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);  // NOTE: get the new block info

      // continue check for the next file block if the last ts in the current block
      // is overlapped with the next neighbor block
      bool loadNeighbor = false;
      code = loadNeighborIfOverlap(pBlockInfo, pBlockScanInfo, pReader, &loadNeighbor);
      if ((!loadNeighbor) || (code != 0)) {
        setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
        break;
      }
    }

    // no qualified row in the file block, no need to proceed.
    if (!hasBlockData) {
      break;
    }

    code = buildComposedDataBlockImpl(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
    if (code) {
      goto _end;
    }

    // currently loaded file data block is consumed
    if ((pBlockData->nRow > 0) && (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0)) {
      pBlock = getCurrentBlock(&pReader->status.blockIter);
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      break;
    }

    // the result block is full
    if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
      break;
    }
  }

_end:
  // elapsed time in milliseconds, computed from microsecond timestamps
  elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;
  updateComposedBlockInfo(pReader, elapsedMs, pBlockScanInfo);

  if (pResBlock->info.rows > 0) {
    tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
              ", elapsed time:%.2f ms %s",
              pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
              pResBlock->info.rows, elapsedMs, pReader->idStr);
  }

  return code;
}

// Store `composed` in the reader's status flag `composedDataBlock`.
void setComposedBlockFlag(STsdbReader* pReader, bool composed) {
  pReader->status.composedDataBlock = composed;
}

2947 2948 2949 2950 2951 2952 2953 2954
// Pick the starting position inside a delete skyline for the given scan order.
// Returns 0 when there is no skyline or the scan is ascending; otherwise the
// index of the last skyline entry.
int32_t getInitialDelIndex(const SArray* pDelSkyline, int32_t order) {
  if (pDelSkyline == NULL || ASCENDING_TRAVERSE(order)) {
    return 0;
  }

  // descending scan: start from the tail of the skyline
  return (int32_t)(taosArrayGetSize(pDelSkyline) - 1);
}

dengyihao's avatar
dengyihao 已提交
2955 2956
// Build the delete skyline of one table (if it has not been built yet) and reset the
// delete-index cursors of the scan info according to the scan order.
//
// Delete records are gathered from three places: the delete file (through pReader->pDelIdx),
// the mem table data `pMemTbData` and the imem table data `piMemTbData`; in-memory records
// newer than pReader->verRange.maxVer are ignored.
//
// Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY when an array cannot be allocated, or the
// error code of the failing read/build routine.
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
  // the skyline is built only once per table
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = 0;
  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));
  if (pDelData == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  // 1. delete records persisted in the delete file
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
  if (pDelFile && taosArrayGetSize(pReader->pDelIdx) > 0) {
    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
    SDelIdx* pIdx = taosArraySearch(pReader->pDelIdx, &idx, tCmprDelIdx, TD_EQ);

    if (pIdx != NULL) {
      code = tsdbReadDelDatav1(pReader->pDelFReader, pIdx, pDelData, pReader->verRange.maxVer);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  }

  // 2. delete records in mem
  if (pMemTbData != NULL) {
    for (SDelData* p = pMemTbData->pHead; p != NULL; p = p->pNext) {
      if (p->version <= pReader->verRange.maxVer) {
        taosArrayPush(pDelData, p);
      }
    }
  }

  // 3. delete records in imem
  if (piMemTbData != NULL) {
    for (SDelData* p = piMemTbData->pHead; p != NULL; p = p->pNext) {
      if (p->version <= pReader->verRange.maxVer) {
        taosArrayPush(pDelData, p);
      }
    }
  }

  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    if (pBlockScanInfo->delSkyline == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _err;
    }

    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);

  // every cursor over the skyline starts from the same position
  int32_t index = getInitialDelIndex(pBlockScanInfo->delSkyline, pReader->order);

  pBlockScanInfo->iter.index = index;
  pBlockScanInfo->iiter.index = index;
  pBlockScanInfo->fileDelIndex = index;
  pBlockScanInfo->lastBlockDelIndex = index;

  return code;

_err:
  taosArrayDestroy(pDelData);
  return code;
}

C
Cary Xu 已提交
3019
// Return the key of the next valid row held in the buffers (mem/imem) of a table.
// When both buffers have a row, the smaller timestamp wins for an ascending scan and the
// larger one for a descending scan. When neither has a row, a key whose ts is
// TSKEY_INITIAL_VAL is returned.
TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  TSDBKEY memKey = {.ts = TSKEY_INITIAL_VAL};
  TSDBKEY imemKey = {.ts = TSKEY_INITIAL_VAL};

  TSDBROW* pMemRow = getValidMemRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
  if (pMemRow != NULL) {
    memKey = TSDBROW_KEY(pMemRow);
  }

  TSDBROW* pIMemRow = getValidMemRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
  if (pIMemRow != NULL) {
    imemKey = TSDBROW_KEY(pIMemRow);
  }

  // nothing in mem: the imem key is the answer (it is still the initial value if imem is empty too)
  if (pMemRow == NULL) {
    return imemKey;
  }

  // only mem has data
  if (pIMemRow == NULL) {
    return memKey;
  }

  // both mem and imem have data
  bool memNotLater = (memKey.ts <= imemKey.ts);
  if (ASCENDING_TRAVERSE(pReader->order)) {
    return memNotLater ? memKey : imemKey;
  }

  return memNotLater ? imemKey : memKey;
}

3055
// Advance the fileset iterator until a fileset with data for this query is found, or no
// fileset is left.
//
// pBlockNum  [out] number of data blocks / last files found in the selected fileset; both
//                  counters stay 0 when every fileset has been consumed.
// pTableList [out] passed through to doLoadFileBlock().
//
// After the search, the delete file reader and the delete index are opened lazily if the read
// snapshot carries a delete file. Returns TSDB_CODE_SUCCESS or the first error encountered.
static int32_t moveToNextFile(STsdbReader* pReader, SBlockNumber* pBlockNum, SArray* pTableList) {
  SReaderStatus* pStatus = &pReader->status;
  pBlockNum->numOfBlocks = 0;
  pBlockNum->numOfLastFiles = 0;

  size_t  numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBlockIdx));
  if (pIndexList == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  while (1) {
    // only check here, since the iterate data in memory is very fast.
    if (pReader->code != TSDB_CODE_SUCCESS) {
      // NOTE(review): pReader->code looks like a TSDB status code rather than an errno value,
      // so strerror() may not describe it correctly -- confirm.
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      code = pReader->code;
      break;
    }

    bool hasNext = false;
    code = filesetIteratorNext(&pStatus->fileIter, pReader, &hasNext);
    if (code != TSDB_CODE_SUCCESS) {
      break;
    }

    if (!hasNext) {  // no data files on disk
      break;
    }

    taosArrayClear(pIndexList);
    code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
    if (code != TSDB_CODE_SUCCESS) {
      break;
    }

    if (taosArrayGetSize(pIndexList) > 0 || pReader->pFileReader->pSet->nSttF > 0) {
      code = doLoadFileBlock(pReader, pIndexList, pBlockNum, pTableList);
      if (code != TSDB_CODE_SUCCESS) {
        break;
      }

      if (pBlockNum->numOfBlocks + pBlockNum->numOfLastFiles > 0) {
        break;
      }
    }

    // no blocks in current file, try next files
  }

  taosArrayDestroy(pIndexList);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // open the delete file reader and load the delete index on first use
  if (pReader->pReadSnap != NULL) {
    SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
    if (pReader->pDelFReader == NULL && pDelFile != NULL) {
      code = tsdbDelFReaderOpen(&pReader->pDelFReader, pDelFile, pReader->pTsdb);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      pReader->pDelIdx = taosArrayInit(4, sizeof(SDelIdx));
      if (pReader->pDelIdx == NULL) {
        return TSDB_CODE_OUT_OF_MEMORY;
      }

      code = tsdbReadDelIdx(pReader->pDelFReader, pReader->pDelIdx);
      if (code != TSDB_CODE_SUCCESS) {
        taosArrayDestroy(pReader->pDelIdx);
        // do not leave a dangling pointer behind: the reader outlives this call
        pReader->pDelIdx = NULL;
        return code;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}

X
Xiaoyu Wang 已提交
3131
// Rewind the table cursor: the current index goes back to 0 and pTableIter is set to the
// table-map entry of the first uid in the uid list.
static void resetTableListIndex(SReaderStatus* pStatus) {
  STableUidList* pUidList = &pStatus->uidList;
  pUidList->currentIndex = 0;

  uint64_t firstUid = pUidList->tableUidList[0];
  pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &firstUid, sizeof(firstUid));
}

3139
// Advance the table cursor by one and point pStatus->pTableIter at the table-map entry of the
// new uid. Returns false (with pTableIter set to NULL) once the index runs past the number of
// tables in the map, or when the uid is not present in the map.
static bool moveToNextTable(STableUidList* pOrderedCheckInfo, SReaderStatus* pStatus) {
  pOrderedCheckInfo->currentIndex++;

  if (pOrderedCheckInfo->currentIndex < tSimpleHashGetSize(pStatus->pTableMap)) {
    uint64_t nextUid = pOrderedCheckInfo->tableUidList[pOrderedCheckInfo->currentIndex];
    pStatus->pTableIter = tSimpleHashGet(pStatus->pTableMap, &nextUid, sizeof(nextUid));
    return (pStatus->pTableIter != NULL);
  }

  // all tables have been visited
  pStatus->pTableIter = NULL;
  return false;
}

3151
// Visit the tables one after another, starting from the current table cursor, and fill the
// result block with rows read through the last block reader.
// The function returns as soon as the result block holds rows, when there is no further table
// to visit, when the reader has been flagged with an error code, or when building the block fails.
static int32_t doLoadLastBlockSequentially(STsdbReader* pReader) {
  SReaderStatus*    pStatus = &pReader->status;
  SLastBlockReader* pLastBlockReader = pStatus->fileIter.pLastBlockReader;
  STableUidList*    pUidList = &pStatus->uidList;

  if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
    return TSDB_CODE_SUCCESS;
  }

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  for (;;) {
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      return pReader->code;
    }

    // load the last data block of current table
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;
    bool                 ignored =
        pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pScanInfo->uid, sizeof(pScanInfo->uid));

    // reset the index in last block when handing a new file
    doCleanupTableScanInfo(pScanInfo);
    pStatus->mapDataCleaned = true;

    // skip tables that are ignored or that have nothing in the last files
    if (ignored || !initLastBlockReader(pLastBlockReader, pScanInfo, pReader)) {
      if (!moveToNextTable(pUidList, pStatus)) {
        return TSDB_CODE_SUCCESS;
      }

      continue;
    }

    int64_t startUs = taosGetTimestampUs();

    // keep composing rows until the last block runs dry or the result block is full
    while (hasDataInLastBlock(pLastBlockReader)) {
      int32_t code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
      if (code) {
        return code;
      }

      if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
        break;
      }
    }

    double elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;
    updateComposedBlockInfo(pReader, elapsedMs, pScanInfo);

    if (pResBlock->info.rows > 0) {
      tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
                ", elapsed time:%.2f ms %s",
                pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                pResBlock->info.rows, elapsedMs, pReader->idStr);
      return TSDB_CODE_SUCCESS;
    }

    // current table is exhausted, let's try next table
    if (!moveToNextTable(pUidList, pStatus)) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

3236
// Produce the next result block for the file block the block iterator currently points at.
// Depending on the checks below, one of four paths is taken:
//   1. the file block has to be loaded and a composed block is built from it;
//   2. rows from the buffer that fall into the gap before the file block are returned;
//   3. (descending scan only) rows of the last block are returned first;
//   4. the file block is handed out as a whole: only the block info is filled in (dataLoad = 0).
// Returns pReader->code if it has been set to an error, otherwise the result of the chosen path.
static int32_t doBuildDataBlock(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);

  // blocks of ignored tables are marked as fully dumped without being read
  if (pReader->pIgnoreTables && taosHashGet(*pReader->pIgnoreTables, &pBlockInfo->uid, sizeof(pBlockInfo->uid))) {
    setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);
    return code;
  }

  if (pReader->code != TSDB_CODE_SUCCESS) {
    return pReader->code;
  }

  STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pReader->status.pTableMap, pBlockInfo->uid, pReader->idStr);
  if (pScanInfo == NULL) {
    return terrno;
  }

  initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
  TSDBKEY keyInBuf = getCurrentKeyInBuf(pScanInfo, pReader);

  if (fileBlockShouldLoad(pReader, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader)) {
    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pScanInfo->uid);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // build composed data block
    code = buildComposedDataBlock(pReader);
  } else if (bufferDataInFileBlockGap(pReader->order, keyInBuf, pBlock)) {
    // data in memory that are earlier than current file block
    // rows in buffer should be less than the file block in asc, greater than file block in desc
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? pBlock->minKey.ts : pBlock->maxKey.ts;
    code = buildDataBlockFromBuf(pReader, pScanInfo, endKey);
  } else if (hasDataInLastBlock(pLastBlockReader) && !ASCENDING_TRAVERSE(pReader->order)) {
    // only return the rows in last block
    int64_t tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
    ASSERT(tsLast >= pBlock->maxKey.ts);

    SBlockData* pBData = &pReader->status.fileBlockData;
    tBlockDataReset(pBData);

    SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
    tsdbDebug("load data in last block firstly, due to desc scan data, %s", pReader->idStr);

    int64_t startUs = taosGetTimestampUs();

    // keep composing rows until the last block runs dry or the result block is full
    while (hasDataInLastBlock(pLastBlockReader)) {
      code = buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
      if (code) {
        return code;
      }

      if (pResBlock->info.rows >= pReader->resBlockInfo.capacity) {
        break;
      }
    }

    double elapsedMs = (taosGetTimestampUs() - startUs) / 1000.0;
    updateComposedBlockInfo(pReader, elapsedMs, pScanInfo);

    if (pResBlock->info.rows > 0) {
      tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64 " rows:%" PRId64
                ", elapsed time:%.2f ms %s",
                pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                pResBlock->info.rows, elapsedMs, pReader->idStr);
    }
  } else {
    // whole block is required, return it directly
    SDataBlockInfo* pInfo = &pReader->resBlockInfo.pResBlock->info;
    pInfo->rows = pBlock->nRow;
    pInfo->id.uid = pScanInfo->uid;
    pInfo->dataLoad = 0;
    pInfo->window = (STimeWindow){.skey = pBlock->minKey.ts, .ekey = pBlock->maxKey.ts};
    setComposedBlockFlag(pReader, false);
    setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);

    // update the last key for the corresponding table
    pScanInfo->lastKey = ASCENDING_TRAVERSE(pReader->order) ? pInfo->window.ekey : pInfo->window.skey;
    tsdbDebug("%p uid:%" PRIu64
              " clean file block retrieved from file, global index:%d, "
              "table index:%d, rows:%d, brange:%" PRId64 "-%" PRId64 ", %s",
              pReader, pScanInfo->uid, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->nRow, pBlock->minKey.ts,
              pBlock->maxKey.ts, pReader->idStr);
  }

  // an error flagged on the reader takes precedence over the local result
  if (pReader->code != TSDB_CODE_SUCCESS) {
    return pReader->code;
  }

  return code;
}

D
dapan1121 已提交
3340
// Add the row counts of all data blocks in the current data file that belong to tables of this
// reader (matching suid and present in the table map) to pReader->rowsNum.
// The block index is fetched through the block-index cache of pFileReader; the cache handle is
// released before returning. Returns TSDB_CODE_SUCCESS or the code of the failing call.
static int32_t doSumFileBlockRows(STsdbReader* pReader, SDataFReader* pFileReader) {
  LRUHandle* handle = NULL;
  int32_t    code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
  if (code != TSDB_CODE_SUCCESS || handle == NULL) {
    goto _end;
  }

  SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
  size_t  num = taosArrayGetSize(aBlockIdx);

  for (size_t i = 0; i < num; ++i) {
    SBlockIdx* pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);
    if (pBlockIdx->suid != pReader->suid) {
      continue;
    }

    // only tables requested by this reader are counted
    STableBlockScanInfo** p = tSimpleHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(pBlockIdx->uid));
    if (p == NULL) {
      continue;
    }

    STableBlockScanInfo* pScanInfo = *p;
    tMapDataReset(&pScanInfo->mapData);

    // a failed read would silently skew the row count, so stop and report it
    code = tsdbReadDataBlk(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
    if (code != TSDB_CODE_SUCCESS) {
      break;
    }

    SDataBlk block = {0};
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      tGetDataBlk(pScanInfo->mapData.pData + pScanInfo->mapData.aOffset[j], &block);
      pReader->rowsNum += block.nRow;
    }
  }

_end:
  tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
  return code;
}

D
dapan1121 已提交
3385
// Add the row counts of the stt blocks whose suid equals pReader->suid, over all stt files of
// the current fileset, to pReader->rowsNum.
// Returns the error code of tsdbReadSttBlk() if reading a block list fails.
static int32_t doSumSttBlockRows(STsdbReader* pReader) {
  int32_t           code = TSDB_CODE_SUCCESS;
  SLastBlockReader* pLastBlockReader = pReader->status.fileIter.pLastBlockReader;

  for (int32_t i = 0; i < pReader->pFileReader->pSet->nSttF; ++i) {  // open all last file
    SSttBlockLoadInfo* pLoadInfo = &pLastBlockReader->pInfo[i];

    code = tsdbReadSttBlk(pReader->pFileReader, i, pLoadInfo->aSttBlk);
    if (code) {
      return code;
    }

    size_t numOfBlks = taosArrayGetSize(pLoadInfo->aSttBlk);
    if (numOfBlks < 1) {
      continue;
    }

    SSttBlk* pFirst = taosArrayGet(pLoadInfo->aSttBlk, 0);
    SSttBlk* pLast = taosArrayGet(pLoadInfo->aSttBlk, numOfBlks - 1);

    if (pFirst->suid != pLast->suid) {
      // mixed suids: skip smaller ones, count the matching ones, stop at the first larger one
      for (int32_t j = 0; j < numOfBlks; ++j) {
        SSttBlk* pBlk = taosArrayGet(pLoadInfo->aSttBlk, j);
        uint64_t s = pBlk->suid;
        if (s < pReader->suid) {
          continue;
        }

        if (s > pReader->suid) {
          break;
        }

        pReader->rowsNum += pBlk->nRow;
      }

      continue;
    }

    // all identical
    if (pFirst->suid != pReader->suid) {
      // no qualified stt block existed
      taosArrayClear(pLoadInfo->aSttBlk);
      continue;
    }

    for (int32_t j = 0; j < numOfBlks; ++j) {
      SSttBlk* pBlk = taosArrayGet(pLoadInfo->aSttBlk, j);
      pReader->rowsNum += pBlk->nRow;
    }
  }

  return code;
}

D
dapan1121 已提交
3435
// Iterate over every remaining fileset and accumulate the row counts of its data blocks and
// stt blocks. When the filesets are exhausted, loadFromFile is cleared on the reader status.
// Stops at, and returns, the first error.
static int32_t readRowsCountFromFiles(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

  for (;;) {
    bool hasNext = false;

    code = filesetIteratorNext(&pReader->status.fileIter, pReader, &hasNext);
    if (code) {
      return code;
    }

    if (!hasNext) {  // no data files on disk
      break;
    }

    // rows in the data file
    code = doSumFileBlockRows(pReader, pReader->pFileReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // rows in the stt files
    code = doSumSttBlockRows(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }

  pReader->status.loadFromFile = false;
  return code;
}

D
dapan1121 已提交
3465
// Add the rows counted in the mem and imem tables of the read snapshot (when present) to
// pReader->rowsNum. Always returns TSDB_CODE_SUCCESS.
static int32_t readRowsCountFromMem(STsdbReader* pReader) {
  int64_t rowsInMem = 0;
  int64_t rowsInIMem = 0;

  if (pReader->pReadSnap->pMem != NULL) {
    tsdbMemTableCountRows(pReader->pReadSnap->pMem, pReader->status.pTableMap, &rowsInMem);
  }

  if (pReader->pReadSnap->pIMem != NULL) {
    tsdbMemTableCountRows(pReader->pReadSnap->pIMem, pReader->status.pTableMap, &rowsInIMem);
  }

  pReader->rowsNum += rowsInMem + rowsInIMem;
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
3481
// Visit the tables one after another, starting from the current table cursor, and build a
// result block from the rows held in the buffer for the first table that yields any.
// Tables listed in pReader->pIgnoreTables are skipped.
// Returns when the result block holds rows, when no table is left, when the reader has been
// flagged with an error code, or when building the block fails.
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;
  STableUidList* pUidList = &pStatus->uidList;

  while (1) {
    if (pReader->code != TSDB_CODE_SUCCESS) {
      tsdbWarn("tsdb reader is stopped ASAP, code:%s, %s", strerror(pReader->code), pReader->idStr);
      return pReader->code;
    }

    STableBlockScanInfo** pBlockScanInfo = pStatus->pTableIter;
    if (pReader->pIgnoreTables &&
        taosHashGet(*pReader->pIgnoreTables, &(*pBlockScanInfo)->uid, sizeof((*pBlockScanInfo)->uid))) {
      bool hasNexTable = moveToNextTable(pUidList, pStatus);
      if (!hasNexTable) {
        return TSDB_CODE_SUCCESS;
      }

      // restart the loop so that the next table is checked against the ignore list as well
      continue;
    }

    initMemDataIterator(*pBlockScanInfo, pReader);

    // no upper/lower bound: consume everything the buffer holds for this table
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? INT64_MAX : INT64_MIN;
    int32_t code = buildDataBlockFromBuf(pReader, *pBlockScanInfo, endKey);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->resBlockInfo.pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }

    // current table is exhausted, let's try next table
    bool hasNexTable = moveToNextTable(pUidList, pStatus);
    if (!hasNexTable) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

3520
// set the correct start position in case of the first/last file block, according to the query time window
3521
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
3522 3523 3524 3525
  int64_t             lastKey = ASCENDING_TRAVERSE(pReader->order) ? INT64_MIN : INT64_MAX;
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  if (pBlockInfo) {
H
Haojun Liao 已提交
3526
    STableBlockScanInfo* pScanInfo = tSimpleHashGet(pBlockIter->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
3527 3528 3529
    if (pScanInfo) {
      lastKey = pScanInfo->lastKey;
    }
3530
  }
3531 3532 3533
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
3534 3535 3536

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
3537
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
3538
  pDumpInfo->lastKey = lastKey;
3539 3540
}

3541
// Move to the next fileset that has data and prepare the block iterator and the dump info for
// its first block. When no fileset with data is left, loadFromFile is cleared on the reader
// status. Returns the code of moveToNextFile()/initBlockIterator().
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
  SBlockNumber num = {0};
  SArray*      pTableList = taosArrayInit(40, POINTER_BYTES);

  int32_t code = moveToNextFile(pReader, &num, pTableList);
  if (code == TSDB_CODE_SUCCESS) {
    if (num.numOfBlocks + num.numOfLastFiles == 0) {
      // all data files are consumed, try data in buffer
      pReader->status.loadFromFile = false;
    } else {
      // initialize the block iterator for a new fileset
      if (num.numOfBlocks > 0) {
        code = initBlockIterator(pReader, pBlockIter, num.numOfBlocks, pTableList);
      } else {  // no block data, only last block exists
        tBlockDataReset(&pReader->status.fileBlockData);
        resetDataBlockIterator(pBlockIter, pReader->order);
        resetTableListIndex(&pReader->status);
      }

      // set the correct start position according to the query time window
      initBlockDumpInfo(pReader, pBlockIter);
    }
  }

  taosArrayDestroy(pTableList);
  return code;
}

3573
// Tell whether the current file block has been read only in part: it is not marked as fully
// dumped and the row cursor has already moved away from its starting row (row 0 for an
// ascending scan, the last row for a descending scan).
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
  if (pDumpInfo->allDumped) {
    return false;
  }

  if (asc) {
    return pDumpInfo->rowIndex > 0;
  }

  return pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1);
}

3578 3579 3580 3581
// Result of doReadDataFromLastFiles(), telling the caller how to proceed.
typedef enum {
  // the caller should return right away; terrno holds the status code to return
  TSDB_READ_RETURN = 0x1,
  // the newly selected fileset has data blocks; the caller goes on with the block iterator
  TSDB_READ_CONTINUE = 0x2,
} ERetrieveType;
3582

3583 3584 3585
// Read rows from the last files, fileset by fileset.
// Returns TSDB_READ_RETURN when the result block holds rows, when an error occurred or when all
// files have been consumed; in the latter two cases terrno carries the status code (terrno is
// reset to 0 at the start of every round).
// Returns TSDB_READ_CONTINUE when the next fileset has data blocks for the block iterator.
static ERetrieveType doReadDataFromLastFiles(STsdbReader* pReader) {
  SSDataBlock*    pResBlock = pReader->resBlockInfo.pResBlock;
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

  for (;;) {
    terrno = 0;

    int32_t code = doLoadLastBlockSequentially(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      terrno = code;
      return TSDB_READ_RETURN;
    }

    if (pResBlock->info.rows > 0) {
      return TSDB_READ_RETURN;
    }

    // all data blocks are checked in this last block file, now let's try the next file
    ASSERT(pReader->status.pTableIter == NULL);
    code = initForFirstBlockInFile(pReader, pBlockIter);

    // error happens or all the data files are completely checked
    if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
      terrno = code;
      return TSDB_READ_RETURN;
    }

    // there are data blocks existed.
    if (pBlockIter->numOfBlocks > 0) {
      return TSDB_READ_CONTINUE;
    }

    // all blocks in data file are checked, let's check the data in last files
    resetTableListIndex(&pReader->status);
  }
}
3618

3619 3620 3621 3622 3623 3624 3625 3626
// Build the next result block from the files of the reader.
// The loop goes on until the result block holds rows or an error occurs: a partially read file
// block is continued, otherwise the next file block is selected; when the data blocks of the
// current fileset are exhausted, the last files are read and the next fileset is opened.
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

  SDataBlockIter*     pBlockIter = &pReader->status.blockIter;
  SSDataBlock*        pResBlock = pReader->resBlockInfo.pResBlock;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  if (pBlockIter->numOfBlocks == 0) {
    // let's try to extract data from stt files.
    if (doReadDataFromLastFiles(pReader) == TSDB_READ_RETURN) {
      return terrno;
    }

    code = doBuildDataBlock(pReader);
    if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
      return code;
    }
  }

  for (;;) {
    if (fileBlockPartiallyRead(pDumpInfo, asc)) {  // file data block is partially loaded
      code = buildComposedDataBlock(pReader);
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        // try next data block in current file
        if (blockIteratorNext(&pReader->status.blockIter, pReader->idStr)) {
          // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
        } else {
          // all data blocks in files are checked, let's check the data in last files.
          ASSERT(pReader->status.pCurrentFileset->nSttF > 0);

          // data blocks in current file are exhausted, let's try the next file now
          SBlockData* pBlockData = &pReader->status.fileBlockData;
          if (pBlockData->uid != 0) {
            tBlockDataClear(pBlockData);
          }

          tBlockDataReset(pBlockData);
          resetDataBlockIterator(pBlockIter, pReader->order);
          resetTableListIndex(&pReader->status);

          if (doReadDataFromLastFiles(pReader) == TSDB_READ_RETURN) {
            return terrno;
          }
        }
      }

      code = doBuildDataBlock(pReader);
    }

    if (code != TSDB_CODE_SUCCESS || pResBlock->info.rows > 0) {
      return code;
    }
  }
}
H
refact  
Hongze Cheng 已提交
3680

3681 3682
// Select the tsdb instance to query.
// For a non-rsma vnode the plain tsdb is returned and *pLevel is left untouched.
// For an rsma vnode the retention levels are walked from level 0 upwards and the first level
// whose keep range still covers winSKey (plus the tolerance offset) is selected; the chosen
// level is written to *pLevel and the matching rsma tsdb is returned.
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
  if (!VND_IS_RSMA(pVnode)) {
    return VND_TSDB(pVnode);
  }

  int8_t  precision = pVnode->config.tsdbCfg.precision;
  int64_t now = taosGetTimestamp(precision);

  // scale factor applied to the tolerance for the vnode's time precision
  long scale = 1000000L;
  if (precision == TSDB_TIME_PRECISION_MILLI) {
    scale = 1L;
  } else if (precision == TSDB_TIME_PRECISION_MICRO) {
    scale = 1000L;
  }

  int64_t offset = tsQueryRsmaTolerance * scale;

  int8_t level = 0;
  for (level = 0; level < TSDB_RETENTION_MAX; ++level) {
    SRetention* pRetention = retentions + level;

    // this level is not configured: fall back to the previous one, if any
    if (pRetention->keep <= 0) {
      if (level > 0) {
        --level;
      }
      break;
    }

    if ((now - pRetention->keep) <= (winSKey + offset)) {
      break;
    }
  }

  const char* str = (idStr != NULL) ? idStr : "";
  STsdb*      pTsdb = NULL;

  if (level == TSDB_RETENTION_L0) {
    *pLevel = TSDB_RETENTION_L0;
    pTsdb = VND_RSMA0(pVnode);
  } else if (level == TSDB_RETENTION_L1) {
    *pLevel = TSDB_RETENTION_L1;
    pTsdb = VND_RSMA1(pVnode);
  } else {
    *pLevel = TSDB_RETENTION_L2;
    pTsdb = VND_RSMA2(pVnode);
  }

  tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), *pLevel, str);
  return pTsdb;
}

H
Haojun Liao 已提交
3725
// Compute the version range of a query.
// minVer is the start version of the condition, or 0 when it is -1.
// maxVer is the end version of the condition capped at the vnode's applied version; an end
// version of -1 selects the applied version. `level` is not used.
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
  int64_t startVer = pCond->startVersion;
  if (startVer == -1) {
    startVer = 0;
  }

  int64_t endVer = 0;
  if (pCond->endVersion == -1 || pCond->endVersion > pVnode->state.applied) {
    // user not specified end version (or asked for more than has been applied),
    // set current maximum version of vnode as the endVersion
    endVer = pVnode->state.applied;
  } else {
    endVer = pCond->endVersion;
  }

  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
}

3739
// Check whether the row (key, ver) is covered by a delete range in pDelList.
//
// pDelList is read as a list of TSDBKEY points ordered by ts; two neighbouring points
// delimit a range and the version stored on the lower point is compared against the row.
// *index is a cursor into the list that is advanced in the traversal direction, so
// consecutive calls with monotonically moving keys do not rescan the list.
//
// Params:
//   pDelList   delete points; NULL or empty means nothing has been deleted
//   index      in/out cursor into pDelList
//   key, ver   timestamp and version of the row under test
//   order      traversal order (TSDB_ORDER_ASC or descending)
//   pVerRange  version range of the reader; only consulted in the ascending branches
// Returns true when the row is hidden by a delete range.
bool hasBeenDropped(const SArray* pDelList, int32_t* index, int64_t key, int64_t ver, int32_t order,
                    SVersionRange* pVerRange) {
  if (pDelList == NULL) {
    return false;
  }

  size_t num = taosArrayGetSize(pDelList);
  if (num == 0) {
    // an empty list would make "num - 1" wrap around (size_t) and the lookups below read out of bounds
    return false;
  }

  bool    asc = ASCENDING_TRAVERSE(order);
  int32_t step = asc ? 1 : -1;

  if (asc) {
    if (*index >= num - 1) {
      // the cursor already sits on the last point: only an exact hit on it can still be deleted
      TSDBKEY* last = taosArrayGetLast(pDelList);
      ASSERT(key >= last->ts);

      if (key > last->ts) {
        return false;
      } else if (key == last->ts) {
        if (num < 2) {
          // a single point has no predecessor, hence delimits no range
          return false;
        }

        TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
        return (prev->version >= ver && prev->version <= pVerRange->maxVer &&
                prev->version >= pVerRange->minVer);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);

      if (key < pCurrent->ts) {
        return false;
      }

      if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
          pVerRange->maxVer >= pCurrent->version) {
        return true;
      }

      // move the cursor forward until the range [pCurrent, pNext] reaches the key
      while (pNext->ts <= key && (*index) < num - 1) {
        (*index) += 1;

        if ((*index) < num - 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pNext = taosArrayGet(pDelList, (*index) + 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version == 0 && pNext->version > 0) {
            continue;
          }

          if (pCurrent->ts <= key && pNext->ts >= key && pCurrent->version >= ver &&
              pVerRange->maxVer >= pCurrent->version) {
            return true;
          }
        }
      }

      return false;
    }
  } else {
    if (*index <= 0) {
      // the cursor already sits on the first point: only an exact hit on it can still be deleted
      TSDBKEY* pFirst = taosArrayGet(pDelList, 0);

      if (key < pFirst->ts) {
        return false;
      } else if (key == pFirst->ts) {
        return pFirst->version >= ver;
      } else {
        ASSERT(0);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);

      if (key > pCurrent->ts) {
        return false;
      }

      if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
        return true;
      }

      // move the cursor backward until the range [pPrev, pCurrent] reaches the key
      while (pPrev->ts >= key && (*index) > 1) {
        (*index) += step;

        if ((*index) >= 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pPrev = taosArrayGet(pDelList, (*index) - 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version > 0 && pPrev->version == 0) {
            continue;
          }

          if (pPrev->ts <= key && pCurrent->ts >= key && pPrev->version >= ver) {
            return true;
          }
        }
      }

      return false;
    }
  }

  return false;
}

3842
// Return the first row at or after the iterator's current position that
//   - lies inside the reader's time window,
//   - has a version inside the reader's version range, and
//   - is not hidden by a delete range in pDelList.
// Rows that fail the version/delete checks are skipped by advancing the iterator.
// Returns NULL (and leaves pIter->hasVal == false) when the iterator is exhausted or the
// current row falls outside the time window.
TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
  if (!pIter->hasVal) {
    return NULL;
  }

  for (;;) {
    TSDBROW* pCandidate = tsdbTbDataIterGet(pIter->iter);
    TSDBKEY  rowKey = TSDBROW_KEY(pCandidate);

    if (outOfTimeWindow(rowKey.ts, &pReader->window)) {
      pIter->hasVal = false;
      return NULL;
    }

    // the delete check is only evaluated for rows whose version is visible to this reader
    bool inVerRange = (rowKey.version <= pReader->verRange.maxVer) && (rowKey.version >= pReader->verRange.minVer);
    if (inVerRange &&
        !hasBeenDropped(pDelList, &pIter->index, rowKey.ts, rowKey.version, pReader->order, &pReader->verRange)) {
      return pCandidate;
    }

    // current row is not visible, try the next one
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      return NULL;
    }
  }
}
H
Hongze Cheng 已提交
3881

H
Haojun Liao 已提交
3882 3883 3884
// Feed every following valid row of the memory iterator that carries the timestamp "ts"
// into the reader's row merger. The iterator is advanced first, so the row it currently
// points at is expected to have been handled by the caller already.
//
// Stops when the iterator is exhausted, no valid row is left, or a row with a different
// timestamp shows up (the iterator is left on that row).
//
// Returns TSDB_CODE_SUCCESS, terrno when the schema of a row cannot be resolved, or the
// error reported by tsdbRowMergerAdd.
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, STsdbReader* pReader) {
  SRowMerger* pMerger = &pReader->status.merger;

  while (1) {
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      break;
    }

    // data exists but not valid
    TSDBROW* pRow = getValidMemRow(pIter, pDelList, pReader);
    if (pRow == NULL) {
      break;
    }

    // ts is not identical, quit
    TSDBKEY k = TSDBROW_KEY(pRow);
    if (k.ts != ts) {
      break;
    }

    // only row-format rows need their own schema; column-format rows are added with NULL
    STSchema* pTSchema = NULL;
    if (pRow->type == TSDBROW_ROW_FMT) {
      pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, uid);
      if (pTSchema == NULL) {
        return terrno;
      }
    }

    // do not keep merging on top of a failed add
    int32_t code = tsdbRowMergerAdd(pMerger, pRow, pTSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }

  return TSDB_CODE_SUCCESS;
}

3918
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
3919
                                          SVersionRange* pVerRange, int32_t step) {
3920
  while (rowIndex < pBlockData->nRow && rowIndex >= 0 && pBlockData->aTSKEY[rowIndex] == key) {
3921
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
3922
      rowIndex += step;
3923 3924 3925 3926
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
3927
    tsdbRowMergerAdd(pMerger, &fRow, NULL);
3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938
    rowIndex += step;
  }

  return rowIndex;
}

// Result reported by checkForNeighborFileBlock() through its "state" output argument.
typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,  // the merge ran off the end of the loaded block; the caller keeps probing
  CHECK_FILEBLOCK_QUIT = 0x2,  // default; the caller stops probing neighbor blocks
} CHECK_FILEBLOCK_STATE;

H
Hongze Cheng 已提交
3939
static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SDataBlk* pBlock,
3940 3941
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
3942
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
3943
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
3944
  bool                asc = ASCENDING_TRAVERSE(pReader->order);
3945

3946
  *state = CHECK_FILEBLOCK_QUIT;
3947
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
3948

3949
  bool    loadNeighbor = true;
H
Haojun Liao 已提交
3950
  int32_t code = loadNeighborIfOverlap(pFBlock, pScanInfo, pReader, &loadNeighbor);
3951

H
Haojun Liao 已提交
3952
  if (loadNeighbor && (code == TSDB_CODE_SUCCESS)) {
3953 3954
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
3955
    if ((pDumpInfo->rowIndex >= pDumpInfo->totalRows && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
3956 3957 3958 3959
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

H
Haojun Liao 已提交
3960
  return code;
3961 3962
}

H
Haojun Liao 已提交
3963
// Merge all rows that share the timestamp of the current file-block row
// (pBlockData->aTSKEY[rowIndex]) into the reader's row merger. The dump cursor is moved
// past the current row first; the current row itself is expected to have been added by
// the caller. When the block is used up, neighboring blocks are probed for more rows with
// the same timestamp.
//
// Returns TSDB_CODE_SUCCESS, or the error reported while checking a neighbor block.
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SRowMerger*         pMerger = &pReader->status.merger;

  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  int32_t step = asc ? 1 : -1;

  pDumpInfo->rowIndex += step;
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }

  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      CHECK_FILEBLOCK_STATE st = CHECK_FILEBLOCK_QUIT;

      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
      SDataBlk*           pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);
      if (pFileBlockInfo == NULL) {
        break;
      }

      // a failure while checking the neighbor must reach the caller instead of ending the loop as success
      int32_t code = checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      if (st == CHECK_FILEBLOCK_QUIT) {
        break;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
3999
int32_t doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
4000
                               SRowMerger* pMerger, SVersionRange* pVerRange, const char* idStr) {
4001
  while (nextRowFromLastBlocks(pLastBlockReader, pScanInfo, pVerRange)) {
4002 4003
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 == ts) {
4004 4005
      TSDBROW* pRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      tsdbRowMergerAdd(pMerger, pRow1, NULL);
4006
    } else {
4007 4008 4009
      tsdbTrace("uid:%" PRIu64 " last del index:%d, del range:%d, lastKeyInStt:%" PRId64 ", %s", pScanInfo->uid,
                pScanInfo->lastBlockDelIndex, (int32_t)taosArrayGetSize(pScanInfo->delSkyline), pScanInfo->lastKeyInStt,
                idStr);
4010 4011 4012 4013 4014 4015 4016
      break;
    }
  }

  return TSDB_CODE_SUCCESS;
}

4017
int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, TSDBROW* pResRow,
4018
                                 STsdbReader* pReader, bool* freeTSRow) {
H
Haojun Liao 已提交
4019
  TSDBROW* pNextRow = NULL;
4020
  TSDBROW  current = *pRow;
4021

4022 4023
  {  // if the timestamp of the next valid row has a different ts, return current row directly
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
4024

4025
    if (!pIter->hasVal) {
4026
      *pResRow = *pRow;
4027
      *freeTSRow = false;
4028
      return TSDB_CODE_SUCCESS;
4029
    } else {  // has next point in mem/imem
4030
      pNextRow = getValidMemRow(pIter, pDelList, pReader);
4031
      if (pNextRow == NULL) {
H
Haojun Liao 已提交
4032
        *pResRow = current;
4033
        *freeTSRow = false;
4034
        return TSDB_CODE_SUCCESS;
4035 4036
      }

H
Hongze Cheng 已提交
4037
      if (TSDBROW_TS(&current) != TSDBROW_TS(pNextRow)) {
H
Haojun Liao 已提交
4038
        *pResRow = current;
4039
        *freeTSRow = false;
4040
        return TSDB_CODE_SUCCESS;
4041
      }
4042
    }
4043 4044
  }

H
Haojun Liao 已提交
4045
  terrno = 0;
4046
  int32_t code = 0;
H
Haojun Liao 已提交
4047

4048 4049 4050 4051 4052 4053 4054
  // start to merge duplicated rows
  if (current.type == TSDBROW_ROW_FMT) {
    // get the correct schema for data in memory
    STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(&current), pReader, uid);
    if (pTSchema == NULL) {
      return terrno;
    }
H
Haojun Liao 已提交
4055

H
Haojun Liao 已提交
4056
    code = tsdbRowMergerAdd(&pReader->status.merger, &current, pTSchema);
4057 4058 4059
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
H
Haojun Liao 已提交
4060

4061 4062 4063 4064
    STSchema* pTSchema1 = doGetSchemaForTSRow(TSDBROW_SVERSION(pNextRow), pReader, uid);
    if (pTSchema1 == NULL) {
      return terrno;
    }
H
Haojun Liao 已提交
4065

H
Haojun Liao 已提交
4066
    tsdbRowMergerAdd(&pReader->status.merger,pNextRow, pTSchema1);
4067
  } else {  // let's merge rows in file block
H
Haojun Liao 已提交
4068
    code = tsdbRowMergerAdd(&pReader->status.merger, &current, pReader->pSchema);
4069 4070 4071
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
H
Haojun Liao 已提交
4072

H
Haojun Liao 已提交
4073
    tsdbRowMergerAdd(&pReader->status.merger,pNextRow, NULL);
4074
  }
H
Haojun Liao 已提交
4075

H
Haojun Liao 已提交
4076
  code = doMergeRowsInBuf(pIter, uid, TSDBROW_TS(&current), pDelList, pReader);
H
Haojun Liao 已提交
4077 4078 4079 4080
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

H
Haojun Liao 已提交
4081
  code = tsdbRowMergerGetRow(&pReader->status.merger, &pResRow->pTSRow);
4082 4083 4084
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
M
Minglei Jin 已提交
4085

wmmhello's avatar
wmmhello 已提交
4086
  pResRow->type = TSDBROW_ROW_FMT;
4087
  tsdbRowMergerClear(&pReader->status.merger);
4088
  *freeTSRow = true;
4089

4090
  return TSDB_CODE_SUCCESS;
4091 4092
}

4093
int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
H
Hongze Cheng 已提交
4094
                           SRow** pTSRow) {
H
Haojun Liao 已提交
4095
  SRowMerger* pMerger = &pReader->status.merger;
H
Haojun Liao 已提交
4096

4097 4098 4099
  TSDBKEY   k = TSDBROW_KEY(pRow);
  TSDBKEY   ik = TSDBROW_KEY(piRow);
  STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
4100 4101 4102 4103
  if (pSchema == NULL) {
    return terrno;
  }

4104
  STSchema* piSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
4105 4106 4107
  if (piSchema == NULL) {
    return terrno;
  }
4108

4109
  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
H
Haojun Liao 已提交
4110
    int32_t code = tsdbRowMergerAdd(&pReader->status.merger, piRow, piSchema);
H
Haojun Liao 已提交
4111 4112 4113 4114
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

H
Haojun Liao 已提交
4115
    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4116 4117 4118
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4119

H
Haojun Liao 已提交
4120
    tsdbRowMergerAdd(&pReader->status.merger,pRow, pSchema);
H
Haojun Liao 已提交
4121
    code =
H
Haojun Liao 已提交
4122
        doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4123 4124 4125 4126
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

4127
  } else {
H
Haojun Liao 已提交
4128 4129
    int32_t code = tsdbRowMergerAdd(&pReader->status.merger, pRow, pSchema);
    if (code != TSDB_CODE_SUCCESS || pMerger->pTSchema == NULL) {
H
Haojun Liao 已提交
4130 4131 4132
      return code;
    }

H
Haojun Liao 已提交
4133
    code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4134 4135 4136
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4137

H
Haojun Liao 已提交
4138 4139
    tsdbRowMergerAdd(&pReader->status.merger, piRow, piSchema);
    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, pReader);
H
Haojun Liao 已提交
4140 4141 4142
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
4143
  }
4144

H
Haojun Liao 已提交
4145
  int32_t code = tsdbRowMergerGetRow(pMerger, pTSRow);
4146
  tsdbRowMergerClear(pMerger);
4147
  return code;
4148 4149
}

4150
// Fetch the next row from the in-memory data (mem and imem iterators) of one table.
//
// Rows at or beyond endKey in scan direction (ts >= endKey ascending, ts <= endKey
// descending) are not returned. If both iterators offer a row, the one that comes first
// in scan order wins; on equal timestamps both are merged into one row.
// pResRow is left untouched when nothing qualifies. *freeTSRow tells the caller whether
// pResRow->pTSRow has to be released.
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, TSDBROW* pResRow, int64_t endKey,
                            bool* freeTSRow) {
  SArray*  pDelList = pBlockScanInfo->delSkyline;
  uint64_t uid = pBlockScanInfo->uid;

  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);

  // todo refactor
  const bool asc = ASCENDING_TRAVERSE(pReader->order);

  // discard candidates that have reached the end key
  if (pBlockScanInfo->iter.hasVal) {
    TSDBKEY memKey = TSDBROW_KEY(pRow);
    if (asc ? (memKey.ts >= endKey) : (memKey.ts <= endKey)) {
      pRow = NULL;
    }
  }

  if (pBlockScanInfo->iiter.hasVal) {
    TSDBKEY imemKey = TSDBROW_KEY(piRow);
    if (asc ? (imemKey.ts >= endKey) : (imemKey.ts <= endKey)) {
      piRow = NULL;
    }
  }

  const bool memReady = pBlockScanInfo->iter.hasVal && (pRow != NULL);
  const bool imemReady = pBlockScanInfo->iiter.hasVal && (piRow != NULL);

  if (memReady && imemReady) {
    TSDBKEY k = TSDBROW_KEY(pRow);
    TSDBKEY ik = TSDBROW_KEY(piRow);

    if (ik.ts == k.ts) {
      // same timestamp in both buffers: merge them into a newly built row
      *freeTSRow = true;
      pResRow->type = TSDBROW_ROW_FMT;
      return doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, &pResRow->pTSRow);
    }

    // timestamps differ: take the buffer whose row comes first in scan direction
    bool imemFirst = asc ? (ik.ts < k.ts) : (ik.ts > k.ts);
    if (imemFirst) {
      return doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pResRow, pReader, freeTSRow);
    }

    return doMergeMemTableMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, pResRow, pReader, freeTSRow);
  }

  if (memReady) {
    return doMergeMemTableMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pResRow, pReader,
                                    freeTSRow);
  }

  if (imemReady) {
    return doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pResRow, pReader, freeTSRow);
  }

  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
4208
// Append one row-format row (pTSRow) to the result block.
//
// The requested columns (pReader->suppInfo) and the columns of the row's schema are walked
// in parallel by column id: matching columns are copied, requested columns missing from the
// schema are set to NULL. Afterwards the row count of the block is increased and
// pScanInfo->lastKey is set to the row's timestamp.
//
// Returns TSDB_CODE_SUCCESS, terrno when the schema cannot be resolved, or the error of
// doCopyColVal.
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, SRow* pTSRow, STableBlockScanInfo* pScanInfo) {
  const int32_t rowPos = pBlock->info.rows;
  int64_t       uid = pScanInfo->uid;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  STSchema*           pSchema = doGetSchemaForTSRow(pTSRow->sver, pReader, uid);
  if (pSchema == NULL) {
    return terrno;
  }

  SColVal colVal = {0};
  int32_t outIdx = 0;  // position in the requested column list
  int32_t schIdx = 0;  // position in the schema column list

  // the primary timestamp is written directly
  if (pSupInfo->colId[0] == PRIMARYKEY_TIMESTAMP_COL_ID) {
    SColumnInfoData* pTsCol = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[0]);
    ((int64_t*)pTsCol->pData)[rowPos] = pTSRow->ts;
    outIdx = 1;
  }

  while (outIdx < pSupInfo->numOfCols && schIdx < pSchema->numOfCols) {
    col_id_t wanted = pSupInfo->colId[outIdx];
    col_id_t present = pSchema->columns[schIdx].colId;

    if (wanted > present) {
      // schema column is not requested, skip it
      schIdx += 1;
      continue;
    }

    SColumnInfoData* pDst = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    if (wanted == present) {
      tRowGet(pTSRow, pSchema, schIdx, &colVal);
      int32_t code = doCopyColVal(pDst, rowPos, outIdx, &colVal, pSupInfo);
      if (code) {
        return code;
      }
      outIdx += 1;
      schIdx += 1;
    } else {
      // requested column does not exist in this schema version
      colDataSetNULL(pDst, rowPos);
      outIdx += 1;
    }
  }

  // set null value since current column does not exist in the "pSchema"
  for (; outIdx < pSupInfo->numOfCols; outIdx += 1) {
    SColumnInfoData* pDst = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    colDataSetNULL(pDst, rowPos);
  }

  pBlock->info.dataLoad = 1;
  pBlock->info.rows += 1;
  pScanInfo->lastKey = pTSRow->ts;
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
4264 4265
// Append row "rowIndex" of a column-format block (pBlockData) to the result block.
//
// The timestamp goes to the reader's primary timestamp column; the other requested columns
// are matched against the block's columns by column id. Requested columns without a
// counterpart are set to NULL.
//
// Returns TSDB_CODE_SUCCESS or the error of doCopyColVal.
int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
                                 int32_t rowIndex) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  const int32_t       rowPos = pResBlock->info.rows;

  ((int64_t*)pReader->status.pPrimaryTsCol->pData)[rowPos] = pBlockData->aTSKEY[rowIndex];

  SColVal       cv = {0};
  const int32_t nInput = pBlockData->nColData;
  const int32_t nOutput = pSupInfo->numOfCols;

  int32_t outIdx = 1;  // slot 0 is the timestamp written above
  int32_t inIdx = 0;

  while (outIdx < nOutput && inIdx < nInput) {
    SColData* pSrc = tBlockDataGetColDataByIdx(pBlockData, inIdx);
    if (pSrc->cid < pSupInfo->colId[outIdx]) {
      // block column is not requested, skip it
      inIdx += 1;
      continue;
    }

    SColumnInfoData* pDst = TARRAY_GET_ELEM(pResBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    if (pSrc->cid == pSupInfo->colId[outIdx]) {
      tColDataGetValue(pSrc, rowIndex, &cv);
      int32_t code = doCopyColVal(pDst, rowPos, outIdx, &cv, pSupInfo);
      if (code) {
        return code;
      }
      inIdx += 1;
    } else if (pSrc->cid > pDst->info.colId) {
      // the specified column does not exist in file block, fill with null data
      colDataSetNULL(pDst, rowPos);
    }

    outIdx += 1;
  }

  // requested columns beyond the last block column
  for (; outIdx < nOutput; outIdx += 1) {
    SColumnInfoData* pDst = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[outIdx]);
    colDataSetNULL(pDst, rowPos);
  }

  pResBlock->info.dataLoad = 1;
  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

4312 4313
// Fill the reader's result block with rows taken from the in-memory data of one table.
//
// Rows are fetched one at a time until no row before endKey is left, both memory iterators
// are exhausted, or the block holds "capacity" rows.
//
// Returns TSDB_CODE_SUCCESS or the first error reported while fetching or appending a row.
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
  SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
  int32_t      code = TSDB_CODE_SUCCESS;

  do {
    TSDBROW row = {.type = -1};  // type stays -1 when no row is produced
    bool    freeTSRow = false;

    // a failed fetch must not be mistaken for "no more rows", nor may its row be appended
    code = tsdbGetNextRowInMem(pBlockScanInfo, pReader, &row, endKey, &freeTSRow);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (row.type == -1) {
      break;
    }

    if (row.type == TSDBROW_ROW_FMT) {
      code = doAppendRowFromTSRow(pBlock, pReader, row.pTSRow, pBlockScanInfo);

      // a merged row is owned by this function, release it whether or not the append succeeded
      if (freeTSRow) {
        taosMemoryFree(row.pTSRow);
      }

      if (code) {
        return code;
      }
    } else {
      code = doAppendRowFromFileBlock(pBlock, pReader, row.pBlockData, row.iRow);
      if (code) {
        break;
      }
    }

    // no data in buffer, return immediately
    if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
      break;
    }

    if (pBlock->info.rows >= capacity) {
      break;
    }
  } while (1);

  return code;
}
H
Hongze Cheng 已提交
4355

4356 4357
// TODO refactor: with createDataBlockScanInfo
// Replace the set of tables scanned by the reader with the "num" entries of pTableList
// (read as an array of STableKeyInfo).
//
// Existing scan infos are cleared, the scan-info buffer and the uid list are enlarged when
// the new list is longer than the current table map, and every table is registered in the
// table map with its last-key markers placed just outside the query window on the side
// the scan starts from.
//
// Returns a non-zero code when a buffer cannot be enlarged.
int32_t tsdbSetTableList(STsdbReader* pReader, const void* pTableList, int32_t num) {
  int32_t prevCount = tSimpleHashGetSize(pReader->status.pTableMap);

  // release what the current scan infos hold before they get reused
  int32_t               cursor = 0;
  STableBlockScanInfo** ppInfo = NULL;
  for (ppInfo = tSimpleHashIterate(pReader->status.pTableMap, NULL, &cursor); ppInfo != NULL;
       ppInfo = tSimpleHashIterate(pReader->status.pTableMap, ppInfo, &cursor)) {
    clearBlockScanInfo(*ppInfo);
  }

  if (prevCount < num) {
    int32_t code = ensureBlockScanInfoBuf(&pReader->blockInfoBuf, num);
    if (code) {
      return code;
    }

    // the old list stays valid if the reallocation fails
    void* pNewList = taosMemoryRealloc(pReader->status.uidList.tableUidList, sizeof(uint64_t) * num);
    if (pNewList == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    pReader->status.uidList.tableUidList = (uint64_t*)pNewList;
  }

  tSimpleHashClear(pReader->status.pTableMap);

  STableUidList* pUidList = &pReader->status.uidList;
  pUidList->currentIndex = 0;

  // todo extract method
  // the initial markers depend on the reader only, so they are computed once
  int64_t initLastKey = 0;
  int64_t initLastKeyInStt = 0;
  if (ASCENDING_TRAVERSE(pReader->order)) {
    int64_t skey = pReader->window.skey;
    initLastKey = (skey > INT64_MIN) ? (skey - 1) : skey;
    initLastKeyInStt = skey;
  } else {
    int64_t ekey = pReader->window.ekey;
    initLastKey = (ekey < INT64_MAX) ? (ekey + 1) : ekey;
    initLastKeyInStt = ekey;
  }

  const STableKeyInfo* pKeys = (const STableKeyInfo*)pTableList;
  for (int32_t i = 0; i < num; ++i) {
    STableBlockScanInfo* pInfo = getPosInBlockInfoBuf(&pReader->blockInfoBuf, i);

    pInfo->uid = pKeys[i].uid;
    pUidList->tableUidList[i] = pKeys[i].uid;

    pInfo->lastKey = initLastKey;
    pInfo->lastKeyInStt = initLastKeyInStt;

    tSimpleHashPut(pReader->status.pTableMap, &pInfo->uid, sizeof(uint64_t), &pInfo, POINTER_BYTES);
  }

  return TDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
4408 4409 4410 4411 4412 4413
// NULL-tolerant wrapper around metaGetIdx(): yields NULL when no meta handle is given.
void* tsdbGetIdx(SMeta* pMeta) {
  return (pMeta == NULL) ? NULL : metaGetIdx(pMeta);
}
dengyihao's avatar
dengyihao 已提交
4414

dengyihao's avatar
dengyihao 已提交
4415 4416 4417 4418 4419 4420
// NULL-tolerant wrapper around metaGetIvtIdx(): yields NULL when no meta handle is given.
void* tsdbGetIvtIdx(SMeta* pMeta) {
  return (pMeta == NULL) ? NULL : metaGetIvtIdx(pMeta);
}
L
Liu Jicong 已提交
4421

4422
// Upper bound of the data-version range this reader is allowed to see.
uint64_t tsdbGetReaderMaxVersion(STsdbReader* pReader) {
  const SVersionRange* pRange = &pReader->verRange;
  return pRange->maxVer;
}
4423

4424
// Set up the file-set iterator and the data-block iterator of the reader.
//
// Without any file set the reader is switched to buffer-only mode (loadFromFile = false).
// With file sets, the first file block is initialised unless the reader runs in count-only
// mode. Whenever file loading ends up disabled, the table list index is reset.
// Returns the code of initForFirstBlockInFile(), or TSDB_CODE_SUCCESS when it is not called.
static int32_t doOpenReaderImpl(STsdbReader* pReader) {
  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  int32_t         code = TSDB_CODE_SUCCESS;

  initFilesetIterator(&pStatus->fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(pBlockIter, pReader->order);

  const bool noFiles = (pStatus->fileIter.numOfFiles == 0);
  if (noFiles) {
    pStatus->loadFromFile = false;
  } else if (pReader->readMode != READ_MODE_COUNT_ONLY) {
    // count-only readers do nothing here
    code = initForFirstBlockInFile(pReader, pBlockIter);
  }

  if (!pStatus->loadFromFile) {
    resetTableListIndex(pStatus);
  }

  return code;
}

4447
// Free callback for the reader's schema map: "param" is the address of a stored pointer;
// the pointed-to memory is released through taosMemoryFreeClear().
static void freeSchemaFunc(void* param) {
  void** ppStored = (void**)param;
  taosMemoryFreeClear(*ppStored);
}

H
refact  
Hongze Cheng 已提交
4452
// ====================================== EXPOSED APIs ======================================
4453 4454
// Create a data reader for the given query condition and table list.
//
// Params:
//   pVnode         vnode handle (SVnode*)
//   pCond          query condition; for TIMEWINDOW_RANGE_EXTERNAL its time window and order
//                  are rewritten while the two inner readers are created
//   pTableList     array of STableKeyInfo with numOfTables entries
//   pResBlock      optional result block supplied by the caller
//   ppReader       receives the new reader; set to NULL on failure
//   idstr          id string used in log messages
//   countOnly      switch the reader to count-only mode
//   pIgnoreTables  stored in the reader as is
//
// For TIMEWINDOW_RANGE_EXTERNAL two inner readers with a capacity of one row are created:
// innerReader[0] covers the rows in front of the query window and innerReader[1] the rows
// behind it, both relative to the scan direction.
//
// Returns TSDB_CODE_SUCCESS, or an error code after the partially built reader has been closed.
int32_t tsdbReaderOpen(void* pVnode, SQueryTableDataCond* pCond, void* pTableList, int32_t numOfTables,
                       SSDataBlock* pResBlock, void** ppReader, const char* idstr, bool countOnly,
                       SHashObj** pIgnoreTables) {
  STimeWindow  window = pCond->twindows;
  SVnodeCfg*   pConf = &(((SVnode*)pVnode)->config);
  STsdbReader* pReader = NULL;

  int32_t capacity = pConf->tsdbCfg.maxRows;
  if (pResBlock != NULL) {
    blockDataEnsureCapacity(pResBlock, capacity);
  }

  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, capacity, pResBlock, idstr);
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }

  // check for query time window
  pReader = *ppReader;
  if (isEmptyQueryTimeWindow(&pReader->window) && pCond->type == TIMEWINDOW_RANGE_CONTAINED) {
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }

  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
    int32_t order = pCond->order;

    // rows in front of the window, scanned in reverse so that the closest row comes first
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.ekey = window.skey - 1;
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
      pCond->twindows.skey = window.ekey + 1;
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

    // here we only need one more row, so the capacity is set to be ONE.
    code = tsdbReaderCreate(pVnode, pCond, (void**)&((STsdbReader*)pReader)->innerReader[0], 1, pResBlock, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    // rows behind the window, scanned in the original order
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.skey = window.ekey + 1;
      pCond->twindows.ekey = INT64_MAX;
    } else {
      // a descending scan ends at window.skey, so the following rows are the ones below it;
      // bounding this range by window.ekey would overlap the main window
      pCond->twindows.skey = INT64_MIN;
      pCond->twindows.ekey = window.skey - 1;
    }
    pCond->order = order;

    code = tsdbReaderCreate(pVnode, pCond, (void**)&((STsdbReader*)pReader)->innerReader[1], 1, pResBlock, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  // NOTE: the endVersion in pCond is the data version not schema version, so pCond->endVersion is not correct here.
  //  no valid error code set in metaGetTbTSchema, so let's set the error code here.
  //  we should proceed in case of tmq processing.
  if (pCond->suid != 0) {
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, -1, 1);
    if (pReader->pSchema == NULL) {
      tsdbError("failed to get table schema, suid:%" PRIu64 ", ver:-1, %s", pReader->suid, pReader->idStr);
    }
  } else if (numOfTables > 0) {
    STableKeyInfo* pKey = pTableList;
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, -1, 1);
    if (pReader->pSchema == NULL) {
      tsdbError("failed to get table schema, uid:%" PRIu64 ", ver:-1, %s", pKey->uid, pReader->idStr);
    }
  }

  if (pReader->pSchema != NULL) {
    tsdbRowMergerInit(&pReader->status.merger, pReader->pSchema);
  }

  pReader->pSchemaMap = tSimpleHashInit(8, taosFastHash);
  if (pReader->pSchemaMap == NULL) {
    tsdbError("failed init schema hash for reader %s", pReader->idStr);
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _err;
  }

  tSimpleHashSetFreeFp(pReader->pSchemaMap, freeSchemaFunc);
  if (pReader->pSchema != NULL) {
    code = updateBlockSMAInfo(pReader->pSchema, &pReader->suppInfo);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  // the scan infos are initialised from the first inner reader when there is one
  STsdbReader* p = (pReader->innerReader[0] != NULL) ? pReader->innerReader[0] : pReader;
  pReader->status.pTableMap =
      createDataBlockScanInfo(p, &pReader->blockInfoBuf, pTableList, &pReader->status.uidList, numOfTables);
  if (pReader->status.pTableMap == NULL) {
    // *ppReader is left alone here so that the error path below can close the reader
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _err;
  }

  pReader->status.pLDataIter = taosMemoryCalloc(pConf->sttTrigger, sizeof(SLDataIter));
  if (pReader->status.pLDataIter == NULL) {
    // report the failure through the return value as well, not only through terrno
    code = TSDB_CODE_OUT_OF_MEMORY;
    terrno = code;
    goto _err;
  }

  pReader->flag = READER_STATUS_SUSPEND;

  if (countOnly) {
    pReader->readMode = READ_MODE_COUNT_ONLY;
  }

  pReader->pIgnoreTables = pIgnoreTables;

  tsdbDebug("%p total numOfTable:%d, window:%" PRId64 " - %" PRId64 ", verRange:%" PRId64 " - %" PRId64
            " in this query %s",
            pReader, numOfTables, pReader->window.skey, pReader->window.ekey, pReader->verRange.minVer,
            pReader->verRange.maxVer, pReader->idStr);

  return code;

_err:
  tsdbError("failed to create data reader, code:%s %s", tstrerror(code), idstr);
  tsdbReaderClose(*ppReader);
  *ppReader = NULL;  // reset the pointer value.
  return code;
}

4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596
// Reset to NULL the pointer fields that setSharedPtr() copies from another reader.
// tsdbReaderClose() calls this on an inner reader right before closing it.
static void clearSharedPtr(STsdbReader* p) {
  SReaderStatus* pStatus = &p->status;

  // schema related pointers
  p->pSchemaMap = NULL;
  p->pSchema = NULL;

  // read snapshot
  p->pReadSnap = NULL;

  // scan status pointers
  pStatus->uidList.tableUidList = NULL;
  pStatus->pTableMap = NULL;
  pStatus->pLDataIter = NULL;
}

// Copy the table map, stt data iterators, uid list, schema, schema map and read snapshot
// from pSrc into pDst (shallow copy of the pointers/values only). When a schema is
// available afterwards, the row merger of pDst is initialized with it.
static void setSharedPtr(STsdbReader* pDst, const STsdbReader* pSrc) {
  SReaderStatus*       pTo = &pDst->status;
  const SReaderStatus* pFrom = &pSrc->status;

  pDst->pReadSnap = pSrc->pReadSnap;
  pDst->pSchemaMap = pSrc->pSchemaMap;
  pDst->pSchema = pSrc->pSchema;

  pTo->uidList = pFrom->uidList;
  pTo->pLDataIter = pFrom->pLDataIter;
  pTo->pTableMap = pFrom->pTableMap;

  if (pDst->pSchema == NULL) {
    return;
  }

  tsdbRowMergerInit(&pTo->merger, pDst->pSchema);
}

H
refact  
Hongze Cheng 已提交
4603
/*
 * Close a reader and release the resources it holds.
 *
 * @param pReader  reader to close; NULL is accepted and ignored.
 *
 * The inner readers (innerReader[0] / innerReader[1]) are closed first. Their pointers that
 * were copied from this reader by setSharedPtr() are cleared beforehand, so the recursive close
 * does not release them a second time. pReader itself is freed at the end and must not be used
 * afterwards.
 */
void tsdbReaderClose(STsdbReader* pReader) {
  if (pReader == NULL) {
    return;
  }

  tsdbAcquireReader(pReader);

  // Handle each inner reader slot on its own: the previous code entered the branch when either
  // slot was set and then dereferenced both, which crashes if only one of them exists.
  for (int32_t i = 0; i < 2; ++i) {
    STsdbReader* p = pReader->innerReader[i];
    if (p != NULL) {
      clearSharedPtr(p);
      tsdbReaderClose(p);
    }
  }

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  taosArrayDestroy(pSupInfo->pColAgg);
  for (int32_t i = 0; i < pSupInfo->numOfCols; ++i) {
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }

  // the result block is destroyed here only when the freeBlock flag is set
  if (pReader->resBlockInfo.freeBlock) {
    pReader->resBlockInfo.pResBlock = blockDataDestroy(pReader->resBlockInfo.pResBlock);
  }

  taosMemoryFree(pSupInfo->colId);
  tBlockDataDestroy(&pReader->status.fileBlockData);
  cleanupDataBlockIterator(&pReader->status.blockIter);

  // keep the table count for the cost summary printed below, the map is destroyed right after
  size_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  if (pReader->status.pTableMap != NULL) {
    destroyAllBlockScanInfo(pReader->status.pTableMap);
    clearBlockScanInfoBuf(&pReader->blockInfoBuf);
  }

  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }

  if (pReader->pDelFReader != NULL) {
    tsdbDelFReaderClose(&pReader->pDelFReader);
  }

  if (pReader->pDelIdx != NULL) {
    taosArrayDestroy(pReader->pDelIdx);
    pReader->pDelIdx = NULL;
  }

  qTrace("tsdb/reader-close: %p, untake snapshot", pReader);
  tsdbUntakeReadSnap(pReader, pReader->pReadSnap, true);
  pReader->pReadSnap = NULL;

  tsdbReleaseReader(pReader);

  tsdbUninitReaderLock(pReader);

  taosMemoryFreeClear(pReader->status.pLDataIter);
  taosMemoryFreeClear(pReader->status.uidList.tableUidList);
  SIOCostSummary* pCost = &pReader->cost;

  SFilesetIter* pFilesetIter = &pReader->status.fileIter;
  if (pFilesetIter->pLastBlockReader != NULL) {
    SLastBlockReader* pLReader = pFilesetIter->pLastBlockReader;
    tMergeTreeClose(&pLReader->mergeTree);

    // collect the last-block load statistics before the load info is destroyed
    getLastBlockLoadInfo(pLReader->pInfo, &pCost->lastBlockLoad, &pCost->lastBlockLoadTime);

    pLReader->pInfo = destroyLastBlockLoadInfo(pLReader->pInfo);
    taosMemoryFree(pLReader);
  }

  tsdbDebug(
      "%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
      " SMA-time:%.2f ms, fileBlocks:%" PRId64
      ", fileBlocks-load-time:%.2f ms, "
      "build in-memory-block-time:%.2f ms, lastBlocks:%" PRId64 ", lastBlocks-time:%.2f ms, composed-blocks:%" PRId64
      ", composed-blocks-time:%.2fms, STableBlockScanInfo size:%.2f Kb, createTime:%.2f ms,initDelSkylineIterTime:%.2f "
      "ms, %s",
      pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaDataLoad, pCost->smaLoadTime, pCost->numOfBlocks,
      pCost->blockLoadTime, pCost->buildmemBlock, pCost->lastBlockLoad, pCost->lastBlockLoadTime, pCost->composedBlocks,
      pCost->buildComposedBlockTime, numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pCost->createScanInfoList,
      pCost->initDelSkylineIterTime, pReader->idStr);

  // idStr is used by the summary above, so it is released only now
  taosMemoryFree(pReader->idStr);

  tsdbRowMergerCleanup(&pReader->status.merger);
  taosMemoryFree(pReader->pSchema);

  tSimpleHashCleanup(pReader->pSchemaMap);
  taosMemoryFreeClear(pReader);
}

4703 4704 4705 4706 4707 4708 4709 4710 4711 4712
int32_t tsdbReaderSuspend(STsdbReader* pReader) {
  int32_t code = 0;

  // save reader's base state & reset top state to be reconstructed from base state
  SReaderStatus*       pStatus = &pReader->status;
  STableBlockScanInfo* pBlockScanInfo = NULL;

  if (pStatus->loadFromFile) {
    SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
    if (pBlockInfo != NULL) {
H
Haojun Liao 已提交
4713
      pBlockScanInfo = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, pReader->idStr);
4714 4715 4716 4717
      if (pBlockScanInfo == NULL) {
        goto _err;
      }
    } else {
4718
      pBlockScanInfo = *pStatus->pTableIter;
4719 4720 4721 4722 4723
    }

    tsdbDataFReaderClose(&pReader->pFileReader);

    // resetDataBlockScanInfo excluding lastKey
4724
    STableBlockScanInfo** p = NULL;
H
Haojun Liao 已提交
4725
    int32_t iter = 0;
4726

H
Haojun Liao 已提交
4727
    while ((p = tSimpleHashIterate(pStatus->pTableMap, p, &iter)) != NULL) {
4728 4729 4730 4731 4732 4733 4734 4735
      STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;

      pInfo->iterInit = false;
      pInfo->iter.hasVal = false;
      pInfo->iiter.hasVal = false;

      if (pInfo->iter.iter != NULL) {
        pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
4736 4737
      }

4738 4739 4740 4741 4742
      if (pInfo->iiter.iter != NULL) {
        pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
      }

      pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
4743 4744
    }
  } else {
4745 4746
    // resetDataBlockScanInfo excluding lastKey
    STableBlockScanInfo** p = NULL;
H
Haojun Liao 已提交
4747
    int32_t iter = 0;
4748

H
Haojun Liao 已提交
4749
    while ((p = tSimpleHashIterate(pStatus->pTableMap, p, &iter)) != NULL) {
4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766
      STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;

      pInfo->iterInit = false;
      pInfo->iter.hasVal = false;
      pInfo->iiter.hasVal = false;

      if (pInfo->iter.iter != NULL) {
        pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
      }

      if (pInfo->iiter.iter != NULL) {
        pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
      }

      pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
    }

4767
    pBlockScanInfo = pStatus->pTableIter == NULL ? NULL : *pStatus->pTableIter;
4768 4769
    if (pBlockScanInfo) {
      // save lastKey to restore memory iterator
4770
      STimeWindow w = pReader->resBlockInfo.pResBlock->info.window;
4771 4772 4773 4774
      pBlockScanInfo->lastKey = ASCENDING_TRAVERSE(pReader->order) ? w.ekey : w.skey;

      // reset current current table's data block scan info,
      pBlockScanInfo->iterInit = false;
4775 4776
      pBlockScanInfo->iter.hasVal = false;
      pBlockScanInfo->iiter.hasVal = false;
4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791
      if (pBlockScanInfo->iter.iter != NULL) {
        pBlockScanInfo->iter.iter = tsdbTbDataIterDestroy(pBlockScanInfo->iter.iter);
      }

      if (pBlockScanInfo->iiter.iter != NULL) {
        pBlockScanInfo->iiter.iter = tsdbTbDataIterDestroy(pBlockScanInfo->iiter.iter);
      }

      pBlockScanInfo->pBlockList = taosArrayDestroy(pBlockScanInfo->pBlockList);
      tMapDataClear(&pBlockScanInfo->mapData);
      // TODO: keep skyline for reuse
      pBlockScanInfo->delSkyline = taosArrayDestroy(pBlockScanInfo->delSkyline);
    }
  }

4792
  tsdbUntakeReadSnap(pReader, pReader->pReadSnap, false);
4793
  pReader->pReadSnap = NULL;
H
Haojun Liao 已提交
4794
  pReader->flag = READER_STATUS_SUSPEND;
4795

4796 4797
  tsdbDebug("reader: %p suspended uid %" PRIu64 " in this query %s", pReader, pBlockScanInfo ? pBlockScanInfo->uid : 0,
            pReader->idStr);
4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808
  return code;

_err:
  tsdbError("failed to suspend data reader, code:%s %s", tstrerror(code), pReader->idStr);
  return code;
}

static int32_t tsdbSetQueryReseek(void* pQHandle) {
  int32_t      code = 0;
  STsdbReader* pReader = pQHandle;

4809
  code = tsdbTryAcquireReader(pReader);
4810
  if (code == 0) {
H
Haojun Liao 已提交
4811
    if (pReader->flag == READER_STATUS_SUSPEND) {
4812
      tsdbReleaseReader(pReader);
4813 4814 4815 4816
      return code;
    }

    tsdbReaderSuspend(pReader);
4817

4818
    tsdbReleaseReader(pReader);
4819

4820
    return code;
4821 4822 4823
  } else if (code == EBUSY) {
    return TSDB_CODE_VND_QUERY_BUSY;
  } else {
4824 4825
    terrno = TAOS_SYSTEM_ERROR(code);
    return TSDB_CODE_FAILED;
4826 4827 4828 4829 4830 4831
  }
}

/*
 * Resume a suspended reader: take a new read snapshot and re-open the reader (or, for a reader
 * whose type is not TIMEWINDOW_RANGE_CONTAINED, its "previous" inner reader). Nothing is opened
 * when the table map is empty. On success the reader flag becomes READER_STATUS_NORMAL.
 *
 * @param pReader  reader to resume
 * @return 0 on success, otherwise the error code of the failing step. Every failure is logged
 *         through the common _err path.
 */
int32_t tsdbReaderResume(STsdbReader* pReader) {
  int32_t code = 0;

  STableBlockScanInfo** pBlockScanInfo = pReader->status.pTableIter;

  //  restore reader's state
  //  task snapshot
  int32_t numOfTables = tSimpleHashGetSize(pReader->status.pTableMap);
  if (numOfTables > 0) {
    qTrace("tsdb/reader: %p, take snapshot", pReader);
    code = tsdbTakeReadSnap(pReader, tsdbSetQueryReseek, &pReader->pReadSnap);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
      code = doOpenReaderImpl(pReader);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    } else {
      STsdbReader* pPrevReader = pReader->innerReader[0];
      STsdbReader* pNextReader = pReader->innerReader[1];

      // we need only one row
      pPrevReader->resBlockInfo.capacity = 1;
      setSharedPtr(pPrevReader, pReader);

      pNextReader->resBlockInfo.capacity = 1;
      setSharedPtr(pNextReader, pReader);

      // only the previous-row reader is opened here
      code = doOpenReaderImpl(pPrevReader);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  }

  pReader->flag = READER_STATUS_NORMAL;
  tsdbDebug("reader: %p resumed uid %" PRIu64 ", numOfTable:%" PRId32 ", in this query %s", pReader,
            pBlockScanInfo ? (*pBlockScanInfo)->uid : 0, numOfTables, pReader->idStr);
  return code;

_err:
  tsdbError("failed to resume data reader, code:%s %s", tstrerror(code), pReader->idStr);
  return code;
}

D
dapan1121 已提交
4877
/*
 * Count-only read: accumulate the row count from files and from memory and publish it through
 * the result block (info.rows), without loading any column data.
 *
 * @return true when the result block reports at least one row; false when loadFromFile is not
 *         set, when one of the two counting steps fails, or when the count is zero.
 */
static bool tsdbReadRowsCountOnly(STsdbReader* pReader) {
  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;

  if (!pReader->status.loadFromFile) {
    return false;
  }

  if (readRowsCountFromFiles(pReader) != TSDB_CODE_SUCCESS) {
    return false;
  }

  if (readRowsCountFromMem(pReader) != TSDB_CODE_SUCCESS) {
    return false;
  }

  // hand the accumulated counter over to the result block and restart the counter
  pResBlock->info.dataLoad = 0;
  pResBlock->info.id.uid = 0;
  pResBlock->info.rows = pReader->rowsNum;
  pReader->rowsNum = 0;

  return pResBlock->info.rows > 0;
}

4904
/*
 * Produce the next result block of one reader.
 *
 * @param pReader  reader to advance; its result block is cleaned up first
 * @param hasNext  [out] true when the result block holds rows after the call
 * @return TSDB_CODE_SUCCESS or the error code of the failing build step
 */
static int32_t doTsdbNextDataBlock(STsdbReader* pReader, bool* hasNext) {
  int32_t code = TSDB_CODE_SUCCESS;

  // cleanup the data that belongs to the previous data block
  SSDataBlock* pBlock = pReader->resBlockInfo.pResBlock;
  blockDataCleanup(pBlock);

  *hasNext = false;

  SReaderStatus* pStatus = &pReader->status;
  if (tSimpleHashGetSize(pStatus->pTableMap) == 0) {
    return code;
  }

  if (READ_MODE_COUNT_ONLY == pReader->readMode) {
    // tsdbReadRowsCountOnly() returns a bool ("rows available"), not an error code: returning it
    // directly would report "has rows" as error code 1 and leave *hasNext false.
    *hasNext = tsdbReadRowsCountOnly(pReader);
    return code;
  }

  if (pStatus->loadFromFile) {
    code = buildBlockFromFiles(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // nothing produced from the files, continue with the buffer
    if (pBlock->info.rows <= 0) {
      resetTableListIndex(&pReader->status);
      code = buildBlockFromBufferSequentially(pReader);
    }
  } else {  // no data in files, let's try the buffer
    code = buildBlockFromBufferSequentially(pReader);
  }

  *hasNext = pBlock->info.rows > 0;

  return code;
}

4941
/*
 * Advance the reader to its next data block.
 *
 * For a reader with inner readers the sequence is: one block from innerReader[0]
 * (EXTERNAL_ROWS_PREV), then the blocks of the reader itself (EXTERNAL_ROWS_MAIN), then one
 * block from innerReader[1] (EXTERNAL_ROWS_NEXT).
 *
 * Locking: the reader lock is taken here. It is released before returning, except when a block
 * is available and it is not a composed block; in that case the lock stays held and
 * tsdbRetrieveDataBlock() releases it after loading the block.
 *
 * @param pReader  reader to advance (resumed first when it is suspended)
 * @param hasNext  [out] true when a block is available
 * @return TSDB_CODE_SUCCESS, pReader->code when the reader already carries an error, or the
 *         error code of the failing step
 */
int32_t tsdbNextDataBlock(STsdbReader* pReader, bool* hasNext) {
  int32_t code = TSDB_CODE_SUCCESS;

  *hasNext = false;

  if (isEmptyQueryTimeWindow(&pReader->window) || pReader->step == EXTERNAL_ROWS_NEXT || pReader->code != TSDB_CODE_SUCCESS) {
    return (pReader->code != TSDB_CODE_SUCCESS)? pReader->code:code;
  }

  SReaderStatus* pStatus = &pReader->status;

  code = tsdbAcquireReader(pReader);
  qTrace("tsdb/read: %p, take read mutex, code: %d", pReader, code);

  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  if (pReader->innerReader[0] != NULL && pReader->step == 0) {
    code = doTsdbNextDataBlock(pReader->innerReader[0], hasNext);
    if (code) {
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_PREV;
    if (*hasNext) {
      pStatus = &pReader->innerReader[0]->status;
      if (pStatus->composedDataBlock) {
        qTrace("tsdb/read: %p, unlock read mutex", pReader);
        tsdbReleaseReader(pReader);
      }

      return code;
    }
  }

  if (pReader->step == EXTERNAL_ROWS_PREV) {
    // prepare for the main scan
    code = doOpenReaderImpl(pReader);
    int32_t step = 1;
    resetAllDataBlockScanInfo(pReader->status.pTableMap, pReader->innerReader[0]->window.ekey, step);

    if (code != TSDB_CODE_SUCCESS) {
      // do not leave with the reader lock held
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_MAIN;
  }

  code = doTsdbNextDataBlock(pReader, hasNext);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbReleaseReader(pReader);
    return code;
  }

  if (*hasNext) {
    if (pStatus->composedDataBlock) {
      qTrace("tsdb/read: %p, unlock read mutex", pReader);
      tsdbReleaseReader(pReader);
    }

    return code;
  }

  if (pReader->step == EXTERNAL_ROWS_MAIN && pReader->innerReader[1] != NULL) {
    // prepare for the next row scan
    int32_t step = -1;
    code = doOpenReaderImpl(pReader->innerReader[1]);
    resetAllDataBlockScanInfo(pReader->innerReader[1]->status.pTableMap, pReader->window.ekey, step);
    if (code != TSDB_CODE_SUCCESS) {
      // do not leave with the reader lock held
      tsdbReleaseReader(pReader);
      return code;
    }

    code = doTsdbNextDataBlock(pReader->innerReader[1], hasNext);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }

    pReader->step = EXTERNAL_ROWS_NEXT;
    if (*hasNext) {
      pStatus = &pReader->innerReader[1]->status;
      if (pStatus->composedDataBlock) {
        qTrace("tsdb/read: %p, unlock read mutex", pReader);
        tsdbReleaseReader(pReader);
      }

      return code;
    }
  }

  qTrace("tsdb/read: %p, unlock read mutex", pReader);
  tsdbReleaseReader(pReader);

  return code;
}

G
Ganlin Zhao 已提交
5043 5044
/*
 * Merge the timestamp aggregate and "all NULL" placeholders into pSup->pColAgg.
 *
 * pTsAgg is inserted at position 0 first. Then the aggregate list and the requested column ids
 * (pSup->colId[0..numOfCols)) are walked in parallel; for every requested column that has no
 * aggregate entry and is not the primary timestamp column, an entry with numOfNull = numOfRows
 * is inserted at the current list position.
 *
 * @return true when at least one placeholder entry was inserted
 */
static bool doFillNullColSMA(SBlockLoadSuppInfo* pSup, int32_t numOfRows, int32_t numOfCols, SColumnDataAgg* pTsAgg) {
  bool    inserted = false;
  int32_t aggIdx = 0;
  int32_t colIdx = 0;

  // number of entries once the timestamp aggregate has been put in front
  int32_t numOfAgg = (int32_t)taosArrayGetSize(pSup->pColAgg) + 1;
  taosArrayInsert(pSup->pColAgg, 0, pTsAgg);

  while (colIdx < numOfCols && aggIdx < numOfAgg) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, aggIdx);

    if (pAgg->colId < pSup->colId[colIdx]) {  // aggregate of a column that is not requested
      aggIdx += 1;
      continue;
    }

    if (pAgg->colId == pSup->colId[colIdx]) {  // requested column has an aggregate
      aggIdx += 1;
      colIdx += 1;
      continue;
    }

    // requested column is missing in the aggregate list
    if (pSup->colId[colIdx] != PRIMARYKEY_TIMESTAMP_COL_ID) {
      SColumnDataAgg nullColAgg = {.colId = pSup->colId[colIdx], .numOfNull = numOfRows};
      taosArrayInsert(pSup->pColAgg, aggIdx, &nullColAgg);
      aggIdx += 1;
      numOfAgg += 1;
      inserted = true;
    }
    colIdx += 1;
  }

  // requested columns left after the aggregate list is exhausted
  for (; colIdx < numOfCols; ++colIdx) {
    if (pSup->colId[colIdx] == PRIMARYKEY_TIMESTAMP_COL_ID) {
      continue;
    }

    SColumnDataAgg nullColAgg = {.colId = pSup->colId[colIdx], .numOfNull = numOfRows};
    taosArrayInsert(pSup->pColAgg, aggIdx, &nullColAgg);
    aggIdx += 1;
    inserted = true;
  }

  return inserted;
}

G
Ganlin Zhao 已提交
5083
/*
 * Load the block-level statistics (SMA) of the current file block.
 *
 * @param pReader     reader positioned on a block
 * @param pDataBlock  block whose pBlockAgg receives the per-slot aggregate pointers
 * @param allHave     [out] set to false on entry; true when every requested column that was
 *                    matched has an aggregate entry
 * @param hasNullSMA  [out] set to true when placeholder entries had to be added for columns
 *                    without statistics; not written otherwise
 * @return TSDB_CODE_SUCCESS (also when no SMA is available, in which case pBlockAgg is NULL),
 *         TSDB_CODE_OUT_OF_MEMORY when the aggregate pointer array cannot be allocated, or the
 *         error code of tsdbReadBlockSma()
 */
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SSDataBlock* pDataBlock, bool* allHave, bool *hasNullSMA) {
  SColumnDataAgg*** pBlockSMA = &pDataBlock->pBlockAgg;

  int32_t code = 0;
  *allHave = false;
  *pBlockSMA = NULL;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    return TSDB_CODE_SUCCESS;
  }

  // there is no statistics data for composed block
  if (pReader->status.composedDataBlock || (!pReader->suppInfo.smaValid)) {
    return TSDB_CODE_SUCCESS;
  }

  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

  // the result block belongs to another table than the current file block
  if (pReader->resBlockInfo.pResBlock->info.id.uid != pFBlock->uid) {
    return TSDB_CODE_SUCCESS;
  }

  int64_t st = taosGetTimestampUs();

  SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
  if (tDataBlkHasSma(pBlock)) {
    code = tsdbReadBlockSma(pReader->pFileReader, pBlock, pSup->pColAgg);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pFBlock->uid, tstrerror(code),
                pReader->idStr);
      return code;
    }
  } else {
    *pBlockSMA = NULL;
    return TSDB_CODE_SUCCESS;
  }

  *allHave = true;

  // always load the first primary timestamp column data
  SColumnDataAgg* pTsAgg = &pSup->tsColAgg;

  pTsAgg->numOfNull = 0;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  pTsAgg->min = pReader->resBlockInfo.pResBlock->info.window.skey;
  pTsAgg->max = pReader->resBlockInfo.pResBlock->info.window.ekey;

  // update the number of NULL data rows
  size_t numOfCols = pSup->numOfCols;

  // ensure capacity
  if (pDataBlock->pDataBlock) {
    size_t colsNum = taosArrayGetSize(pDataBlock->pDataBlock);
    taosArrayEnsureCap(pSup->pColAgg, colsNum);
  }

  SSDataBlock* pResBlock = pReader->resBlockInfo.pResBlock;
  if (pResBlock->pBlockAgg == NULL) {
    size_t num = taosArrayGetSize(pResBlock->pDataBlock);
    pResBlock->pBlockAgg = taosMemoryCalloc(num, POINTER_BYTES);
    if (num > 0 && pResBlock->pBlockAgg == NULL) {
      // the array is indexed below, so an allocation failure must not be ignored
      *allHave = false;
      return TSDB_CODE_OUT_OF_MEMORY;
    }
  }

  // do fill all null column value SMA info
  if (doFillNullColSMA(pSup, pBlock->nRow, numOfCols, pTsAgg)) {
    *hasNullSMA = true;
    return TSDB_CODE_SUCCESS;
  }
  size_t size = taosArrayGetSize(pSup->pColAgg);

  // match the aggregate entries with the requested columns; both are walked in column id order
  int32_t i = 0, j = 0;
  while (j < numOfCols && i < size) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, i);
    if (pAgg->colId == pSup->colId[j]) {
      pResBlock->pBlockAgg[pSup->slotId[j]] = pAgg;
      i += 1;
      j += 1;
    } else if (pAgg->colId < pSup->colId[j]) {
      i += 1;
    } else if (pSup->colId[j] < pAgg->colId) {
      pResBlock->pBlockAgg[pSup->slotId[j]] = NULL;
      *allHave = false;
      j += 1;
    }
  }

  *pBlockSMA = pResBlock->pBlockAgg;
  pReader->cost.smaDataLoad += 1;

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.smaLoadTime += elapsedTime;

  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", %s", 0, pFBlock->uid, pReader->idStr);
  return code;
}

H
Haojun Liao 已提交
5179 5180
/*
 * Look up the scan info of table `uid` in pTableMap.
 *
 * @param id  query id string, used for logging only
 * @return the scan info, or NULL (terrno set to TSDB_CODE_INVALID_PARA and an error logged)
 *         when the uid has no entry or the entry holds a NULL pointer
 */
STableBlockScanInfo* getTableBlockScanInfo(SSHashObj* pTableMap, uint64_t uid, const char* id) {
  STableBlockScanInfo** ppInfo = tSimpleHashGet(pTableMap, &uid, sizeof(uid));
  if (ppInfo != NULL && *ppInfo != NULL) {
    return *ppInfo;
  }

  terrno = TSDB_CODE_INVALID_PARA;
  int32_t numOfTables = tSimpleHashGetSize(pTableMap);
  tsdbError("failed to locate the uid:%" PRIu64 " in query table uid list, total tables:%d, %s", uid, numOfTables, id);
  return NULL;
}

H
Haojun Liao 已提交
5191
/*
 * Load the current file block and copy it into the result block of the reader.
 *
 * @return the result block, or NULL when the reader already carries an error, when the table of
 *         the current block is not in the table map, or when loading/copying fails (terrno is
 *         set to the failure code and the file block data is destroyed in the last case)
 */
static SSDataBlock* doRetrieveDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pStatus->blockIter);

  if (pReader->code != TSDB_CODE_SUCCESS) {
    return NULL;
  }

  STableBlockScanInfo* pScanInfo = getTableBlockScanInfo(pStatus->pTableMap, pBlockInfo->uid, pReader->idStr);
  if (pScanInfo == NULL) {
    return NULL;
  }

  // load first, copy only when the load succeeded; both steps share the same failure handling
  int32_t code = doLoadFileBlockData(pReader, &pStatus->blockIter, &pStatus->fileBlockData, pScanInfo->uid);
  if (code == TSDB_CODE_SUCCESS) {
    code = copyBlockDataToSDataBlock(pReader);
  }

  if (code != TSDB_CODE_SUCCESS) {
    tBlockDataDestroy(&pStatus->fileBlockData);
    terrno = code;
    return NULL;
  }

  return pReader->resBlockInfo.pResBlock;
}

H
Haojun Liao 已提交
5222
/*
 * Return the data of the block the reader is currently positioned on.
 *
 * For a TIMEWINDOW_RANGE_EXTERNAL reader the block comes from innerReader[0] while the step is
 * EXTERNAL_ROWS_PREV and from innerReader[1] while it is EXTERNAL_ROWS_NEXT. A composed block is
 * returned as is; otherwise the file block is loaded and the reader lock of pReader is released
 * afterwards.
 *
 * @param pIdList  not used by this function
 * @return the result block, or NULL when loading the file block fails
 */
SSDataBlock* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
  STsdbReader* pTReader = pReader;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    switch (pReader->step) {
      case EXTERNAL_ROWS_PREV:
        pTReader = pReader->innerReader[0];
        break;
      case EXTERNAL_ROWS_NEXT:
        pTReader = pReader->innerReader[1];
        break;
      default:
        break;
    }
  }

  if (pTReader->status.composedDataBlock) {
    return pTReader->resBlockInfo.pResBlock;
  }

  SSDataBlock* pRes = doRetrieveDataBlock(pTReader);

  qTrace("tsdb/read-retrieve: %p, unlock read mutex", pReader);
  tsdbReleaseReader(pReader);

  return pRes;
}

H
Haojun Liao 已提交
5245
/*
 * Reset the reader so that it scans again with the order and time window given in pCond.
 * A suspended reader is resumed first. The reader type becomes TIMEWINDOW_RANGE_CONTAINED.
 *
 * @return TSDB_CODE_SUCCESS (also when the query window is empty or no snapshot is held, in
 *         which case nothing is reset), or the error code of the resume / first-block step
 */
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
  int32_t code = TSDB_CODE_SUCCESS;

  qTrace("tsdb/reader-reset: %p, take read mutex", pReader);
  tsdbAcquireReader(pReader);

  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  if (isEmptyQueryTimeWindow(&pReader->window) || pReader->pReadSnap == NULL) {
    tsdbDebug("tsdb reader reset return %p, %s", pReader->pReadSnap, pReader->idStr);
    tsdbReleaseReader(pReader);
    return TSDB_CODE_SUCCESS;
  }

  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pIter = &pStatus->blockIter;

  // take over the new scan conditions
  pReader->order = pCond->order;
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
  pStatus->loadFromFile = true;
  pStatus->pTableIter = NULL;
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);

  // start with a clean timestamp aggregate and without an open data file reader
  memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));

  pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  tsdbDataFReaderClose(&pReader->pFileReader);

  int32_t numOfTables = tSimpleHashGetSize(pStatus->pTableMap);

  initFilesetIterator(&pStatus->fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(pIter, pReader->order);
  resetTableListIndex(&pReader->status);

  // the start key lies just outside the query window, on the side the scan starts from
  int32_t step = -1;
  int64_t startKey = pReader->window.ekey + 1;
  if (ASCENDING_TRAVERSE(pReader->order)) {
    step = 1;
    startKey = pReader->window.skey - 1;
  }
  resetAllDataBlockScanInfo(pStatus->pTableMap, startKey, step);

  if (pStatus->fileIter.numOfFiles != 0) {
    code = initForFirstBlockInFile(pReader, pIter);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);

      tsdbReleaseReader(pReader);
      return code;
    }
  } else {
    // no data in files, let's try buffer in memory
    pStatus->loadFromFile = false;
    resetTableListIndex(pStatus);
  }

  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, skey:%" PRId64 ", query range:%" PRId64 " - %" PRId64
            " in query %s",
            pReader, pReader->suid, numOfTables, pCond->twindows.skey, pReader->window.skey, pReader->window.ekey,
            pReader->idStr);

  tsdbReleaseReader(pReader);

  return code;
}
H
Hongze Cheng 已提交
5315

5316
/**
 * Map the row count of a data block onto a histogram bucket.
 *
 * @param startRow     row count at which bucket 0 starts
 * @param bucketRange  width of one bucket, in rows
 * @param numOfRows    row count of the block being classified
 * @param numOfBucket  number of buckets in the histogram
 * @return a bucket index in [0, numOfBucket - 1]; 0 when numOfRows is below
 *         startRow, or when bucketRange / numOfBucket is not positive.
 */
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows, int32_t numOfBucket) {
  // A non-positive bucket width would divide by zero (or flip the sign of the
  // index); a non-positive bucket count leaves no valid slot other than 0.
  if (numOfRows < startRow || bucketRange <= 0 || numOfBucket <= 0) {
    return 0;
  }

  // Widen before subtracting so that extreme operands cannot overflow int32_t.
  int64_t bucketIndex = ((int64_t)numOfRows - (int64_t)startRow) / bucketRange;

  // Everything at or beyond the upper edge lands in the last bucket, so the
  // result is always a valid index into an array of numOfBucket entries.
  if (bucketIndex >= numOfBucket) {
    bucketIndex = numOfBucket - 1;
  }
  return (int32_t)bucketIndex;
}
H
Hongze Cheng 已提交
5326

5327 5328 5329 5330
/**
 * Collect block distribution statistics for the tables attached to a reader.
 *
 * Resets totalSize/totalRows, sets numOfVgroups to 1, then iterates over the
 * data blocks exposed by the reader's block iterator and accumulates row
 * counts, block sizes, min/max rows, the small-block count and the row-count
 * histogram into pTableBlockInfo. Counters such as numOfFiles, numOfBlocks,
 * numOfSmallBlocks and the histogram are added to, not reset, here.
 *
 * @param pReader          reader to inspect; resumed first if it is suspended
 * @param pTableBlockInfo  output statistics
 * @return TSDB_CODE_SUCCESS, or the error code reported while resuming the
 *         reader or while moving on to the next set of blocks.
 */
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  const int32_t numOfBuckets = 20;
  const int     smallBlockRows = 4096;  // blocks with fewer rows are counted as "small"
  int32_t       code = TSDB_CODE_SUCCESS;

  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;
  pTableBlockInfo->numOfVgroups = 1;

  // the reader must be active before its iterators are touched
  tsdbAcquireReader(pReader);
  if (pReader->flag == READER_STATUS_SUSPEND) {
    code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pIter = &pStatus->blockIter;
  STsdbCfg*       pCfg = &pReader->pTsdb->pVnode->config.tsdbCfg;

  pTableBlockInfo->defMinRows = pCfg->minRows;
  pTableBlockInfo->defMaxRows = pCfg->maxRows;

  // width of one histogram bucket over the configured [minRows, maxRows] range
  int32_t bucketRange = (int32_t)ceil(((double)(pCfg->maxRows - pCfg->minRows)) / numOfBuckets);

  pTableBlockInfo->numOfFiles += 1;
  pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;
  pTableBlockInfo->numOfTables = (int32_t)tSimpleHashGetSize(pStatus->pTableMap);

  bool hasNext = (pIter->numOfBlocks > 0);
  if (hasNext) {
    pTableBlockInfo->numOfBlocks += pIter->numOfBlocks;
  }

  for (;;) {
    if (!hasNext) {
      // current set of blocks is consumed; presumably this advances to the next file set
      code = initForFirstBlockInFile(pReader, pIter);
      if ((code != TSDB_CODE_SUCCESS) || (pStatus->loadFromFile == false)) {
        break;
      }

      pTableBlockInfo->numOfBlocks += pIter->numOfBlocks;
      hasNext = (pIter->numOfBlocks > 0);
      continue;
    }

    SDataBlk* pBlk = getCurrentBlock(pIter);
    int32_t   rowsInBlock = pBlk->nRow;

    pTableBlockInfo->totalRows += rowsInBlock;
    pTableBlockInfo->totalSize += pBlk->aSubBlock[0].szBlock;

    if (pTableBlockInfo->maxRows < rowsInBlock) {
      pTableBlockInfo->maxRows = rowsInBlock;
    }

    if (pTableBlockInfo->minRows > rowsInBlock) {
      pTableBlockInfo->minRows = rowsInBlock;
    }

    if (rowsInBlock < smallBlockRows) {
      pTableBlockInfo->numOfSmallBlocks += 1;
    }

    int32_t slot = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, rowsInBlock, numOfBuckets);
    pTableBlockInfo->blockRowsHisto[slot]++;

    hasNext = blockIteratorNext(&pStatus->blockIter, pReader->idStr);
  }

  tsdbReleaseReader(pReader);
  return code;
}
H
Hongze Cheng 已提交
5408

H
refact  
Hongze Cheng 已提交
5409
/**
 * Sum the number of rows held in the snapshot's memtables (pMem and pIMem)
 * for every table in the reader's table map.
 *
 * @param pReader  reader to inspect; resumed first if it is suspended
 * @return the accumulated row count.
 *         NOTE(review): when resuming the reader fails, the error code itself
 *         is returned through the same int64_t value - callers cannot tell it
 *         apart from a row count; confirm this is intended.
 */
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;
  int64_t        totalRows = 0;

  tsdbAcquireReader(pReader);
  if (pReader->flag == READER_STATUS_SUSPEND) {
    int32_t code = tsdbReaderResume(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbReleaseReader(pReader);
      return code;
    }
  }

  // visit every table; the cursor is kept in pStatus->pTableIter and ends up NULL
  int32_t iter = 0;
  for (pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, NULL, &iter); pStatus->pTableIter != NULL;
       pStatus->pTableIter = tSimpleHashIterate(pStatus->pTableMap, pStatus->pTableIter, &iter)) {
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;

    // rows of this table in the snapshot's pMem
    if (pReader->pReadSnap->pMem != NULL) {
      STbData* pMemData = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pScanInfo->uid);
      if (pMemData != NULL) {
        totalRows += tsdbGetNRowsInTbData(pMemData);
      }
    }

    // rows of this table in the snapshot's pIMem
    if (pReader->pReadSnap->pIMem != NULL) {
      STbData* pIMemData = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pScanInfo->uid);
      if (pIMemData != NULL) {
        totalRows += tsdbGetNRowsInTbData(pIMemData);
      }
    }
  }

  tsdbReleaseReader(pReader);
  return totalRows;
}
D
dapan1121 已提交
5453

H
Haojun Liao 已提交
5454
/**
 * Fetch the schema of a child table or a normal table.
 *
 * @param pVnode   vnode handle (used as an SVnode*)
 * @param uid      uid of the table whose schema is requested
 * @param pSchema  output: schema, filled by metaGetTbTSchemaEx
 * @param suid     output: uid of the super table for a child table, 0 for a
 *                 normal table; left untouched if the first lookup fails
 * @return result of metaGetTbTSchemaEx on success;
 *         TSDB_CODE_TDB_INVALID_TABLE_ID if a table entry lookup fails;
 *         TSDB_CODE_INVALID_PARA for any other table type.
 *         On failure terrno is set to the returned code.
 */
int32_t tsdbGetTableSchema(void* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
  SVnode*     pVnodeObj = (SVnode*)pVnode;
  SMetaReader reader = {0};

  metaReaderInit(&reader, pVnodeObj->pMeta, 0);

  if (metaReaderGetTableEntryByUidCache(&reader, uid) != TSDB_CODE_SUCCESS) {
    terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
    goto _fail;
  }

  *suid = 0;

  // only child table and ordinary table is allowed, super table is not allowed.
  if (reader.me.type == TSDB_CHILD_TABLE) {
    tDecoderClear(&reader.coder);
    *suid = reader.me.ctbEntry.suid;

    // the super table entry must be resolvable as well
    if (metaReaderGetTableEntryByUidCache(&reader, *suid) != TSDB_CODE_SUCCESS) {
      terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
      goto _fail;
    }
  } else if (reader.me.type != TSDB_NORMAL_TABLE) {
    terrno = TSDB_CODE_INVALID_PARA;
    goto _fail;
  }

  metaReaderClear(&reader);

  // get the newest table schema version
  return metaGetTbTSchemaEx(pVnodeObj->pMeta, *suid, uid, -1, pSchema);

_fail:
  metaReaderClear(&reader);
  return terrno;
}
H
Hongze Cheng 已提交
5489

H
Hongze Cheng 已提交
5490
/**
 * Take a read snapshot of the tsdb for a reader.
 *
 * Under the tsdb read lock, references the mutable (mem) and immutable (imem)
 * memtables whose version range overlaps the reader's version range, and
 * references the file system state into the snapshot.
 *
 * @param pReader  reader the snapshot is taken for; stored as the query handle
 *                 of each node passed to tsdbRefMemTable
 * @param reseek   callback stored in each of those nodes
 * @param ppSnap   output: the new snapshot on success, NULL on failure
 * @return 0 on success, TSDB_CODE_OUT_OF_MEMORY on allocation failure, or the
 *         error code returned by tsdbFSRef.
 *
 * On failure every memtable reference already taken is dropped again with
 * tsdbUnrefMemTable before its node is freed, mirroring tsdbUntakeReadSnap;
 * otherwise the reference would leak and the memtable would be left with a
 * node that has been freed.
 */
int32_t tsdbTakeReadSnap(STsdbReader* pReader, _query_reseek_func_t reseek, STsdbReadSnap** ppSnap) {
  int32_t        code = 0;
  STsdb*         pTsdb = pReader->pTsdb;
  SVersionRange* pRange = &pReader->verRange;

  // alloc
  STsdbReadSnap* pSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(*pSnap));
  if (pSnap == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _exit;
  }

  // lock
  taosThreadRwlockRdlock(&pTsdb->rwLock);

  // take snapshot
  if (pTsdb->mem && (pRange->minVer <= pTsdb->mem->maxVer && pRange->maxVer >= pTsdb->mem->minVer)) {
    pSnap->pNode = taosMemoryMalloc(sizeof(*pSnap->pNode));
    if (pSnap->pNode == NULL) {
      taosThreadRwlockUnlock(&pTsdb->rwLock);
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _exit;
    }
    pSnap->pNode->pQHandle = pReader;
    pSnap->pNode->reseek = reseek;

    tsdbRefMemTable(pTsdb->mem, pSnap->pNode);
    // pMem is recorded only once the reference is held, so the cleanup below
    // can use it to decide whether an unref is required
    pSnap->pMem = pTsdb->mem;
  }

  if (pTsdb->imem && (pRange->minVer <= pTsdb->imem->maxVer && pRange->maxVer >= pTsdb->imem->minVer)) {
    pSnap->pINode = taosMemoryMalloc(sizeof(*pSnap->pINode));
    if (pSnap->pINode == NULL) {
      taosThreadRwlockUnlock(&pTsdb->rwLock);
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _exit;
    }
    pSnap->pINode->pQHandle = pReader;
    pSnap->pINode->reseek = reseek;

    tsdbRefMemTable(pTsdb->imem, pSnap->pINode);
    pSnap->pIMem = pTsdb->imem;
  }

  // fs
  code = tsdbFSRef(pTsdb, &pSnap->fs);
  if (code) {
    taosThreadRwlockUnlock(&pTsdb->rwLock);
    goto _exit;
  }

  // unlock
  taosThreadRwlockUnlock(&pTsdb->rwLock);

  tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode));

_exit:
  if (code) {
    *ppSnap = NULL;
    if (pSnap) {
      // undo the references taken above before releasing their nodes
      // NOTE(review): 'true' is passed as the proactive flag because the
      // reader itself is giving the reference back - confirm against
      // tsdbUnrefMemTable
      if (pSnap->pMem) {
        tsdbUnrefMemTable(pSnap->pMem, pSnap->pNode, true);
      }
      if (pSnap->pIMem) {
        tsdbUnrefMemTable(pSnap->pIMem, pSnap->pINode, true);
      }

      if (pSnap->pNode) taosMemoryFree(pSnap->pNode);
      if (pSnap->pINode) taosMemoryFree(pSnap->pINode);
      taosMemoryFree(pSnap);
    }
  } else {
    *ppSnap = pSnap;
  }
  return code;
}

5560
/**
 * Release a read snapshot.
 *
 * Drops the memtable references recorded in the snapshot, releases the file
 * system reference, then frees the nodes and the snapshot itself. A NULL
 * snapshot is accepted; only the trace line is emitted in that case.
 *
 * @param pReader    reader that owns the snapshot (supplies the tsdb handle)
 * @param pSnap      snapshot to release, may be NULL
 * @param proactive  forwarded unchanged to tsdbUnrefMemTable
 */
void tsdbUntakeReadSnap(STsdbReader* pReader, STsdbReadSnap* pSnap, bool proactive) {
  STsdb* pTsdb = pReader->pTsdb;

  if (pSnap != NULL) {
    // memtable references go first, while their nodes are still alive
    if (pSnap->pMem != NULL) {
      tsdbUnrefMemTable(pSnap->pMem, pSnap->pNode, proactive);
    }
    if (pSnap->pIMem != NULL) {
      tsdbUnrefMemTable(pSnap->pIMem, pSnap->pINode, proactive);
    }

    tsdbFSUnref(pTsdb, &pSnap->fs);

    if (pSnap->pNode != NULL) {
      taosMemoryFree(pSnap->pNode);
    }
    if (pSnap->pINode != NULL) {
      taosMemoryFree(pSnap->pINode);
    }
    taosMemoryFree(pSnap);
  }

  tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
}
5579 5580 5581 5582 5583

/**
 * Replace the id string of a reader with a copy of idstr.
 *
 * If idstr is NULL or the copy cannot be made, the reader keeps its current
 * id string (if failed, do nothing).
 *
 * @param pReader  reader whose idStr is replaced
 * @param idstr    new id string; it is duplicated, the caller keeps ownership
 */
void tsdbReaderSetId(STsdbReader* pReader, const char* idstr) {
  if (idstr == NULL) {
    return;
  }

  // Duplicate before releasing the old string: a failed duplication then
  // leaves the previous id intact, and passing the reader's own idStr as
  // idstr does not read freed memory.
  char* pNewId = taosStrdup(idstr);
  if (pNewId == NULL) {
    return;
  }

  taosMemoryFreeClear(pReader->idStr);
  pReader->idStr = pNewId;
}
H
Haojun Liao 已提交
5585

H
Haojun Liao 已提交
5586
// Mark the reader as cancelled by storing TSDB_CODE_TSC_QUERY_CANCELLED in its code field.
void tsdbReaderSetCloseFlag(STsdbReader* pReader) {
  pReader->code = TSDB_CODE_TSC_QUERY_CANCELLED;
}