tsdbRead.c 151.6 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Haojun Liao 已提交
16
#include "osDef.h"
H
Hongze Cheng 已提交
17
#include "tsdb.h"
18

H
Hongze Cheng 已提交
19
#define ASCENDING_TRAVERSE(o) (o == TSDB_ORDER_ASC)
H
Hongze Cheng 已提交
20

21 22 23 24 25 26
typedef enum {
  EXTERNAL_ROWS_PREV = 0x1,
  EXTERNAL_ROWS_MAIN = 0x2,
  EXTERNAL_ROWS_NEXT = 0x3,
} EContentData;

27
typedef struct {
dengyihao's avatar
dengyihao 已提交
28
  STbDataIter* iter;
29 30 31 32
  int32_t      index;
  bool         hasVal;
} SIterInfo;

33 34
typedef struct {
  int32_t numOfBlocks;
35
  int32_t numOfLastFiles;
36 37
} SBlockNumber;

38
typedef struct SBlockIndex {
39 40
  int32_t     ordinalIndex;
  int64_t     inFileOffset;
H
Haojun Liao 已提交
41
  STimeWindow window;  // todo replace it with overlap flag.
42 43
} SBlockIndex;

H
Haojun Liao 已提交
44
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
45 46
  uint64_t  uid;
  TSKEY     lastKey;
H
Hongze Cheng 已提交
47
  SMapData  mapData;            // block info (compressed)
48
  SArray*   pBlockList;         // block data index list, SArray<SBlockIndex>
H
Hongze Cheng 已提交
49 50 51 52 53 54
  SIterInfo iter;               // mem buffer skip list iterator
  SIterInfo iiter;              // imem buffer skip list iterator
  SArray*   delSkyline;         // delete info for this table
  int32_t   fileDelIndex;       // file block delete index
  int32_t   lastBlockDelIndex;  // delete index for last block
  bool      iterInit;           // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
55 56 57
} STableBlockScanInfo;

typedef struct SBlockOrderWrapper {
dengyihao's avatar
dengyihao 已提交
58
  int64_t uid;
59
  int64_t offset;
H
Haojun Liao 已提交
60
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
61 62

typedef struct SBlockOrderSupporter {
63 64 65 66
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
67 68 69
} SBlockOrderSupporter;

typedef struct SIOCostSummary {
70 71 72
  int64_t numOfBlocks;
  double  blockLoadTime;
  double  buildmemBlock;
73
  int64_t headFileLoad;
74
  double  headFileLoadTime;
75
  int64_t smaDataLoad;
76
  double  smaLoadTime;
77 78
  int64_t lastBlockLoad;
  double  lastBlockLoadTime;
H
Haojun Liao 已提交
79 80
  int64_t composedBlocks;
  double  buildComposedBlockTime;
H
Haojun Liao 已提交
81
  double  createScanInfoList;
H
Haojun Liao 已提交
82 83
//  double  getTbFromMemTime;
//  double  getTbFromIMemTime;
84
  double  initDelSkylineIterTime;
H
Hongze Cheng 已提交
85 86 87
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
88 89 90 91 92 93 94
  SArray*        pColAgg;
  SColumnDataAgg tsColAgg;
  int16_t*       colId;
  int16_t*       slotId;
  int32_t        numOfCols;
  char**         buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
  bool           smaValid;  // the sma on all queried columns are activated
H
Hongze Cheng 已提交
95 96
} SBlockLoadSuppInfo;

97
typedef struct SLastBlockReader {
H
Hongze Cheng 已提交
98 99 100 101 102
  STimeWindow        window;
  SVersionRange      verRange;
  int32_t            order;
  uint64_t           uid;
  SMergeTree         mergeTree;
103
  SSttBlockLoadInfo* pInfo;
104 105
} SLastBlockReader;

106
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
107 108 109
  int32_t           numOfFiles;  // number of total files
  int32_t           index;       // current accessed index in the list
  SArray*           pFileList;   // data file list
110
  int32_t           order;
H
Hongze Cheng 已提交
111
  SLastBlockReader* pLastBlockReader;  // last file block reader
112
} SFilesetIter;
H
Haojun Liao 已提交
113 114

typedef struct SFileDataBlockInfo {
115
  // index position in STableBlockScanInfo in order to check whether neighbor block overlaps with it
dengyihao's avatar
dengyihao 已提交
116
  uint64_t uid;
117
  int32_t  tbBlockIdx;
H
Haojun Liao 已提交
118 119 120
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
121
  int32_t   numOfBlocks;
122
  int32_t   index;
H
Hongze Cheng 已提交
123
  SArray*   blockList;  // SArray<SFileDataBlockInfo>
124
  int32_t   order;
125
  SDataBlk  block;  // current SDataBlk data
126
  SHashObj* pTableMap;
H
Haojun Liao 已提交
127 128 129
} SDataBlockIter;

typedef struct SFileBlockDumpInfo {
dengyihao's avatar
dengyihao 已提交
130 131 132 133
  int32_t totalRows;
  int32_t rowIndex;
  int64_t lastKey;
  bool    allDumped;
H
Haojun Liao 已提交
134 135
} SFileBlockDumpInfo;

136
typedef struct SUidOrderCheckInfo {
137 138
  uint64_t* tableUidList;  // access table uid list in uid ascending order list
  int32_t   currentIndex;  // index in table uid list
139 140
} SUidOrderCheckInfo;

H
Haojun Liao 已提交
141
typedef struct SReaderStatus {
H
Hongze Cheng 已提交
142 143 144
  bool                  loadFromFile;       // check file stage
  bool                  composedDataBlock;  // the returned data block is a composed block or not
  SHashObj*             pTableMap;          // SHash<STableBlockScanInfo>
145
  STableBlockScanInfo** pTableIter;         // table iterator used in building in-memory buffer data blocks.
H
Hongze Cheng 已提交
146 147 148 149 150 151
  SUidOrderCheckInfo    uidCheckInfo;       // check all table in uid order
  SFileBlockDumpInfo    fBlockDumpInfo;
  SDFileSet*            pCurrentFileset;  // current opened file set
  SBlockData            fileBlockData;
  SFilesetIter          fileIter;
  SDataBlockIter        blockIter;
H
Haojun Liao 已提交
152 153
} SReaderStatus;

154
typedef struct SBlockInfoBuf {
H
Hongze Cheng 已提交
155 156 157
  int32_t currentIndex;
  SArray* pData;
  int32_t numPerBucket;
158 159
} SBlockInfoBuf;

H
Hongze Cheng 已提交
160
struct STsdbReader {
H
Haojun Liao 已提交
161 162 163
  STsdb*             pTsdb;
  uint64_t           suid;
  int16_t            order;
H
Haojun Liao 已提交
164
  bool               freeBlock;
H
Haojun Liao 已提交
165 166 167 168
  STimeWindow        window;  // the primary query time window that applies to all queries
  SSDataBlock*       pResBlock;
  int32_t            capacity;
  SReaderStatus      status;
169 170
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
171
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
172
  STsdbReadSnap*     pReadSnap;
173
  SIOCostSummary     cost;
174 175 176 177 178
  STSchema*          pSchema;      // the newest version schema
  STSchema*          pMemSchema;   // the previous schema for in-memory data, to avoid load schema too many times
  SDataFReader*      pFileReader;  // the file reader
  SDelFReader*       pDelFReader;  // the del file reader
  SArray*            pDelIdx;      // del file block index;
179
  SVersionRange      verRange;
180 181 182
  SBlockInfoBuf      blockInfoBuf;
  int32_t            step;
  STsdbReader*       innerReader[2];
H
Hongze Cheng 已提交
183
};
H
Hongze Cheng 已提交
184

H
Haojun Liao 已提交
185
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
186 187
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
188
static TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
189 190
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                        SRowMerger* pMerger);
H
Hongze Cheng 已提交
191
static int32_t  doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
192
                                       SRowMerger* pMerger, SVersionRange* pVerRange);
193
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
dengyihao's avatar
dengyihao 已提交
194
                                 STsdbReader* pReader);
H
Hongze Cheng 已提交
195
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow,
H
Haojun Liao 已提交
196
                                     STableBlockScanInfo* pScanInfo);
197
static int32_t  doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
H
Hongze Cheng 已提交
198
                                         int32_t rowIndex);
199
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
H
Hongze Cheng 已提交
200 201
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order,
                               SVersionRange* pVerRange);
202

H
Hongze Cheng 已提交
203 204 205 206
static int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
                                        STSRow** pTSRow, STsdbReader* pReader, bool* freeTSRow);
static int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo,
                                  STsdbReader* pReader, STSRow** pTSRow);
207 208
static int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                                     STsdbReader* pReader);
209

dengyihao's avatar
dengyihao 已提交
210 211 212 213
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
214
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Hongze Cheng 已提交
215 216 217
static int64_t       getCurrentKeyInLastBlock(SLastBlockReader* pLastBlockReader);
static bool          hasDataInLastBlock(SLastBlockReader* pLastBlockReader);
static int32_t       doBuildDataBlock(STsdbReader* pReader);
H
Haojun Liao 已提交
218
static TSDBKEY       getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader);
219
static bool          hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo);
220
static void          initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter);
221
static int32_t       getInitialDelIndex(const SArray* pDelSkyline, int32_t order);
H
Haojun Liao 已提交
222

223 224
static FORCE_INLINE STSchema* getLatestTableSchema(STsdbReader* pReader, uint64_t uid);

225 226
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) { return (ts > pWindow->ekey) || (ts < pWindow->skey); }

227 228
static int32_t setColumnIdSlotList(SBlockLoadSuppInfo* pSupInfo, SColumnInfo* pCols, const int32_t* pSlotIdList,
                                   int32_t numOfCols) {
229
  pSupInfo->smaValid = true;
230
  pSupInfo->numOfCols = numOfCols;
231
  pSupInfo->colId = taosMemoryMalloc(numOfCols * (sizeof(int16_t) * 2 + POINTER_BYTES));
H
Haojun Liao 已提交
232 233
  if (pSupInfo->colId == NULL) {
    taosMemoryFree(pSupInfo->colId);
H
Haojun Liao 已提交
234 235
    return TSDB_CODE_OUT_OF_MEMORY;
  }
H
Hongze Cheng 已提交
236

H
Haojun Liao 已提交
237
  pSupInfo->slotId = (int16_t*)((char*)pSupInfo->colId + (sizeof(int16_t) * numOfCols));
238
  pSupInfo->buildBuf = (char**)((char*)pSupInfo->slotId + (sizeof(int16_t) * numOfCols));
H
Haojun Liao 已提交
239
  for (int32_t i = 0; i < numOfCols; ++i) {
H
Haojun Liao 已提交
240 241
    pSupInfo->colId[i] = pCols[i].colId;
    pSupInfo->slotId[i] = pSlotIdList[i];
242

H
Haojun Liao 已提交
243 244
    if (IS_VAR_DATA_TYPE(pCols[i].type)) {
      pSupInfo->buildBuf[i] = taosMemoryMalloc(pCols[i].bytes);
H
Haojun Liao 已提交
245 246
    } else {
      pSupInfo->buildBuf[i] = NULL;
247
    }
H
Haojun Liao 已提交
248
  }
H
Hongze Cheng 已提交
249

H
Haojun Liao 已提交
250 251
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
252

H
Haojun Liao 已提交
253
static int32_t updateBlockSMAInfo(STSchema* pSchema, SBlockLoadSuppInfo* pSupInfo) {
254 255
  int32_t i = 0, j = 0;

256
  while (i < pSchema->numOfCols && j < pSupInfo->numOfCols) {
257
    STColumn* pTCol = &pSchema->columns[i];
H
Haojun Liao 已提交
258
    if (pTCol->colId == pSupInfo->colId[j]) {
259 260
      if (!IS_BSMA_ON(pTCol)) {
        pSupInfo->smaValid = false;
H
Haojun Liao 已提交
261
        return TSDB_CODE_SUCCESS;
262 263 264 265
      }

      i += 1;
      j += 1;
H
Haojun Liao 已提交
266
    } else if (pTCol->colId < pSupInfo->colId[j]) {
267 268 269
      // do nothing
      i += 1;
    } else {
H
Haojun Liao 已提交
270
      return TSDB_CODE_INVALID_PARA;
271 272
    }
  }
H
Haojun Liao 已提交
273 274

  return TSDB_CODE_SUCCESS;
275 276
}

277
static int32_t initBlockScanInfoBuf(SBlockInfoBuf* pBuf, int32_t numOfTables) {
H
Hongze Cheng 已提交
278
  int32_t num = numOfTables / pBuf->numPerBucket;
279 280 281 282 283
  int32_t remainder = numOfTables % pBuf->numPerBucket;
  if (pBuf->pData == NULL) {
    pBuf->pData = taosArrayInit(num + 1, POINTER_BYTES);
  }

H
Hongze Cheng 已提交
284
  for (int32_t i = 0; i < num; ++i) {
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
    char* p = taosMemoryCalloc(pBuf->numPerBucket, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    taosArrayPush(pBuf->pData, &p);
  }

  if (remainder > 0) {
    char* p = taosMemoryCalloc(remainder, sizeof(STableBlockScanInfo));
    if (p == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }
    taosArrayPush(pBuf->pData, &p);
  }

  return TSDB_CODE_SUCCESS;
}

static void clearBlockScanInfoBuf(SBlockInfoBuf* pBuf) {
  size_t num = taosArrayGetSize(pBuf->pData);
H
Hongze Cheng 已提交
306
  for (int32_t i = 0; i < num; ++i) {
307 308 309 310 311 312 313 314 315
    char** p = taosArrayGet(pBuf->pData, i);
    taosMemoryFree(*p);
  }

  taosArrayDestroy(pBuf->pData);
}

static void* getPosInBlockInfoBuf(SBlockInfoBuf* pBuf, int32_t index) {
  int32_t bucketIndex = index / pBuf->numPerBucket;
H
Hongze Cheng 已提交
316
  char**  pBucket = taosArrayGet(pBuf->pData, bucketIndex);
317 318 319 320
  return (*pBucket) + (index % pBuf->numPerBucket) * sizeof(STableBlockScanInfo);
}

// NOTE: speedup the whole processing by preparing the buffer for STableBlockScanInfo in batch model
321 322
static SHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, SBlockInfoBuf* pBuf, const STableKeyInfo* idList,
                                         int32_t numOfTables) {
H
Haojun Liao 已提交
323
  // allocate buffer in order to load data blocks from file
324
  // todo use simple hash instead, optimize the memory consumption
325 326 327
  SHashObj* pTableMap =
      taosHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK);
  if (pTableMap == NULL) {
H
Haojun Liao 已提交
328 329 330
    return NULL;
  }

H
Haojun Liao 已提交
331
  int64_t st = taosGetTimestampUs();
H
Haojun Liao 已提交
332
  initBlockScanInfoBuf(pBuf, numOfTables);
H
Haojun Liao 已提交
333

334
  for (int32_t j = 0; j < numOfTables; ++j) {
H
Haojun Liao 已提交
335
    STableBlockScanInfo* pScanInfo = getPosInBlockInfoBuf(pBuf, j);
336 337 338 339 340 341 342 343 344 345
    pScanInfo->uid = idList[j].uid;
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
      int64_t skey = pTsdbReader->window.skey;
      pScanInfo->lastKey = (skey > INT64_MIN) ? (skey - 1) : skey;
    } else {
      int64_t ekey = pTsdbReader->window.ekey;
      pScanInfo->lastKey = (ekey < INT64_MAX) ? (ekey + 1) : ekey;
    }

    taosHashPut(pTableMap, &pScanInfo->uid, sizeof(uint64_t), &pScanInfo, POINTER_BYTES);
H
Hongze Cheng 已提交
346 347
    tsdbTrace("%p check table uid:%" PRId64 " from lastKey:%" PRId64 " %s", pTsdbReader, pScanInfo->uid,
              pScanInfo->lastKey, pTsdbReader->idStr);
H
Haojun Liao 已提交
348 349
  }

H
Haojun Liao 已提交
350 351 352 353
  pTsdbReader->cost.createScanInfoList = (taosGetTimestampUs() - st) / 1000.0;
  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, elapsed time:%.2f ms, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->cost.createScanInfoList,
            pTsdbReader->idStr);
354

355
  return pTableMap;
H
Hongze Cheng 已提交
356
}
H
Hongze Cheng 已提交
357

358 359
static void resetAllDataBlockScanInfo(SHashObj* pTableMap, int64_t ts) {
  STableBlockScanInfo** p = NULL;
dengyihao's avatar
dengyihao 已提交
360
  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
H
Hongze Cheng 已提交
361
    STableBlockScanInfo* pInfo = *(STableBlockScanInfo**)p;
362 363

    pInfo->iterInit = false;
H
Haojun Liao 已提交
364
    pInfo->iter.hasVal = false;
365
    pInfo->iiter.hasVal = false;
H
Haojun Liao 已提交
366

367 368
    if (pInfo->iter.iter != NULL) {
      pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
369 370
    }

H
Haojun Liao 已提交
371 372 373 374
    if (pInfo->iiter.iter != NULL) {
      pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
    }

375 376
    pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
    pInfo->lastKey = ts;
377 378 379
  }
}

380 381
static void clearBlockScanInfo(STableBlockScanInfo* p) {
  p->iterInit = false;
H
Haojun Liao 已提交
382 383

  p->iter.hasVal = false;
384
  p->iiter.hasVal = false;
385

386 387 388
  if (p->iter.iter != NULL) {
    p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
  }
389

390 391 392
  if (p->iiter.iter != NULL) {
    p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
  }
393

394 395 396 397
  p->delSkyline = taosArrayDestroy(p->delSkyline);
  p->pBlockList = taosArrayDestroy(p->pBlockList);
  tMapDataClear(&p->mapData);
}
398

H
Haojun Liao 已提交
399
static void destroyAllBlockScanInfo(SHashObj* pTableMap) {
400
  void* p = NULL;
H
Haojun Liao 已提交
401
  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
402
    clearBlockScanInfo(*(STableBlockScanInfo**)p);
403 404 405 406 407
  }

  taosHashCleanup(pTableMap);
}

408
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) { return pWindow->skey > pWindow->ekey; }
H
Hongze Cheng 已提交
409

410 411 412
// Update the query time window according to the data time to live(TTL) information, in order to avoid to return
// the expired data to client, even it is queried already.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
dengyihao's avatar
dengyihao 已提交
413
  STsdbKeepCfg* pCfg = &pTsdb->keepCfg;
H
Hongze Cheng 已提交
414

415
  int64_t now = taosGetTimestamp(pCfg->precision);
dengyihao's avatar
dengyihao 已提交
416
  int64_t earilyTs = now - (tsTickPerMin[pCfg->precision] * pCfg->keep2) + 1;  // needs to add one tick
417

dengyihao's avatar
dengyihao 已提交
418
  STimeWindow win = *pWindow;
419 420 421 422 423 424
  if (win.skey < earilyTs) {
    win.skey = earilyTs;
  }

  return win;
}
H
Hongze Cheng 已提交
425

H
Haojun Liao 已提交
426
// init file iterator
427
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet, STsdbReader* pReader) {
H
Hongze Cheng 已提交
428
  size_t numOfFileset = taosArrayGetSize(aDFileSet);
429

430 431
  pIter->index = ASCENDING_TRAVERSE(pReader->order) ? -1 : numOfFileset;
  pIter->order = pReader->order;
H
Hongze Cheng 已提交
432
  pIter->pFileList = aDFileSet;
433
  pIter->numOfFiles = numOfFileset;
H
Haojun Liao 已提交
434

435 436 437 438
  if (pIter->pLastBlockReader == NULL) {
    pIter->pLastBlockReader = taosMemoryCalloc(1, sizeof(struct SLastBlockReader));
    if (pIter->pLastBlockReader == NULL) {
      int32_t code = TSDB_CODE_OUT_OF_MEMORY;
439
      tsdbError("failed to prepare the last block iterator, since:%s %s", tstrerror(code), pReader->idStr);
440 441
      return code;
    }
442 443
  }

444 445 446 447 448 449 450 451
  SLastBlockReader* pLReader = pIter->pLastBlockReader;
  pLReader->order = pReader->order;
  pLReader->window = pReader->window;
  pLReader->verRange = pReader->verRange;

  pLReader->uid = 0;
  tMergeTreeClose(&pLReader->mergeTree);

452
  if (pLReader->pInfo == NULL) {
453
    // here we ignore the first column, which is always be the primary timestamp column
454
    pLReader->pInfo =
H
Haojun Liao 已提交
455
        tCreateLastBlockLoadInfo(pReader->pSchema, &pReader->suppInfo.colId[1], pReader->suppInfo.numOfCols - 1);
H
Haojun Liao 已提交
456 457 458 459
    if (pLReader->pInfo == NULL) {
      tsdbDebug("init fileset iterator failed, code:%s %s", tstrerror(terrno), pReader->idStr);
      return terrno;
    }
460 461
  }

462
  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
H
Haojun Liao 已提交
463 464 465
  return TSDB_CODE_SUCCESS;
}

466
static bool filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader) {
467 468
  bool    asc = ASCENDING_TRAVERSE(pIter->order);
  int32_t step = asc ? 1 : -1;
469 470 471
  pIter->index += step;

  if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
H
Haojun Liao 已提交
472 473 474
    return false;
  }

H
Haojun Liao 已提交
475 476 477
  SIOCostSummary* pSum = &pReader->cost;
  getLastBlockLoadInfo(pIter->pLastBlockReader->pInfo, &pSum->lastBlockLoad, &pReader->cost.lastBlockLoadTime);

478 479
  pIter->pLastBlockReader->uid = 0;
  tMergeTreeClose(&pIter->pLastBlockReader->mergeTree);
480
  resetLastBlockLoadInfo(pIter->pLastBlockReader->pInfo);
481

H
Haojun Liao 已提交
482 483
  // check file the time range of coverage
  STimeWindow win = {0};
H
Hongze Cheng 已提交
484

485
  while (1) {
H
Haojun Liao 已提交
486 487 488
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }
489

490
    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);
H
Haojun Liao 已提交
491

492 493 494 495
    int32_t code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
H
Haojun Liao 已提交
496

497 498
    pReader->cost.headFileLoad += 1;

499 500 501 502 503 504 505 506 507 508 509 510
    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);

    // current file are no longer overlapped with query time window, ignore remain files
    if ((asc && win.skey > pReader->window.ekey) || (!asc && win.ekey < pReader->window.skey)) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return false;
    }

    if ((asc && (win.ekey < pReader->window.skey)) || ((!asc) && (win.skey > pReader->window.ekey))) {
      pIter->index += step;
511 512 513
      if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
        return false;
      }
514 515
      continue;
    }
C
Cary Xu 已提交
516

517
    tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
518
              pReader->window.ekey, pReader->idStr);
519 520
    return true;
  }
521

H
Haojun Liao 已提交
522
_err:
H
Haojun Liao 已提交
523 524 525
  return false;
}

526
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
527 528
  pIter->order = order;
  pIter->index = -1;
529
  pIter->numOfBlocks = 0;
530 531 532 533 534 535 536
  if (pIter->blockList == NULL) {
    pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
  } else {
    taosArrayClear(pIter->blockList);
  }
}

L
Liu Jicong 已提交
537
static void cleanupDataBlockIterator(SDataBlockIter* pIter) { taosArrayDestroy(pIter->blockList); }
H
Haojun Liao 已提交
538

H
Haojun Liao 已提交
539
static void initReaderStatus(SReaderStatus* pStatus) {
dengyihao's avatar
dengyihao 已提交
540 541
  pStatus->pTableIter = NULL;
  pStatus->loadFromFile = true;
H
Haojun Liao 已提交
542 543
}

544 545 546 547 548 549 550 551
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pResBlock = createDataBlock();
  if (pResBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
H
Haojun Liao 已提交
552
    SColumnInfoData colInfo = {0};
553 554 555 556 557 558 559 560 561 562 563 564 565
    colInfo.info = pCond->colList[i];
    blockDataAppendColInfo(pResBlock, &colInfo);
  }

  int32_t code = blockDataEnsureCapacity(pResBlock, capacity);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    taosMemoryFree(pResBlock);
    return NULL;
  }
  return pResBlock;
}

566
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, STsdbReader** ppReader, int32_t capacity,
H
Haojun Liao 已提交
567
                                SSDataBlock* pResBlock, const char* idstr) {
H
Haojun Liao 已提交
568
  int32_t      code = 0;
569
  int8_t       level = 0;
H
Haojun Liao 已提交
570
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
H
Hongze Cheng 已提交
571 572
  if (pReader == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
573
    goto _end;
H
Hongze Cheng 已提交
574 575
  }

C
Cary Xu 已提交
576
  if (VND_IS_TSMA(pVnode)) {
H
Haojun Liao 已提交
577
    tsdbDebug("vgId:%d, tsma is selected to query, %s", TD_VID(pVnode), idstr);
C
Cary Xu 已提交
578 579
  }

H
Haojun Liao 已提交
580
  initReaderStatus(&pReader->status);
581

L
Liu Jicong 已提交
582
  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
dengyihao's avatar
dengyihao 已提交
583 584
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
585
  pReader->capacity = capacity;
H
Haojun Liao 已提交
586
  pReader->pResBlock = pResBlock;
dengyihao's avatar
dengyihao 已提交
587 588
  pReader->idStr = (idstr != NULL) ? strdup(idstr) : NULL;
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
589
  pReader->type = pCond->type;
590
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
H
Hongze Cheng 已提交
591
  pReader->blockInfoBuf.numPerBucket = 1000;  // 1000 tables per bucket
H
Hongze Cheng 已提交
592

H
Haojun Liao 已提交
593 594 595 596 597 598 599 600 601
  if (pReader->pResBlock == NULL) {
    pReader->freeBlock = true;
    pReader->pResBlock = createResBlock(pCond, pReader->capacity);
    if (pReader->pResBlock == NULL) {
      code = terrno;
      goto _end;
    }
  }

H
Haojun Liao 已提交
602 603 604 605 606 607
  if (pCond->numOfCols <= 0) {
    tsdbError("vgId:%d, invalid column number %d in query cond, %s", TD_VID(pVnode), pCond->numOfCols, idstr);
    code = TSDB_CODE_INVALID_PARA;
    goto _end;
  }

608 609
  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
610
  pSup->pColAgg = taosArrayInit(pCond->numOfCols, sizeof(SColumnDataAgg));
H
Haojun Liao 已提交
611
  if (pSup->pColAgg == NULL) {
612 613 614
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }
H
Haojun Liao 已提交
615

616 617
  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;

H
Hongze Cheng 已提交
618
  code = tBlockDataCreate(&pReader->status.fileBlockData);
H
Haojun Liao 已提交
619 620 621 622 623
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

H
Haojun Liao 已提交
624
  setColumnIdSlotList(&pReader->suppInfo, pCond->colList, pCond->pSlotList, pCond->numOfCols);
625

H
Hongze Cheng 已提交
626 627
  *ppReader = pReader;
  return code;
H
Hongze Cheng 已提交
628

H
Haojun Liao 已提交
629
_end:
H
Haojun Liao 已提交
630
  tsdbReaderClose(pReader);
H
Hongze Cheng 已提交
631 632 633
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
634

H
Haojun Liao 已提交
635
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
636
  // SArray* aBlockIdx = taosArrayInit(8, sizeof(SBlockIdx));
H
Hongze Cheng 已提交
637

638
  int64_t st = taosGetTimestampUs();
639 640 641 642
  // int32_t code = tsdbReadBlockIdx(pFileReader, aBlockIdx);
  LRUHandle* handle = NULL;
  int32_t    code = tsdbCacheGetBlockIdx(pFileReader->pTsdb->biCache, pFileReader, &handle);
  if (code != TSDB_CODE_SUCCESS || handle == NULL) {
643
    goto _end;
H
Haojun Liao 已提交
644
  }
H
Hongze Cheng 已提交
645

646 647
  SArray* aBlockIdx = (SArray*)taosLRUCacheValue(pFileReader->pTsdb->biCache, handle);
  size_t  num = taosArrayGetSize(aBlockIdx);
648
  if (num == 0) {
649 650
    tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
    // taosArrayDestroy(aBlockIdx);
H
Haojun Liao 已提交
651 652
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
653

654 655 656 657
  int64_t et1 = taosGetTimestampUs();

  SBlockIdx* pBlockIdx = NULL;
  for (int32_t i = 0; i < num; ++i) {
658
    pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);
H
Haojun Liao 已提交
659

660
    // uid check
H
Hongze Cheng 已提交
661
    if (pBlockIdx->suid != pReader->suid) {
H
Haojun Liao 已提交
662 663 664 665
      continue;
    }

    // this block belongs to a table that is not queried.
H
Haojun Liao 已提交
666
    void* p = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(uint64_t));
H
Haojun Liao 已提交
667 668 669 670
    if (p == NULL) {
      continue;
    }

H
Haojun Liao 已提交
671
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)p;
H
Haojun Liao 已提交
672
    if (pScanInfo->pBlockList == NULL) {
673
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(SBlockIndex));
H
Haojun Liao 已提交
674 675
    }

H
Hongze Cheng 已提交
676
    taosArrayPush(pIndexList, pBlockIdx);
H
Haojun Liao 已提交
677
  }
H
Hongze Cheng 已提交
678

679
  int64_t et2 = taosGetTimestampUs();
680
  tsdbDebug("load block index for %d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
681
            (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0, pReader->idStr);
682 683 684

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

H
Haojun Liao 已提交
685
_end:
686 687
  // taosArrayDestroy(aBlockIdx);
  tsdbBICacheRelease(pFileReader->pTsdb->biCache, handle);
H
Haojun Liao 已提交
688 689
  return code;
}
H
Hongze Cheng 已提交
690

691
static void cleanupTableScanInfo(SHashObj* pTableMap) {
692
  STableBlockScanInfo** px = NULL;
dengyihao's avatar
dengyihao 已提交
693
  while (1) {
694
    px = taosHashIterate(pTableMap, px);
695 696 697 698
    if (px == NULL) {
      break;
    }

699
    // reset the index in last block when handing a new file
700 701
    tMapDataClear(&(*px)->mapData);
    taosArrayClear((*px)->pBlockList);
702
  }
703 704
}

705
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, SBlockNumber* pBlockNum) {
706 707 708 709 710 711
  int32_t numOfQTable = 0;
  size_t  sizeInDisk = 0;
  size_t  numOfTables = taosArrayGetSize(pIndexList);

  int64_t st = taosGetTimestampUs();
  cleanupTableScanInfo(pReader->status.pTableMap);
712

dengyihao's avatar
dengyihao 已提交
713
  for (int32_t i = 0; i < numOfTables; ++i) {
H
Haojun Liao 已提交
714
    SBlockIdx* pBlockIdx = taosArrayGet(pIndexList, i);
H
Hongze Cheng 已提交
715

H
Haojun Liao 已提交
716 717
    STableBlockScanInfo* pScanInfo =
        *(STableBlockScanInfo**)taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(int64_t));
H
Hongze Cheng 已提交
718

719
    tMapDataReset(&pScanInfo->mapData);
H
Hongze Cheng 已提交
720
    tsdbReadDataBlk(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
H
Haojun Liao 已提交
721
    taosArrayEnsureCap(pScanInfo->pBlockList, pScanInfo->mapData.nItem);
722

723
    sizeInDisk += pScanInfo->mapData.nData;
H
Haojun Liao 已提交
724

H
Haojun Liao 已提交
725
    SDataBlk block = {0};
726
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
H
Haojun Liao 已提交
727
      tGetDataBlk(pScanInfo->mapData.pData + pScanInfo->mapData.aOffset[j], &block);
H
Hongze Cheng 已提交
728

729
      // 1. time range check
H
Haojun Liao 已提交
730
      if (block.minKey.ts > pReader->window.ekey || block.maxKey.ts < pReader->window.skey) {
H
Haojun Liao 已提交
731 732
        continue;
      }
H
Hongze Cheng 已提交
733

734
      // 2. version range check
H
Haojun Liao 已提交
735
      if (block.minVer > pReader->verRange.maxVer || block.maxVer < pReader->verRange.minVer) {
736 737
        continue;
      }
738

H
Haojun Liao 已提交
739 740
      SBlockIndex bIndex = {.ordinalIndex = j, .inFileOffset = block.aSubBlock->offset};
      bIndex.window = (STimeWindow){.skey = block.minKey.ts, .ekey = block.maxKey.ts};
741

H
Haojun Liao 已提交
742 743
      void* p1 = taosArrayPush(pScanInfo->pBlockList, &bIndex);
      if (p1 == NULL) {
744
        tMapDataClear(&pScanInfo->mapData);
H
Haojun Liao 已提交
745 746
        return TSDB_CODE_OUT_OF_MEMORY;
      }
747

748
      pBlockNum->numOfBlocks += 1;
H
Haojun Liao 已提交
749
    }
H
Hongze Cheng 已提交
750

H
Haojun Liao 已提交
751
    if (pScanInfo->pBlockList != NULL && taosArrayGetSize(pScanInfo->pBlockList) > 0) {
752 753 754 755
      numOfQTable += 1;
    }
  }

H
Hongze Cheng 已提交
756
  pBlockNum->numOfLastFiles = pReader->pFileReader->pSet->nSttF;
757
  int32_t total = pBlockNum->numOfLastFiles + pBlockNum->numOfBlocks;
758

759
  double el = (taosGetTimestampUs() - st) / 1000.0;
760 761 762 763 764
  tsdbDebug(
      "load block of %ld tables completed, blocks:%d in %d tables, last-files:%d, block-info-size:%.2f Kb, elapsed "
      "time:%.2f ms %s",
      numOfTables, pBlockNum->numOfBlocks, numOfQTable, pBlockNum->numOfLastFiles, sizeInDisk / 1000.0, el,
      pReader->idStr);
765

766
  pReader->cost.numOfBlocks += total;
767
  pReader->cost.headFileLoadTime += el;
768

H
Haojun Liao 已提交
769 770
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
771

772
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, int64_t maxKey, int32_t order) {
773
  int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;
774
  pDumpInfo->allDumped = true;
775
  pDumpInfo->lastKey = maxKey + step;
H
Haojun Liao 已提交
776 777
}

778 779
static void doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
                         SBlockLoadSuppInfo* pSup) {
H
Haojun Liao 已提交
780
  if (IS_VAR_DATA_TYPE(pColVal->type)) {
H
Hongze Cheng 已提交
781
    if (!COL_VAL_IS_VALUE(pColVal)) {
H
Haojun Liao 已提交
782 783 784
      colDataAppendNULL(pColInfoData, rowIndex);
    } else {
      varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
H
Haojun Liao 已提交
785
      ASSERT(pColVal->value.nData <= pColInfoData->info.bytes);
786 787 788 789
      if (pColVal->value.nData > 0) {  // pData may be null, if nData is 0
        memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
      }

H
Haojun Liao 已提交
790 791 792
      colDataAppend(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
    }
  } else {
H
Hongze Cheng 已提交
793
    colDataAppend(pColInfoData, rowIndex, (const char*)&pColVal->value, !COL_VAL_IS_VALUE(pColVal));
H
Haojun Liao 已提交
794
  }
H
Haojun Liao 已提交
795 796
}

797
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
H
Haojun Liao 已提交
798 799 800
  size_t num = taosArrayGetSize(pBlockIter->blockList);
  if (num == 0) {
    ASSERT(pBlockIter->numOfBlocks == num);
801 802
    return NULL;
  }
803 804 805

  SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
  return pBlockInfo;
806 807
}

H
Hongze Cheng 已提交
808
static SDataBlk* getCurrentBlock(SDataBlockIter* pBlockIter) { return &pBlockIter->block; }
809

H
Haojun Liao 已提交
810 811 812 813 814 815
static int doBinarySearchKey(TSKEY* keyList, int num, int pos, TSKEY key, int order) {
  // start end position
  int s, e;
  s = pos;

  // check
H
Hongze Cheng 已提交
816
  assert(pos >= 0 && pos < num);
H
Haojun Liao 已提交
817 818 819 820
  assert(num > 0);

  if (order == TSDB_ORDER_ASC) {
    // find the first position which is smaller than the key
H
Hongze Cheng 已提交
821 822
    e = num - 1;
    if (key < keyList[pos]) return -1;
H
Haojun Liao 已提交
823 824
    while (1) {
      // check can return
H
Hongze Cheng 已提交
825 826 827
      if (key >= keyList[e]) return e;
      if (key <= keyList[s]) return s;
      if (e - s <= 1) return s;
H
Haojun Liao 已提交
828 829

      // change start or end position
H
Hongze Cheng 已提交
830
      int mid = s + (e - s + 1) / 2;
H
Haojun Liao 已提交
831 832
      if (keyList[mid] > key)
        e = mid;
H
Hongze Cheng 已提交
833
      else if (keyList[mid] < key)
H
Haojun Liao 已提交
834 835 836 837
        s = mid;
      else
        return mid;
    }
H
Hongze Cheng 已提交
838
  } else {  // DESC
H
Haojun Liao 已提交
839
    // find the first position which is bigger than the key
H
Hongze Cheng 已提交
840 841
    e = 0;
    if (key > keyList[pos]) return -1;
H
Haojun Liao 已提交
842 843
    while (1) {
      // check can return
H
Hongze Cheng 已提交
844 845 846
      if (key <= keyList[e]) return e;
      if (key >= keyList[s]) return s;
      if (s - e <= 1) return s;
H
Haojun Liao 已提交
847 848

      // change start or end position
H
Hongze Cheng 已提交
849
      int mid = s - (s - e + 1) / 2;
H
Haojun Liao 已提交
850 851
      if (keyList[mid] < key)
        e = mid;
H
Hongze Cheng 已提交
852
      else if (keyList[mid] > key)
H
Haojun Liao 已提交
853 854 855 856 857 858 859
        s = mid;
      else
        return mid;
    }
  }
}

H
Haojun Liao 已提交
860
static int32_t getEndPosInDataBlock(STsdbReader* pReader, SBlockData* pBlockData, SDataBlk* pBlock, int32_t pos) {
H
Haojun Liao 已提交
861 862
  // NOTE: reverse the order to find the end position in data block
  int32_t endPos = -1;
H
Hongze Cheng 已提交
863
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
H
Haojun Liao 已提交
864 865 866 867 868 869

  if (asc && pReader->window.ekey >= pBlock->maxKey.ts) {
    endPos = pBlock->nRow - 1;
  } else if (!asc && pReader->window.skey <= pBlock->minKey.ts) {
    endPos = 0;
  } else {
C
Cary Xu 已提交
870 871
    int64_t key = asc ? pReader->window.ekey : pReader->window.skey;
    endPos = doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, key, pReader->order);
H
Haojun Liao 已提交
872 873 874 875 876
  }

  return endPos;
}

H
Haojun Liao 已提交
877
static void copyPrimaryTsCol(const SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
H
Haojun Liao 已提交
878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896
                             int32_t dumpedRows, bool asc) {
  if (asc) {
    memcpy(pColData->pData, &pBlockData->aTSKEY[pDumpInfo->rowIndex], dumpedRows * sizeof(int64_t));
  } else {
    int32_t startIndex = pDumpInfo->rowIndex - dumpedRows + 1;
    memcpy(pColData->pData, &pBlockData->aTSKEY[startIndex], dumpedRows * sizeof(int64_t));

    // todo: opt perf by extract the loop
    // reverse the array list
    int32_t  mid = dumpedRows >> 1u;
    int64_t* pts = (int64_t*)pColData->pData;
    for (int32_t j = 0; j < mid; ++j) {
      int64_t t = pts[j];
      pts[j] = pts[dumpedRows - j - 1];
      pts[dumpedRows - j - 1] = t;
    }
  }
}

H
Haojun Liao 已提交
897 898
// a faster version of copy procedure.
static void copyNumericCols(const SColData* pData, SFileBlockDumpInfo* pDumpInfo, SColumnInfoData* pColData,
899
                            int32_t dumpedRows, bool asc) {
H
Haojun Liao 已提交
900 901 902 903 904 905 906 907
  uint8_t* p = NULL;
  if (asc) {
    p = pData->pData + tDataTypes[pData->type].bytes * pDumpInfo->rowIndex;
  } else {
    int32_t startIndex = pDumpInfo->rowIndex - dumpedRows + 1;
    p = pData->pData + tDataTypes[pData->type].bytes * startIndex;
  }

908
  int32_t step = asc ? 1 : -1;
H
Haojun Liao 已提交
909

H
Haojun Liao 已提交
910
  // make sure it is aligned to 8bit, the allocated memory address is aligned to 256bit
911
  //  ASSERT((((uint64_t)pColData->pData) & (0x8 - 1)) == 0);
H
Haojun Liao 已提交
912 913 914 915 916 917

  // 1. copy data in a batch model
  memcpy(pColData->pData, p, dumpedRows * tDataTypes[pData->type].bytes);

  // 2. reverse the array list in case of descending order scan data block
  if (!asc) {
918
    switch (pColData->info.type) {
H
Haojun Liao 已提交
919 920 921
      case TSDB_DATA_TYPE_TIMESTAMP:
      case TSDB_DATA_TYPE_DOUBLE:
      case TSDB_DATA_TYPE_BIGINT:
922
      case TSDB_DATA_TYPE_UBIGINT: {
H
Haojun Liao 已提交
923 924 925 926 927 928 929 930 931 932 933 934 935
        int32_t  mid = dumpedRows >> 1u;
        int64_t* pts = (int64_t*)pColData->pData;
        for (int32_t j = 0; j < mid; ++j) {
          int64_t t = pts[j];
          pts[j] = pts[dumpedRows - j - 1];
          pts[dumpedRows - j - 1] = t;
        }
        break;
      }

      case TSDB_DATA_TYPE_BOOL:
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_UTINYINT: {
936
        int32_t mid = dumpedRows >> 1u;
H
Haojun Liao 已提交
937 938
        int8_t* pts = (int8_t*)pColData->pData;
        for (int32_t j = 0; j < mid; ++j) {
H
Haojun Liao 已提交
939
          int8_t t = pts[j];
H
Haojun Liao 已提交
940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963
          pts[j] = pts[dumpedRows - j - 1];
          pts[dumpedRows - j - 1] = t;
        }
        break;
      }

      case TSDB_DATA_TYPE_SMALLINT:
      case TSDB_DATA_TYPE_USMALLINT: {
        int32_t  mid = dumpedRows >> 1u;
        int16_t* pts = (int16_t*)pColData->pData;
        for (int32_t j = 0; j < mid; ++j) {
          int64_t t = pts[j];
          pts[j] = pts[dumpedRows - j - 1];
          pts[dumpedRows - j - 1] = t;
        }
        break;
      }

      case TSDB_DATA_TYPE_FLOAT:
      case TSDB_DATA_TYPE_INT:
      case TSDB_DATA_TYPE_UINT: {
        int32_t  mid = dumpedRows >> 1u;
        int32_t* pts = (int32_t*)pColData->pData;
        for (int32_t j = 0; j < mid; ++j) {
H
Haojun Liao 已提交
964
          int32_t t = pts[j];
H
Haojun Liao 已提交
965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986
          pts[j] = pts[dumpedRows - j - 1];
          pts[dumpedRows - j - 1] = t;
        }
        break;
      }
    }
  }

  // 3. if the  null value exists, check items one-by-one
  if (pData->flag != HAS_VALUE) {
    int32_t rowIndex = 0;

    for (int32_t j = pDumpInfo->rowIndex; rowIndex < dumpedRows; j += step, rowIndex++) {
      uint8_t v = tColDataGetBitValue(pData, j);
      if (v == 0 || v == 1) {
        colDataSetNull_f(pColData->nullbitmap, rowIndex);
        pColData->hasNull = true;
      }
    }
  }
}

987
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
988 989 990 991
  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
H
Hongze Cheng 已提交
992

993
  SBlockData*         pBlockData = &pStatus->fileBlockData;
H
Haojun Liao 已提交
994
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
H
Hongze Cheng 已提交
995
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
H
Haojun Liao 已提交
996
  SSDataBlock*        pResBlock = pReader->pResBlock;
H
Haojun Liao 已提交
997
  int32_t             numOfOutputCols = pSupInfo->numOfCols;
H
Haojun Liao 已提交
998

H
Haojun Liao 已提交
999
  SColVal cv = {0};
1000
  int64_t st = taosGetTimestampUs();
1001 1002
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;
1003

1004 1005 1006 1007 1008 1009 1010 1011
  // no data exists, return directly.
  if (pBlockData->nRow == 0 || pBlockData->aTSKEY == 0) {
    tsdbWarn("%p no need to copy since no data in blockData, table uid:%" PRIu64 " has been dropped, %s", pReader, pBlockInfo->uid,
              pReader->idStr);
    pResBlock->info.rows = 0;
    return 0;
  }

1012 1013
  if ((pDumpInfo->rowIndex == 0 && asc) || (pDumpInfo->rowIndex == pBlock->nRow - 1 && (!asc))) {
    if (asc && pReader->window.skey <= pBlock->minKey.ts) {
1014 1015 1016
      // pDumpInfo->rowIndex = 0;
    } else if (!asc && pReader->window.ekey >= pBlock->maxKey.ts) {
      // pDumpInfo->rowIndex = pBlock->nRow - 1;
H
Haojun Liao 已提交
1017
    } else {  // find the appropriate the start position in current block, and set it to be the current rowIndex
1018
      int32_t pos = asc ? pBlock->nRow - 1 : 0;
C
Cary Xu 已提交
1019 1020 1021
      int32_t order = asc ? TSDB_ORDER_DESC : TSDB_ORDER_ASC;
      int64_t key = asc ? pReader->window.skey : pReader->window.ekey;
      pDumpInfo->rowIndex = doBinarySearchKey(pBlockData->aTSKEY, pBlock->nRow, pos, key, order);
H
Haojun Liao 已提交
1022 1023 1024 1025 1026 1027 1028 1029 1030

      if (pDumpInfo->rowIndex < 0) {
        tsdbError(
            "%p failed to locate the start position in current block, global index:%d, table index:%d, brange:%" PRId64
            "-%" PRId64 ", minVer:%" PRId64 ", maxVer:%" PRId64 " %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->minVer,
            pBlock->maxVer, pReader->idStr);
        return TSDB_CODE_INVALID_PARA;
      }
1031
    }
H
Haojun Liao 已提交
1032 1033 1034
  }

  // time window check
1035 1036 1037 1038 1039 1040 1041
  int32_t endIndex = getEndPosInDataBlock(pReader, pBlockData, pBlock, pDumpInfo->rowIndex);
  if (endIndex == -1) {
    setBlockAllDumped(pDumpInfo, pReader->window.ekey, pReader->order);
    return TSDB_CODE_SUCCESS;
  }

  endIndex += step;
H
Haojun Liao 已提交
1042 1043 1044
  int32_t dumpedRows = asc ? (endIndex - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex - endIndex);
  if (dumpedRows > pReader->capacity) {  // output buffer check
    dumpedRows = pReader->capacity;
1045 1046
  }

H
Haojun Liao 已提交
1047
  int32_t i = 0;
H
Haojun Liao 已提交
1048 1049
  int32_t rowIndex = 0;

H
Haojun Liao 已提交
1050 1051
  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
  if (pSupInfo->colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
H
Haojun Liao 已提交
1052
    copyPrimaryTsCol(pBlockData, pDumpInfo, pColData, dumpedRows, asc);
1053 1054 1055
    i += 1;
  }

1056
  int32_t colIndex = 0;
H
Hongze Cheng 已提交
1057
  int32_t num = pBlockData->nColData;
1058
  while (i < numOfOutputCols && colIndex < num) {
1059 1060
    rowIndex = 0;

H
Hongze Cheng 已提交
1061
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);
H
Haojun Liao 已提交
1062
    if (pData->cid < pSupInfo->colId[i]) {
1063
      colIndex += 1;
H
Haojun Liao 已提交
1064 1065
    } else if (pData->cid == pSupInfo->colId[i]) {
      pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
1066

H
Hongze Cheng 已提交
1067
      if (pData->flag == HAS_NONE || pData->flag == HAS_NULL || pData->flag == (HAS_NULL | HAS_NONE)) {
H
Haojun Liao 已提交
1068
        colDataAppendNNULL(pColData, 0, dumpedRows);
H
Haojun Liao 已提交
1069
      } else {
H
Haojun Liao 已提交
1070
        if (IS_MATHABLE_TYPE(pColData->info.type)) {
H
Haojun Liao 已提交
1071 1072
          copyNumericCols(pData, pDumpInfo, pColData, dumpedRows, asc);
        } else {  // varchar/nchar type
H
Haojun Liao 已提交
1073
          for (int32_t j = pDumpInfo->rowIndex; rowIndex < dumpedRows; j += step) {
H
Haojun Liao 已提交
1074 1075 1076 1077
            tColDataGetValue(pData, j, &cv);
            doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
          }
        }
H
Haojun Liao 已提交
1078
      }
H
Haojun Liao 已提交
1079

1080
      colIndex += 1;
1081
      i += 1;
1082
    } else {  // the specified column does not exist in file block, fill with null data
H
Haojun Liao 已提交
1083
      pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
1084
      colDataAppendNNULL(pColData, 0, dumpedRows);
1085
      i += 1;
H
Haojun Liao 已提交
1086
    }
1087 1088
  }

1089
  // fill the mis-matched columns with null value
1090
  while (i < numOfOutputCols) {
H
Haojun Liao 已提交
1091
    pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
1092
    colDataAppendNNULL(pColData, 0, dumpedRows);
1093
    i += 1;
H
Haojun Liao 已提交
1094
  }
H
Haojun Liao 已提交
1095

1096
  pResBlock->info.dataLoad = 1;
H
Haojun Liao 已提交
1097 1098
  pResBlock->info.rows = dumpedRows;
  pDumpInfo->rowIndex += step * dumpedRows;
1099

1100
  // check if current block are all handled
1101
  if (pDumpInfo->rowIndex >= 0 && pDumpInfo->rowIndex < pBlock->nRow) {
1102 1103 1104 1105
    int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
    if (outOfTimeWindow(ts, &pReader->window)) {  // the remain data has out of query time window, ignore current block
      setBlockAllDumped(pDumpInfo, ts, pReader->order);
    }
1106
  } else {
1107 1108
    int64_t ts = asc ? pBlock->maxKey.ts : pBlock->minKey.ts;
    setBlockAllDumped(pDumpInfo, ts, pReader->order);
H
Haojun Liao 已提交
1109
  }
H
Haojun Liao 已提交
1110

1111
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
H
Haojun Liao 已提交
1112
  pReader->cost.blockLoadTime += elapsedTime;
H
Haojun Liao 已提交
1113

1114
  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
H
Haojun Liao 已提交
1115
  tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
1116
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", uid:%" PRIu64 " elapsed time:%.2f ms, %s",
H
Haojun Liao 已提交
1117
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, dumpedRows,
H
Haojun Liao 已提交
1118
            unDumpedRows, pBlock->minVer, pBlock->maxVer, pBlockInfo->uid, elapsedTime, pReader->idStr);
1119 1120 1121 1122

  return TSDB_CODE_SUCCESS;
}

1123 1124
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter, SBlockData* pBlockData,
                                   uint64_t uid) {
1125
  int32_t code = 0;
1126 1127
  int64_t st = taosGetTimestampUs();

1128
  tBlockDataReset(pBlockData);
1129 1130 1131 1132 1133 1134 1135
  STSchema* pSchema = getLatestTableSchema(pReader, uid);
  if (pSchema == NULL) {
    tsdbDebug("%p table uid:%"PRIu64" has been dropped, no data existed, %s", pReader, uid, pReader->idStr);
    return code;
  }

  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
1136
  TABLEID tid = {.suid = pReader->suid, .uid = uid};
1137
  code = tBlockDataInit(pBlockData, &tid, pSchema, &pSup->colId[1], pSup->numOfCols - 1);
1138 1139 1140 1141
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

1142
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
1143
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
1144

H
Hongze Cheng 已提交
1145
  SDataBlk* pBlock = getCurrentBlock(pBlockIter);
1146
  code = tsdbReadDataBlock(pReader->pFileReader, pBlock, pBlockData);
1147 1148
  if (code != TSDB_CODE_SUCCESS) {
    tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
H
Hongze Cheng 已提交
1149
              ", rows:%d, code:%s %s",
1150
              pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
1151 1152 1153
              tstrerror(code), pReader->idStr);
    return code;
  }
1154

1155
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
1156

1157
  tsdbDebug("%p load file block into buffer, global index:%d, index in table block list:%d, brange:%" PRId64 "-%" PRId64
H
Hongze Cheng 已提交
1158
            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
1159 1160
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
            pBlock->minVer, pBlock->maxVer, elapsedTime, pReader->idStr);
1161 1162 1163

  pReader->cost.blockLoadTime += elapsedTime;
  pDumpInfo->allDumped = false;
1164

H
Haojun Liao 已提交
1165
  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
1166
}
H
Hongze Cheng 已提交
1167

H
Haojun Liao 已提交
1168 1169 1170
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);
H
Hongze Cheng 已提交
1171

H
Haojun Liao 已提交
1172 1173 1174 1175
  for (int32_t i = 0; i < pSup->numOfTables; ++i) {
    SBlockOrderWrapper* pBlockInfo = pSup->pDataBlockInfo[i];
    taosMemoryFreeClear(pBlockInfo);
  }
H
Hongze Cheng 已提交
1176

H
Haojun Liao 已提交
1177 1178
  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
1179

H
Haojun Liao 已提交
1180 1181
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
1182 1183
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);
H
Hongze Cheng 已提交
1184

H
Haojun Liao 已提交
1185 1186 1187 1188
  if (pSup->numOfBlocksPerTable == NULL || pSup->indexPerTable == NULL || pSup->pDataBlockInfo == NULL) {
    cleanupBlockOrderSupporter(pSup);
    return TSDB_CODE_OUT_OF_MEMORY;
  }
H
Hongze Cheng 已提交
1189

H
Haojun Liao 已提交
1190 1191
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
1192

H
Haojun Liao 已提交
1193
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
1194
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
1195
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
1196

H
Haojun Liao 已提交
1197
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
1198

H
Haojun Liao 已提交
1199 1200
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
1201

H
Haojun Liao 已提交
1202 1203 1204 1205 1206 1207 1208
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
1209

1210
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
1211
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
1212

1213 1214 1215
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

H
Haojun Liao 已提交
1216
static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter, const char* idStr) {
1217 1218
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  if (pBlockInfo != NULL) {
H
Haojun Liao 已提交
1219
    STableBlockScanInfo** pScanInfo = taosHashGet(pBlockIter->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
H
Haojun Liao 已提交
1220
    if (pScanInfo == NULL) {
1221
      tsdbError("failed to locate the uid:%" PRIu64 " in query table uid list, %s", pBlockInfo->uid, idStr);
H
Haojun Liao 已提交
1222 1223 1224
      return TSDB_CODE_INVALID_PARA;
    }

H
Haojun Liao 已提交
1225 1226
    SBlockIndex* pIndex = taosArrayGet((*pScanInfo)->pBlockList, pBlockInfo->tbBlockIdx);
    tMapDataGetItemByIdx(&(*pScanInfo)->mapData, pIndex->ordinalIndex, &pBlockIter->block, tGetDataBlk);
1227
  }
1228 1229 1230 1231 1232 1233

#if 0
  qDebug("check file block, table uid:%"PRIu64" index:%d offset:%"PRId64", ", pScanInfo->uid, *mapDataIndex, pBlockIter->block.aSubBlock[0].offset);
#endif

  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
1234
}
H
Hongze Cheng 已提交
1235

1236
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks) {
1237
  bool asc = ASCENDING_TRAVERSE(pReader->order);
H
Haojun Liao 已提交
1238

1239
  SBlockOrderSupporter sup = {0};
1240
  pBlockIter->numOfBlocks = numOfBlocks;
1241
  taosArrayClear(pBlockIter->blockList);
1242
  pBlockIter->pTableMap = pReader->status.pTableMap;
1243

1244 1245
  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = (int32_t)taosHashGetSize(pReader->status.pTableMap);
H
Haojun Liao 已提交
1246

1247
  int64_t st = taosGetTimestampUs();
1248
  int32_t code = initBlockOrderSupporter(&sup, numOfTables);
1249 1250 1251
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
H
Haojun Liao 已提交
1252

1253 1254 1255 1256 1257 1258 1259
  int32_t cnt = 0;
  void*   ptr = NULL;
  while (1) {
    ptr = taosHashIterate(pReader->status.pTableMap, ptr);
    if (ptr == NULL) {
      break;
    }
H
Haojun Liao 已提交
1260

1261
    STableBlockScanInfo* pTableScanInfo = *(STableBlockScanInfo**)ptr;
1262 1263 1264
    if (pTableScanInfo->pBlockList == NULL || taosArrayGetSize(pTableScanInfo->pBlockList) == 0) {
      continue;
    }
H
Haojun Liao 已提交
1265

1266 1267
    size_t num = taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = num;
H
Haojun Liao 已提交
1268

1269 1270 1271
    char* buf = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * num);
    if (buf == NULL) {
      cleanupBlockOrderSupporter(&sup);
S
Shengliang Guan 已提交
1272
      return TSDB_CODE_OUT_OF_MEMORY;
1273
    }
H
Haojun Liao 已提交
1274

1275
    sup.pDataBlockInfo[sup.numOfTables] = (SBlockOrderWrapper*)buf;
1276

1277 1278 1279
    for (int32_t k = 0; k < num; ++k) {
      SBlockIndex* pIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
      sup.pDataBlockInfo[sup.numOfTables][k] =
1280
          (SBlockOrderWrapper){.uid = pTableScanInfo->uid, .offset = pIndex->inFileOffset};
1281 1282 1283 1284 1285
      cnt++;
    }

    sup.numOfTables += 1;
  }
H
Haojun Liao 已提交
1286

H
Haojun Liao 已提交
1287 1288 1289 1290
  if (numOfBlocks != cnt && sup.numOfTables != numOfTables) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_INVALID_PARA;
  }
H
Haojun Liao 已提交
1291

1292
  // since there is only one table qualified, blocks are not sorted
1293 1294
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
1295 1296
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      taosArrayPush(pBlockIter->blockList, &blockInfo);
1297
    }
1298

1299
    int64_t et = taosGetTimestampUs();
1300
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
1301
              pReader, numOfBlocks, (et - st) / 1000.0, pReader->idStr);
H
Haojun Liao 已提交
1302

1303
    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
H
Haojun Liao 已提交
1304
    cleanupBlockOrderSupporter(&sup);
H
Haojun Liao 已提交
1305
    doSetCurrentBlock(pBlockIter, pReader->idStr);
1306
    return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
1307
  }
H
Haojun Liao 已提交
1308

1309 1310
  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);
1311

1312
  SMultiwayMergeTreeInfo* pTree = NULL;
H
Haojun Liao 已提交
1313 1314

  uint8_t ret = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
1315 1316
  if (ret != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
S
Shengliang Guan 已提交
1317
    return TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
1318
  }
H
Haojun Liao 已提交
1319

1320 1321 1322 1323
  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;
H
Haojun Liao 已提交
1324

1325 1326
    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    taosArrayPush(pBlockIter->blockList, &blockInfo);
H
Haojun Liao 已提交
1327

1328 1329 1330 1331
    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }
H
Haojun Liao 已提交
1332

1333 1334
    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
H
Haojun Liao 已提交
1335
  }
H
Haojun Liao 已提交
1336

1337
  int64_t et = taosGetTimestampUs();
H
Hongze Cheng 已提交
1338 1339
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, numOfBlocks,
            (et - st) / 1000.0, pReader->idStr);
1340 1341
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);
H
Haojun Liao 已提交
1342

1343
  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
H
Haojun Liao 已提交
1344
  doSetCurrentBlock(pBlockIter, pReader->idStr);
1345

1346
  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
1347
}
H
Hongze Cheng 已提交
1348

H
Haojun Liao 已提交
1349
static bool blockIteratorNext(SDataBlockIter* pBlockIter, const char* idStr) {
1350 1351
  bool asc = ASCENDING_TRAVERSE(pBlockIter->order);

1352
  int32_t step = asc ? 1 : -1;
1353
  if ((pBlockIter->index >= pBlockIter->numOfBlocks - 1 && asc) || (pBlockIter->index <= 0 && (!asc))) {
1354 1355 1356
    return false;
  }

1357
  pBlockIter->index += step;
H
Haojun Liao 已提交
1358
  doSetCurrentBlock(pBlockIter, idStr);
1359

1360 1361 1362
  return true;
}

1363 1364 1365
/**
 * This is an two rectangles overlap cases.
 */
H
Hongze Cheng 已提交
1366
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SDataBlk* pBlock) {
1367 1368
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
H
Hongze Cheng 已提交
1369 1370
         (pVerRange->minVer > pBlock->minVer && pVerRange->minVer <= pBlock->maxVer) ||
         (pVerRange->maxVer < pBlock->maxVer && pVerRange->maxVer >= pBlock->minVer);
H
Haojun Liao 已提交
1371
}
H
Hongze Cheng 已提交
1372

1373
static bool getNeighborBlockOfSameTable(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
1374
                                        int32_t* nextIndex, int32_t order, SBlockIndex* pBlockIndex) {
1375
  bool asc = ASCENDING_TRAVERSE(order);
H
Haojun Liao 已提交
1376
  if (asc && pBlockInfo->tbBlockIdx >= taosArrayGetSize(pTableBlockScanInfo->pBlockList) - 1) {
1377
    return false;
1378 1379
  }

H
Haojun Liao 已提交
1380
  if (!asc && pBlockInfo->tbBlockIdx == 0) {
1381
    return false;
1382 1383
  }

1384
  int32_t step = asc ? 1 : -1;
H
Haojun Liao 已提交
1385
  *nextIndex = pBlockInfo->tbBlockIdx + step;
H
Hongze Cheng 已提交
1386 1387
  *pBlockIndex = *(SBlockIndex*)taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex);
  //  tMapDataGetItemByIdx(&pTableBlockScanInfo->mapData, pIndex->ordinalIndex, pBlock, tGetDataBlk);
1388
  return true;
1389 1390 1391
}

static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
1392
  int32_t step = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;
1393 1394
  int32_t index = pBlockIter->index;

1395
  while (index < pBlockIter->numOfBlocks && index >= 0) {
1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
    SFileDataBlockInfo* pFBlock = taosArrayGet(pBlockIter->blockList, index);
    if (pFBlock->uid == pFBlockInfo->uid && pFBlock->tbBlockIdx == pFBlockInfo->tbBlockIdx) {
      return index;
    }

    index += step;
  }

  return -1;
}

1407
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
1408
  if (index < 0 || index >= pBlockIter->numOfBlocks) {
1409 1410 1411 1412
    return -1;
  }

  SFileDataBlockInfo fblock = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);
1413 1414 1415 1416 1417
  pBlockIter->index += step;

  if (index != pBlockIter->index) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, pBlockIter->index, &fblock);
1418

1419 1420 1421
    SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
    ASSERT(pBlockInfo->uid == fblock.uid && pBlockInfo->tbBlockIdx == fblock.tbBlockIdx);
  }
1422

H
Haojun Liao 已提交
1423
  doSetCurrentBlock(pBlockIter, "");
1424 1425 1426
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
1427
// todo: this attribute could be acquired during extractin the global ordered block list.
1428
static bool overlapWithNeighborBlock(SDataBlk* pBlock, SBlockIndex* pNeighborBlockIndex, int32_t order) {
1429 1430
  // it is the last block in current file, no chance to overlap with neighbor blocks.
  if (ASCENDING_TRAVERSE(order)) {
1431
    return pBlock->maxKey.ts == pNeighborBlockIndex->window.skey;
1432
  } else {
1433
    return pBlock->minKey.ts == pNeighborBlockIndex->window.ekey;
1434
  }
H
Haojun Liao 已提交
1435
}
H
Hongze Cheng 已提交
1436

H
Hongze Cheng 已提交
1437
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SDataBlk* pBlock) {
H
Haojun Liao 已提交
1438
  bool ascScan = ASCENDING_TRAVERSE(order);
H
Hongze Cheng 已提交
1439

1440
  return (ascScan && (key.ts != TSKEY_INITIAL_VAL && key.ts <= pBlock->minKey.ts)) ||
1441
         (!ascScan && (key.ts != TSKEY_INITIAL_VAL && key.ts >= pBlock->maxKey.ts));
H
Haojun Liao 已提交
1442
}
H
Hongze Cheng 已提交
1443

H
Hongze Cheng 已提交
1444
static bool keyOverlapFileBlock(TSDBKEY key, SDataBlk* pBlock, SVersionRange* pVerRange) {
H
Hongze Cheng 已提交
1445 1446
  return (key.ts >= pBlock->minKey.ts && key.ts <= pBlock->maxKey.ts) && (pBlock->maxVer >= pVerRange->minVer) &&
         (pBlock->minVer <= pVerRange->maxVer);
H
Haojun Liao 已提交
1447 1448
}

H
Hongze Cheng 已提交
1449 1450
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock,
                                       int32_t startIndex) {
1451 1452
  size_t num = taosArrayGetSize(pBlockScanInfo->delSkyline);

1453
  for (int32_t i = startIndex; i < num; i += 1) {
1454 1455
    TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, i);
    if (p->ts >= pBlock->minKey.ts && p->ts <= pBlock->maxKey.ts) {
H
Hongze Cheng 已提交
1456
      if (p->version >= pBlock->minVer) {
1457 1458 1459
        return true;
      }
    } else if (p->ts < pBlock->minKey.ts) {  // p->ts < pBlock->minKey.ts
H
Hongze Cheng 已提交
1460
      if (p->version >= pBlock->minVer) {
1461 1462
        if (i < num - 1) {
          TSDBKEY* pnext = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
H
Hongze Cheng 已提交
1463 1464
          if (pnext->ts >= pBlock->minKey.ts) {
            return true;
1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477
          }
        } else {  // it must be the last point
          ASSERT(p->version == 0);
        }
      }
    } else {  // (p->ts > pBlock->maxKey.ts) {
      return false;
    }
  }

  return false;
}

H
Hongze Cheng 已提交
1478
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock, int32_t order) {
1479 1480 1481 1482
  if (pBlockScanInfo->delSkyline == NULL) {
    return false;
  }

1483
  // ts is not overlap
1484
  TSDBKEY* pFirst = taosArrayGet(pBlockScanInfo->delSkyline, 0);
L
Liu Jicong 已提交
1485
  TSDBKEY* pLast = taosArrayGetLast(pBlockScanInfo->delSkyline);
1486 1487 1488 1489 1490
  if (pBlock->minKey.ts > pLast->ts || pBlock->maxKey.ts < pFirst->ts) {
    return false;
  }

  // version is not overlap
1491
  if (ASCENDING_TRAVERSE(order)) {
1492
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, pBlockScanInfo->fileDelIndex);
1493 1494
  } else {
    int32_t index = pBlockScanInfo->fileDelIndex;
1495
    while (1) {
1496 1497 1498 1499 1500
      TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, index);
      if (p->ts > pBlock->minKey.ts && index > 0) {
        index -= 1;
      } else {  // find the first point that is smaller than the minKey.ts of dataBlock.
        break;
1501 1502 1503
      }
    }

1504
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock, index);
1505
  }
1506 1507
}

H
Haojun Liao 已提交
1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520
typedef struct {
  bool overlapWithNeighborBlock;
  bool hasDupTs;
  bool overlapWithDelInfo;
  bool overlapWithLastBlock;
  bool overlapWithKeyInBuf;
  bool partiallyRequired;
  bool moreThanCapcity;
} SDataBlockToLoadInfo;

static void getBlockToLoadInfo(SDataBlockToLoadInfo* pInfo, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                               STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader,
                               STsdbReader* pReader) {
1521 1522
  int32_t     neighborIndex = 0;
  SBlockIndex bIndex = {0};
1523

1524
  bool hasNeighbor = getNeighborBlockOfSameTable(pBlockInfo, pScanInfo, &neighborIndex, pReader->order, &bIndex);
1525

1526
  // overlap with neighbor
1527
  if (hasNeighbor) {
1528
    pInfo->overlapWithNeighborBlock = overlapWithNeighborBlock(pBlock, &bIndex, pReader->order);
1529 1530
  }

1531
  // has duplicated ts of different version in this block
H
Haojun Liao 已提交
1532 1533
  pInfo->hasDupTs = (pBlock->nSubBlock == 1) ? pBlock->hasDup : true;
  pInfo->overlapWithDelInfo = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);
1534

H
Haojun Liao 已提交
1535 1536 1537
  if (hasDataInLastBlock(pLastBlockReader)) {
    int64_t tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
    pInfo->overlapWithLastBlock = !(pBlock->maxKey.ts < tsLast || pBlock->minKey.ts > tsLast);
1538 1539
  }

H
Haojun Liao 已提交
1540 1541 1542 1543
  pInfo->moreThanCapcity = pBlock->nRow > pReader->capacity;
  pInfo->partiallyRequired = dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock);
  pInfo->overlapWithKeyInBuf = keyOverlapFileBlock(keyInBuf, pBlock, &pReader->verRange);
}
1544

H
Haojun Liao 已提交
1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
// 1. the version of all rows should be less than the endVersion
// 2. current block should not overlap with next neighbor block
// 3. current timestamp should not be overlap with each other
// 4. output buffer should be large enough to hold all rows in current block
// 5. delete info should not overlap with current block data
// 6. current block should not contain the duplicated ts
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo info = {0};
  getBlockToLoadInfo(&info, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);

  bool loadDataBlock =
      (info.overlapWithNeighborBlock || info.hasDupTs || info.partiallyRequired || info.overlapWithKeyInBuf ||
       info.moreThanCapcity || info.overlapWithDelInfo || info.overlapWithLastBlock);
1559 1560 1561

  // log the reason why load the datablock for profile
  if (loadDataBlock) {
1562
    tsdbDebug("%p uid:%" PRIu64 " need to load the datablock, overlapneighbor:%d, hasDup:%d, partiallyRequired:%d, "
H
Haojun Liao 已提交
1563
              "overlapWithKey:%d, greaterThanBuf:%d, overlapWithDel:%d, overlapWithlastBlock:%d, %s",
H
Haojun Liao 已提交
1564 1565 1566
              pReader, pBlockInfo->uid, info.overlapWithNeighborBlock, info.hasDupTs, info.partiallyRequired,
              info.overlapWithKeyInBuf, info.moreThanCapcity, info.overlapWithDelInfo, info.overlapWithLastBlock,
              pReader->idStr);
1567 1568 1569
  }

  return loadDataBlock;
H
Haojun Liao 已提交
1570 1571
}

H
Haojun Liao 已提交
1572 1573 1574 1575 1576 1577 1578 1579 1580
static bool isCleanFileDataBlock(STsdbReader* pReader, SFileDataBlockInfo* pBlockInfo, SDataBlk* pBlock,
                                 STableBlockScanInfo* pScanInfo, TSDBKEY keyInBuf, SLastBlockReader* pLastBlockReader) {
  SDataBlockToLoadInfo info = {0};
  getBlockToLoadInfo(&info, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader, pReader);
  bool isCleanFileBlock = !(info.overlapWithNeighborBlock || info.hasDupTs || info.overlapWithKeyInBuf ||
                            info.overlapWithDelInfo || info.overlapWithLastBlock);
  return isCleanFileBlock;
}

1581
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
1582
  if (!(pBlockScanInfo->iiter.hasVal || pBlockScanInfo->iter.hasVal)) {
1583 1584
    return TSDB_CODE_SUCCESS;
  }
H
Haojun Liao 已提交
1585

1586 1587 1588
  SSDataBlock* pBlock = pReader->pResBlock;

  int64_t st = taosGetTimestampUs();
1589
  int32_t code = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->capacity, pReader);
H
Haojun Liao 已提交
1590

H
Haojun Liao 已提交
1591
  blockDataUpdateTsWindow(pBlock, pReader->suppInfo.slotId[0]);
H
Haojun Liao 已提交
1592
  pBlock->info.id.uid = pBlockScanInfo->uid;
1593

1594
  setComposedBlockFlag(pReader, true);
1595

1596
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
S
Shengliang Guan 已提交
1597
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%d, brange:%" PRId64
1598
            " - %" PRId64 ", uid:%"PRIu64",  %s",
1599
            pReader, elapsedTime, pBlock->info.rows, pBlock->info.window.skey, pBlock->info.window.ekey,
1600
            pBlockScanInfo->uid, pReader->idStr);
1601 1602

  pReader->cost.buildmemBlock += elapsedTime;
H
Haojun Liao 已提交
1603 1604 1605
  return code;
}

1606 1607
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, int64_t key,
                                            SFileBlockDumpInfo* pDumpInfo) {
1608 1609 1610
  // opt version
  // 1. it is not a border point
  // 2. the direct next point is not an duplicated timestamp
1611 1612
  bool asc = (pReader->order == TSDB_ORDER_ASC);
  if ((pDumpInfo->rowIndex < pDumpInfo->totalRows - 1 && asc) || (pDumpInfo->rowIndex > 0 && (!asc))) {
1613
    int32_t step = pReader->order == TSDB_ORDER_ASC ? 1 : -1;
1614 1615

    int64_t nextKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
1616
    if (nextKey != key) {  // merge is not needed
1617
      doAppendRowFromFileBlock(pReader->pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
1618 1619 1620 1621 1622 1623 1624 1625
      pDumpInfo->rowIndex += step;
      return true;
    }
  }

  return false;
}

1626
static bool nextRowFromLastBlocks(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo,
1627
                                  SVersionRange* pVerRange) {
H
Haojun Liao 已提交
1628 1629
  int32_t step = ASCENDING_TRAVERSE(pLastBlockReader->order)? 1:-1;

1630 1631 1632 1633 1634 1635 1636 1637
  while (1) {
    bool hasVal = tMergeTreeNext(&pLastBlockReader->mergeTree);
    if (!hasVal) {
      return false;
    }

    TSDBROW row = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
    TSDBKEY k = TSDBROW_KEY(&row);
1638 1639 1640
    if (hasBeenDropped(pScanInfo->delSkyline, &pScanInfo->lastBlockDelIndex, &k, pLastBlockReader->order, pVerRange)) {
      pScanInfo->lastKey = k.ts;
    } else {
H
Haojun Liao 已提交
1641 1642 1643 1644 1645 1646
      // the qualifed ts may equal to k.ts, only a greater version one.
      // here we need to fallback one step.
      if (pScanInfo->lastKey == k.ts) {
        pScanInfo->lastKey -= step;
      }

1647 1648 1649 1650 1651 1652 1653
      return true;
    }
  }
}

static bool tryCopyDistinctRowFromSttBlock(TSDBROW* fRow, SLastBlockReader* pLastBlockReader,
                                           STableBlockScanInfo* pScanInfo, int64_t ts, STsdbReader* pReader) {
1654
  bool hasVal = nextRowFromLastBlocks(pLastBlockReader, pScanInfo, &pReader->verRange);
1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668
  if (hasVal) {
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 != ts) {
      doAppendRowFromFileBlock(pReader->pResBlock, pReader, fRow->pBlockData, fRow->iRow);
      return true;
    }
  } else {
    doAppendRowFromFileBlock(pReader->pResBlock, pReader, fRow->pBlockData, fRow->iRow);
    return true;
  }

  return false;
}

1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681
static FORCE_INLINE STSchema* getLatestTableSchema(STsdbReader* pReader, uint64_t uid) {
  if (pReader->pSchema != NULL) {
    return pReader->pSchema;
  }

  pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, uid, -1, 1);
  if (pReader->pSchema == NULL)  {
    tsdbError("failed to get table schema, uid:%" PRIu64 ", it may have been dropped, ver:-1, %s", uid, pReader->idStr);
  }

  return  pReader->pSchema;
}

H
Haojun Liao 已提交
1682 1683 1684
static FORCE_INLINE STSchema* doGetSchemaForTSRow(int32_t sversion, STsdbReader* pReader, uint64_t uid) {
  // always set the newest schema version in pReader->pSchema
  if (pReader->pSchema == NULL) {
1685
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, uid, -1, 1);
H
Haojun Liao 已提交
1686 1687
  }

1688
  if (pReader->pSchema && sversion == pReader->pSchema->version) {
H
Haojun Liao 已提交
1689 1690 1691 1692 1693 1694
    return pReader->pSchema;
  }

  if (pReader->pMemSchema == NULL) {
    int32_t code =
        metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pMemSchema);
H
Haojun Liao 已提交
1695 1696 1697 1698 1699 1700
    if (code != TSDB_CODE_SUCCESS) {
      terrno = code;
      return NULL;
    } else {
      return pReader->pMemSchema;
    }
H
Haojun Liao 已提交
1701 1702 1703 1704 1705 1706
  }

  if (pReader->pMemSchema->version == sversion) {
    return pReader->pMemSchema;
  }

H
Haojun Liao 已提交
1707
  taosMemoryFreeClear(pReader->pMemSchema);
H
Haojun Liao 已提交
1708
  int32_t code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pMemSchema);
H
Haojun Liao 已提交
1709
  if (code != TSDB_CODE_SUCCESS || pReader->pMemSchema == NULL) {
H
Haojun Liao 已提交
1710 1711
    terrno = code;
    return NULL;
H
Haojun Liao 已提交
1712 1713
  } else {
    return pReader->pMemSchema;
H
Haojun Liao 已提交
1714
  }
H
Haojun Liao 已提交
1715 1716
}

1717
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
1718 1719 1720 1721 1722 1723
                                     SIterInfo* pIter, int64_t key, SLastBlockReader* pLastBlockReader) {
  SRowMerger          merge = {0};
  STSRow*             pTSRow = NULL;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

1724
  int64_t tsLast = INT64_MIN;
1725
  if (hasDataInLastBlock(pLastBlockReader)) {
1726 1727
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
1728

H
Hongze Cheng 已提交
1729 1730
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
1731

1732 1733
  int64_t minKey = 0;
  if (pReader->order == TSDB_ORDER_ASC) {
H
Hongze Cheng 已提交
1734
    minKey = INT64_MAX;  // chosen the minimum value
1735
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
1736 1737
      minKey = tsLast;
    }
1738

1739 1740 1741
    if (minKey > k.ts) {
      minKey = k.ts;
    }
1742

1743
    if (minKey > key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
1744 1745 1746 1747
      minKey = key;
    }
  } else {
    minKey = INT64_MIN;
1748
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
1749 1750 1751 1752 1753 1754 1755
      minKey = tsLast;
    }

    if (minKey < k.ts) {
      minKey = k.ts;
    }

1756
    if (minKey < key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
1757 1758
      minKey = key;
    }
1759 1760 1761 1762
  }

  bool init = false;

1763
  // ASC: file block ---> last block -----> imem -----> mem
H
Hongze Cheng 已提交
1764
  // DESC: mem -----> imem -----> last block -----> file block
1765 1766
  if (pReader->order == TSDB_ORDER_ASC) {
    if (minKey == key) {
1767
      init = true;
H
Haojun Liao 已提交
1768 1769 1770 1771
      int32_t code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
1772
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1773 1774
    }

1775
    if (minKey == tsLast) {
1776
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1777 1778 1779
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1780
        init = true;
H
Haojun Liao 已提交
1781 1782 1783 1784
        int32_t code = tRowMergerInit(&merge, &fRow1, pReader->pSchema);
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
1785
      }
1786
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge, &pReader->verRange);
1787
    }
1788

1789
    if (minKey == k.ts) {
K
kailixu 已提交
1790 1791 1792 1793
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      if (pSchema == NULL) {
        return terrno;
      }
H
Haojun Liao 已提交
1794
      if (init) {
K
kailixu 已提交
1795
        tRowMergerAdd(&merge, pRow, pSchema);
H
Haojun Liao 已提交
1796
      } else {
1797
        init = true;
1798
        int32_t code = tRowMergerInit(&merge, pRow, pSchema);
H
Haojun Liao 已提交
1799 1800 1801 1802 1803 1804 1805
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
      }
      int32_t code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
1806 1807 1808 1809 1810
      }
    }
  } else {
    if (minKey == k.ts) {
      init = true;
1811
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
H
Haojun Liao 已提交
1812 1813 1814 1815 1816 1817
      int32_t   code = tRowMergerInit(&merge, pRow, pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      code = doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1818
      if (code != TSDB_CODE_SUCCESS || merge.pTSchema == NULL) {
H
Haojun Liao 已提交
1819 1820
        return code;
      }
1821 1822
    }

1823
    if (minKey == tsLast) {
1824
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1825 1826 1827
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1828
        init = true;
H
Haojun Liao 已提交
1829 1830 1831 1832
        int32_t code = tRowMergerInit(&merge, &fRow1, pReader->pSchema);
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
1833
      }
1834
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge, &pReader->verRange);
1835 1836 1837
    }

    if (minKey == key) {
H
Haojun Liao 已提交
1838 1839 1840
      if (init) {
        tRowMerge(&merge, &fRow);
      } else {
1841
        init = true;
H
Haojun Liao 已提交
1842 1843 1844 1845
        int32_t code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
1846 1847 1848
      }
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
    }
1849 1850
  }

1851 1852 1853 1854 1855
  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

1856
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo);
1857 1858 1859 1860 1861 1862

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return TSDB_CODE_SUCCESS;
}

1863 1864 1865
static int32_t doMergeFileBlockAndLastBlock(SLastBlockReader* pLastBlockReader, STsdbReader* pReader,
                                            STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                            bool mergeBlockData) {
1866
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
1867
  int64_t             tsLastBlock = getCurrentKeyInLastBlock(pLastBlockReader);
1868 1869 1870

  STSRow*    pTSRow = NULL;
  SRowMerger merge = {0};
1871
  TSDBROW    fRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Hongze Cheng 已提交
1872
  tsdbTrace("fRow ptr:%p, %d, uid:%" PRIu64 ", %s", fRow.pBlockData, fRow.iRow, pLastBlockReader->uid, pReader->idStr);
1873

1874 1875 1876
  // only last block exists
  if ((!mergeBlockData) || (tsLastBlock != pBlockData->aTSKEY[pDumpInfo->rowIndex])) {
    if (tryCopyDistinctRowFromSttBlock(&fRow, pLastBlockReader, pBlockScanInfo, tsLastBlock, pReader)) {
1877
      pBlockScanInfo->lastKey = tsLastBlock;
1878 1879
      return TSDB_CODE_SUCCESS;
    } else {
H
Haojun Liao 已提交
1880 1881 1882 1883
      int32_t code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
1884 1885 1886

      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      tRowMerge(&merge, &fRow1);
1887
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, &merge, &pReader->verRange);
1888

H
Haojun Liao 已提交
1889
      code = tRowMergerGetRow(&merge, &pTSRow);
1890 1891 1892
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
1893

1894
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo);
1895 1896 1897 1898 1899

      taosMemoryFree(pTSRow);
      tRowMergerClear(&merge);
    }
  } else {  // not merge block data
H
Haojun Liao 已提交
1900 1901 1902 1903 1904
    int32_t code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

1905
    doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, &merge, &pReader->verRange);
1906 1907

    // merge with block data if ts == key
H
Haojun Liao 已提交
1908
    if (tsLastBlock == pBlockData->aTSKEY[pDumpInfo->rowIndex]) {
1909 1910 1911
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
    }

H
Haojun Liao 已提交
1912
    code = tRowMergerGetRow(&merge, &pTSRow);
1913 1914 1915 1916
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

1917
    doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo);
1918 1919 1920 1921

    taosMemoryFree(pTSRow);
    tRowMergerClear(&merge);
  }
1922 1923 1924 1925

  return TSDB_CODE_SUCCESS;
}

1926 1927
static int32_t mergeFileBlockAndLastBlock(STsdbReader* pReader, SLastBlockReader* pLastBlockReader, int64_t key,
                                          STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
1928 1929
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

1930
  if (hasDataInFileBlock(pBlockData, pDumpInfo)) {
1931
    // no last block available, only data block exists
1932
    if (!hasDataInLastBlock(pLastBlockReader)) {
1933 1934 1935 1936 1937 1938 1939 1940 1941
      return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
    }

    // row in last file block
    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
    int64_t ts = getCurrentKeyInLastBlock(pLastBlockReader);
    ASSERT(ts >= key);

    if (ASCENDING_TRAVERSE(pReader->order)) {
1942
      if (key < ts) {  // imem, mem are all empty, file blocks (data blocks and last block) exist
1943 1944 1945 1946
        return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
      } else if (key == ts) {
        STSRow*    pTSRow = NULL;
        SRowMerger merge = {0};
1947

H
Haojun Liao 已提交
1948 1949 1950 1951 1952
        int32_t code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }

1953
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1954 1955 1956 1957

        TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
        tRowMerge(&merge, &fRow1);

1958
        doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, ts, &merge, &pReader->verRange);
1959

H
Haojun Liao 已提交
1960
        code = tRowMergerGetRow(&merge, &pTSRow);
1961 1962 1963 1964
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }

1965
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo);
1966

1967 1968
        taosMemoryFree(pTSRow);
        tRowMergerClear(&merge);
1969
        return code;
1970
      } else {
1971
        return TSDB_CODE_SUCCESS;
1972
      }
1973
    } else {  // desc order
1974
      return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, pBlockData, true);
1975
    }
1976
  } else {  // only last block exists
1977
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, NULL, false);
H
Haojun Liao 已提交
1978
  }
1979 1980
}

1981 1982
static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                     SLastBlockReader* pLastBlockReader) {
H
Haojun Liao 已提交
1983 1984 1985
  SRowMerger          merge = {0};
  STSRow*             pTSRow = NULL;
  int32_t             code = TSDB_CODE_SUCCESS;
1986 1987 1988
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SArray*             pDelList = pBlockScanInfo->delSkyline;

1989 1990
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);
1991

1992
  int64_t tsLast = INT64_MIN;
1993 1994 1995
  if (hasDataInLastBlock(pLastBlockReader)) {
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
1996

H
Hongze Cheng 已提交
1997
  int64_t key = hasDataInFileBlock(pBlockData, pDumpInfo) ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN;
1998 1999 2000 2001

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

2002
  int64_t minKey = 0;
2003 2004 2005 2006 2007
  if (ASCENDING_TRAVERSE(pReader->order)) {
    minKey = INT64_MAX;  // let's find the minimum
    if (minKey > k.ts) {
      minKey = k.ts;
    }
2008

2009 2010 2011
    if (minKey > ik.ts) {
      minKey = ik.ts;
    }
2012

2013
    if (minKey > key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2014 2015
      minKey = key;
    }
2016

2017
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
2018 2019 2020
      minKey = tsLast;
    }
  } else {
H
Hongze Cheng 已提交
2021
    minKey = INT64_MIN;  // let find the maximum ts value
2022 2023 2024 2025 2026 2027 2028 2029
    if (minKey < k.ts) {
      minKey = k.ts;
    }

    if (minKey < ik.ts) {
      minKey = ik.ts;
    }

2030
    if (minKey < key && hasDataInFileBlock(pBlockData, pDumpInfo)) {
2031 2032 2033
      minKey = key;
    }

2034
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
2035 2036
      minKey = tsLast;
    }
2037 2038 2039 2040
  }

  bool init = false;

2041 2042 2043 2044
  // ASC: file block -----> last block -----> imem -----> mem
  // DESC: mem -----> imem -----> last block -----> file block
  if (ASCENDING_TRAVERSE(pReader->order)) {
    if (minKey == key) {
2045
      init = true;
2046
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
2047
      code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2048 2049 2050 2051
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

2052
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
2053 2054
    }

2055
    if (minKey == tsLast) {
2056
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2057 2058 2059
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
2060
        init = true;
2061
        code = tRowMergerInit(&merge, &fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2062 2063 2064
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2065
      }
H
Haojun Liao 已提交
2066

2067
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge, &pReader->verRange);
2068 2069 2070
    }

    if (minKey == ik.ts) {
H
Haojun Liao 已提交
2071 2072 2073
      if (init) {
        tRowMerge(&merge, piRow);
      } else {
2074 2075
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
H
Haojun Liao 已提交
2076 2077 2078 2079
        if (pSchema == NULL) {
          return code;
        }

H
Haojun Liao 已提交
2080
        code = tRowMergerInit(&merge, piRow, pSchema);
H
Haojun Liao 已提交
2081 2082 2083
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2084
      }
H
Haojun Liao 已提交
2085

2086 2087
      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge,
                              pReader);
H
Haojun Liao 已提交
2088 2089 2090
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
2091 2092
    }

2093
    if (minKey == k.ts) {
H
Haojun Liao 已提交
2094
      if (init) {
2095 2096 2097 2098
        if (merge.pTSchema == NULL) {
          return code;
        }

H
Haojun Liao 已提交
2099 2100
        tRowMerge(&merge, pRow);
      } else {
2101
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
H
Haojun Liao 已提交
2102
        code = tRowMergerInit(&merge, pRow, pSchema);
H
Haojun Liao 已提交
2103 2104 2105
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2106
      }
H
Haojun Liao 已提交
2107 2108 2109 2110 2111
      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge,
                              pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
2112 2113 2114 2115
    }
  } else {
    if (minKey == k.ts) {
      init = true;
2116
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
H
Haojun Liao 已提交
2117
      code = tRowMergerInit(&merge, pRow, pSchema);
H
Haojun Liao 已提交
2118 2119 2120 2121
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

H
Haojun Liao 已提交
2122 2123 2124 2125 2126
      code = doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge,
                              pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
2127 2128 2129
    }

    if (minKey == ik.ts) {
H
Haojun Liao 已提交
2130 2131 2132
      if (init) {
        tRowMerge(&merge, piRow);
      } else {
2133 2134
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
H
Haojun Liao 已提交
2135
        code = tRowMergerInit(&merge, piRow, pSchema);
H
Haojun Liao 已提交
2136 2137 2138
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2139
      }
H
Haojun Liao 已提交
2140 2141 2142 2143 2144
      code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge,
                              pReader);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
2145 2146 2147
    }

    if (minKey == tsLast) {
2148
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
2149 2150 2151
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
2152
        init = true;
2153
        code = tRowMergerInit(&merge, &fRow1, pReader->pSchema);
H
Haojun Liao 已提交
2154 2155 2156
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
2157
      }
2158
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge, &pReader->verRange);
2159 2160 2161
    }

    if (minKey == key) {
H
Haojun Liao 已提交
2162
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
2163
      if (!init) {
2164
        code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
2165 2166 2167
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }
H
Haojun Liao 已提交
2168
      } else {
2169 2170 2171
        if (merge.pTSchema == NULL) {
          return code;
        }
H
Haojun Liao 已提交
2172
        tRowMerge(&merge, &fRow);
2173 2174
      }
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
2175 2176 2177
    }
  }

2178
  if (merge.pTSchema == NULL) {
2179 2180 2181
    return code;
  }

H
Haojun Liao 已提交
2182
  code = tRowMergerGetRow(&merge, &pTSRow);
2183 2184 2185 2186
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

2187
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo);
2188 2189 2190

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
2191
  return code;
2192 2193
}

2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  TSDBKEY startKey = {0};
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = (TSDBKEY){.ts = pReader->window.skey, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pReader->window.ekey, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!ASCENDING_TRAVERSE(pReader->order));
D
dapan1121 已提交
2209
  int64_t st = 0;
2210 2211 2212 2213 2214 2215 2216 2217 2218

  STbData* d = NULL;
  if (pReader->pReadSnap->pMem != NULL) {
    d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
    if (d != NULL) {
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);

H
Haojun Liao 已提交
2219
        tsdbDebug("%p uid:%" PRIu64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
H
Haojun Liao 已提交
2220
                  "-%" PRId64 " %s",
2221 2222
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
      } else {
H
Haojun Liao 已提交
2223
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
2224 2225 2226 2227 2228
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
H
Haojun Liao 已提交
2229
    tsdbDebug("%p uid:%" PRIu64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
2230 2231 2232 2233 2234 2235 2236 2237 2238 2239
  }

  STbData* di = NULL;
  if (pReader->pReadSnap->pIMem != NULL) {
    di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
    if (di != NULL) {
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);

H
Haojun Liao 已提交
2240
        tsdbDebug("%p uid:%" PRIu64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
H
Haojun Liao 已提交
2241
                  "-%" PRId64 " %s",
2242 2243
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
      } else {
H
Haojun Liao 已提交
2244
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
2245 2246 2247 2248 2249
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
H
Haojun Liao 已提交
2250
    tsdbDebug("%p uid:%" PRIu64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
2251 2252
  }

2253
  st = taosGetTimestampUs();
2254
  initDelSkylineIterator(pBlockScanInfo, pReader, d, di);
2255
  pReader->cost.initDelSkylineIterTime += (taosGetTimestampUs() - st) / 1000.0;
2256 2257 2258 2259 2260

  pBlockScanInfo->iterInit = true;
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
2261 2262
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
2263 2264 2265 2266 2267 2268 2269 2270
  // it is an multi-table data block
  if (pBlockData->aUid != NULL) {
    uint64_t uid = pBlockData->aUid[pDumpInfo->rowIndex];
    if (uid != pBlockScanInfo->uid) {  // move to next row
      return false;
    }
  }

2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281
  // check for version and time range
  int64_t ver = pBlockData->aVersion[pDumpInfo->rowIndex];
  if (ver > pReader->verRange.maxVer || ver < pReader->verRange.minVer) {
    return false;
  }

  int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  if (ts > pReader->window.ekey || ts < pReader->window.skey) {
    return false;
  }

2282
  TSDBKEY k = {.ts = ts, .version = ver};
2283 2284
  if (hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, &k, pReader->order,
                     &pReader->verRange)) {
2285 2286 2287
    return false;
  }

2288 2289 2290
  return true;
}

2291
static bool initLastBlockReader(SLastBlockReader* pLBlockReader, STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
2292
  // the last block reader has been initialized for this table.
2293
  if (pLBlockReader->uid == pScanInfo->uid) {
2294
    return hasDataInLastBlock(pLBlockReader);
2295 2296
  }

2297 2298
  if (pLBlockReader->uid != 0) {
    tMergeTreeClose(&pLBlockReader->mergeTree);
2299 2300
  }

2301 2302
  initMemDataIterator(pScanInfo, pReader);
  pLBlockReader->uid = pScanInfo->uid;
2303

H
Hongze Cheng 已提交
2304
  int32_t     step = ASCENDING_TRAVERSE(pLBlockReader->order) ? 1 : -1;
2305 2306 2307
  STimeWindow w = pLBlockReader->window;
  if (ASCENDING_TRAVERSE(pLBlockReader->order)) {
    w.skey = pScanInfo->lastKey + step;
2308
  } else {
2309
    w.ekey = pScanInfo->lastKey + step;
2310 2311
  }

2312
  tsdbDebug("init last block reader, window:%"PRId64"-%"PRId64", uid:%"PRIu64", %s", w.skey, w.ekey, pScanInfo->uid, pReader->idStr);
2313 2314 2315
  int32_t code = tMergeTreeOpen(&pLBlockReader->mergeTree, (pLBlockReader->order == TSDB_ORDER_DESC),
                                pReader->pFileReader, pReader->suid, pScanInfo->uid, &w, &pLBlockReader->verRange,
                                pLBlockReader->pInfo, false, pReader->idStr);
2316 2317 2318 2319
  if (code != TSDB_CODE_SUCCESS) {
    return false;
  }

2320
  return nextRowFromLastBlocks(pLBlockReader, pScanInfo, &pReader->verRange);
2321 2322
}

2323
static int64_t getCurrentKeyInLastBlock(SLastBlockReader* pLastBlockReader) {
2324
  TSDBROW row = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
2325
  return TSDBROW_TS(&row);
2326 2327
}

H
Hongze Cheng 已提交
2328
static bool hasDataInLastBlock(SLastBlockReader* pLastBlockReader) { return pLastBlockReader->mergeTree.pIter != NULL; }
2329 2330

bool hasDataInFileBlock(const SBlockData* pBlockData, const SFileBlockDumpInfo* pDumpInfo) {
H
Haojun Liao 已提交
2331
  if ((pBlockData->nRow > 0) && (pBlockData->nRow != pDumpInfo->totalRows)) {
2332
    return false;  // this is an invalid result.
2333
  }
2334
  return pBlockData->nRow > 0 && (!pDumpInfo->allDumped);
2335
}
2336

2337 2338
int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                              STsdbReader* pReader) {
2339 2340
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  if (tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo)) {
2341
    pBlockScanInfo->lastKey = key;
2342 2343
    return TSDB_CODE_SUCCESS;
  } else {
2344 2345
    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);

2346 2347 2348
    STSRow*    pTSRow = NULL;
    SRowMerger merge = {0};

H
Haojun Liao 已提交
2349 2350 2351 2352 2353
    int32_t code = tRowMergerInit(&merge, &fRow, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2354
    doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
H
Haojun Liao 已提交
2355
    code = tRowMergerGetRow(&merge, &pTSRow);
2356 2357 2358 2359
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2360
    doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo);
2361 2362 2363 2364 2365 2366 2367

    taosMemoryFree(pTSRow);
    tRowMergerClear(&merge);
    return TSDB_CODE_SUCCESS;
  }
}

H
Haojun Liao 已提交
2368 2369
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo,
                                          SBlockData* pBlockData, SLastBlockReader* pLastBlockReader) {
2370 2371
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

2372
  TSDBROW *pRow = NULL, *piRow = NULL;
2373
  int64_t key = (pBlockData->nRow > 0 && (!pDumpInfo->allDumped)) ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN;
2374 2375 2376
  if (pBlockScanInfo->iter.hasVal) {
    pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  }
2377

2378 2379 2380
  if (pBlockScanInfo->iiter.hasVal) {
    piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
  }
2381

2382 2383 2384 2385
  // two levels of mem-table does contain the valid rows
  if (pRow != NULL && piRow != NULL) {
    return doMergeMultiLevelRows(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
  }
2386

2387 2388 2389 2390
  // imem + file + last block
  if (pBlockScanInfo->iiter.hasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, key, pLastBlockReader);
  }
2391

2392 2393 2394
  // mem + file + last block
  if (pBlockScanInfo->iter.hasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, key, pLastBlockReader);
2395
  }
2396 2397 2398

  // files data blocks + last block
  return mergeFileBlockAndLastBlock(pReader, pLastBlockReader, key, pBlockScanInfo, pBlockData);
2399 2400
}

H
Haojun Liao 已提交
2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440
static int32_t loadNeighborIfOverlap(SFileDataBlockInfo* pBlockInfo, STableBlockScanInfo* pBlockScanInfo,
                                     STsdbReader* pReader, bool* loadNeighbor) {
  int32_t     code = TSDB_CODE_SUCCESS;
  int32_t     step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
  int32_t     nextIndex = -1;
  SBlockIndex nxtBIndex = {0};

  *loadNeighbor = false;
  SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);

  bool hasNeighbor = getNeighborBlockOfSameTable(pBlockInfo, pBlockScanInfo, &nextIndex, pReader->order, &nxtBIndex);
  if (!hasNeighbor) {  // do nothing
    return code;
  }

  if (overlapWithNeighborBlock(pBlock, &nxtBIndex, pReader->order)) {  // load next block
    SReaderStatus*  pStatus = &pReader->status;
    SDataBlockIter* pBlockIter = &pStatus->blockIter;

    // 1. find the next neighbor block in the scan block list
    SFileDataBlockInfo fb = {.uid = pBlockInfo->uid, .tbBlockIdx = nextIndex};
    int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &fb);

    // 2. remove it from the scan block list
    setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);

    // 3. load the neighbor block, and set it to be the currently accessed file data block
    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pBlockInfo->uid);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // 4. check the data values
    initBlockDumpInfo(pReader, pBlockIter);
    *loadNeighbor = true;
  }

  return code;
}

2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453
static void updateComposedBlockInfo(STsdbReader* pReader, double el, STableBlockScanInfo* pBlockScanInfo) {
  SSDataBlock* pResBlock = pReader->pResBlock;

  pResBlock->info.id.uid = (pBlockScanInfo != NULL) ? pBlockScanInfo->uid : 0;
  pResBlock->info.dataLoad = 1;
  blockDataUpdateTsWindow(pResBlock, pReader->suppInfo.slotId[0]);

  setComposedBlockFlag(pReader, true);

  pReader->cost.composedBlocks += 1;
  pReader->cost.buildComposedBlockTime += el;
}

2454
static int32_t buildComposedDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
2455 2456
  int32_t code = TSDB_CODE_SUCCESS;

2457 2458
  SSDataBlock* pResBlock = pReader->pResBlock;

H
Hongze Cheng 已提交
2459
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
H
Haojun Liao 已提交
2460 2461
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;

2462
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
H
Haojun Liao 已提交
2463
  int64_t st = taosGetTimestampUs();
2464
  int32_t step = asc ? 1 : -1;
2465
  double  el = 0;
2466 2467 2468

  STableBlockScanInfo* pBlockScanInfo = NULL;
  if (pBlockInfo != NULL) {
2469 2470
    void* p = taosHashGet(pReader->status.pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
    if (p == NULL) {
H
Haojun Liao 已提交
2471
      code = TSDB_CODE_INVALID_PARA;
2472 2473
      tsdbError("failed to locate the uid:%" PRIu64 " in query table uid list, total tables:%d, %s", pBlockInfo->uid,
                taosHashGetSize(pReader->status.pTableMap), pReader->idStr);
H
Haojun Liao 已提交
2474 2475 2476
      goto _end;
    }

H
Hongze Cheng 已提交
2477
    pBlockScanInfo = *(STableBlockScanInfo**)p;
2478

H
Haojun Liao 已提交
2479
    SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
H
Hongze Cheng 已提交
2480
    TSDBKEY   keyInBuf = getCurrentKeyInBuf(pBlockScanInfo, pReader);
H
Haojun Liao 已提交
2481 2482

    // it is a clean block, load it directly
2483
    if (isCleanFileDataBlock(pReader, pBlockInfo, pBlock, pBlockScanInfo, keyInBuf, pLastBlockReader) &&
2484
        pBlock->nRow <= pReader->capacity) {
2485
      if (asc || ((!asc) && (!hasDataInLastBlock(pLastBlockReader)))) {
2486
        copyBlockDataToSDataBlock(pReader);
2487 2488

        // record the last key value
H
Hongze Cheng 已提交
2489
        pBlockScanInfo->lastKey = asc ? pBlock->maxKey.ts : pBlock->minKey.ts;
H
Haojun Liao 已提交
2490 2491
        goto _end;
      }
H
Haojun Liao 已提交
2492 2493
    }
  } else {  // file blocks not exist
2494
    pBlockScanInfo = *pReader->status.pTableIter;
2495 2496
  }

2497
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
2498
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
2499

2500
  while (1) {
2501
    bool hasBlockData = false;
2502
    {
H
Haojun Liao 已提交
2503
      while (pBlockData->nRow > 0) {  // find the first qualified row in data block
2504 2505 2506 2507 2508
        if (isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
          hasBlockData = true;
          break;
        }

2509 2510
        pDumpInfo->rowIndex += step;

H
Hongze Cheng 已提交
2511
        SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
2512
        if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
H
Haojun Liao 已提交
2513
          pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);  // NOTE: get the new block info
H
Haojun Liao 已提交
2514

H
Haojun Liao 已提交
2515 2516 2517 2518 2519
          // continue check for the next file block if the last ts in the current block
          // is overlapped with the next neighbor block
          bool loadNeighbor = false;
          code = loadNeighborIfOverlap(pBlockInfo, pBlockScanInfo, pReader, &loadNeighbor);
          if ((!loadNeighbor) || (code != 0)) {
2520 2521
            setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
            break;
2522
          }
2523 2524
        }
      }
2525
    }
2526

2527
    // no data in last block and block, no need to proceed.
2528
    if (hasBlockData == false) {
2529
      break;
2530 2531
    }

2532
    buildComposedDataBlockImpl(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
2533

2534
    // currently loaded file data block is consumed
2535
    if ((pBlockData->nRow > 0) && (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0)) {
H
Hongze Cheng 已提交
2536
      SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
2537
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
2538 2539 2540 2541 2542
      break;
    }

    if (pResBlock->info.rows >= pReader->capacity) {
      break;
2543 2544 2545
    }
  }

H
Haojun Liao 已提交
2546
_end:
2547 2548
  el = (taosGetTimestampUs() - st) / 1000.0;
  updateComposedBlockInfo(pReader, el, pBlockScanInfo);
2549

2550 2551
  if (pResBlock->info.rows > 0) {
    tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64
H
Hongze Cheng 已提交
2552
              " rows:%d, elapsed time:%.2f ms %s",
H
Haojun Liao 已提交
2553
              pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
H
Haojun Liao 已提交
2554
              pResBlock->info.rows, el, pReader->idStr);
2555
  }
2556

H
Haojun Liao 已提交
2557
  return code;
2558 2559 2560 2561
}

void setComposedBlockFlag(STsdbReader* pReader, bool composed) { pReader->status.composedDataBlock = composed; }

2562 2563 2564 2565 2566 2567 2568 2569
int32_t getInitialDelIndex(const SArray* pDelSkyline, int32_t order) {
  if (pDelSkyline == NULL) {
    return 0;
  }

  return ASCENDING_TRAVERSE(order) ? 0 : taosArrayGetSize(pDelSkyline) - 1;
}

dengyihao's avatar
dengyihao 已提交
2570 2571
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
2572 2573 2574
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }
2575

2576
  int32_t code = 0;
2577 2578
  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));

H
Hongze Cheng 已提交
2579
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
2580
  if (pDelFile && taosArrayGetSize(pReader->pDelIdx) > 0) {
2581
    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
2582
    SDelIdx* pIdx = taosArraySearch(pReader->pDelIdx, &idx, tCmprDelIdx, TD_EQ);
2583

H
Haojun Liao 已提交
2584
    if (pIdx != NULL) {
2585
      code = tsdbReadDelData(pReader->pDelFReader, pIdx, pDelData);
2586 2587 2588
    }
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
2589
    }
2590
  }
2591

2592 2593 2594 2595 2596 2597 2598
  SDelData* p = NULL;
  if (pMemTbData != NULL) {
    p = pMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
2599 2600
  }

2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614
  if (piMemTbData != NULL) {
    p = piMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
  }

  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);
2615 2616 2617 2618 2619 2620 2621
  int32_t index = getInitialDelIndex(pBlockScanInfo->delSkyline, pReader->order);

  pBlockScanInfo->iter.index = index;
  pBlockScanInfo->iiter.index = index;
  pBlockScanInfo->fileDelIndex = index;
  pBlockScanInfo->lastBlockDelIndex = index;

2622 2623
  return code;

H
Haojun Liao 已提交
2624
_err:
2625 2626
  taosArrayDestroy(pDelData);
  return code;
2627 2628
}

H
Haojun Liao 已提交
2629
TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
2630
  bool asc = ASCENDING_TRAVERSE(pReader->order);
2631
//  TSKEY initialVal = asc? TSKEY_MIN:TSKEY_MAX;
2632

2633
  TSDBKEY  key = {.ts = TSKEY_INITIAL_VAL}, ikey = {.ts = TSKEY_INITIAL_VAL};
2634 2635

  bool hasKey = false, hasIKey = false;
2636
  TSDBROW* pRow = getValidMemRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
2637
  if (pRow != NULL) {
2638
    hasKey = true;
2639 2640 2641
    key = TSDBROW_KEY(pRow);
  }

2642 2643 2644 2645 2646
  TSDBROW* pIRow = getValidMemRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
  if (pIRow != NULL) {
    hasIKey = true;
    ikey = TSDBROW_KEY(pIRow);
  }
2647

2648 2649 2650 2651 2652 2653 2654 2655 2656
  if (hasKey) {
    if (hasIKey) { // has data in mem & imem
      if (asc) {
        return key.ts <= ikey.ts ? key : ikey;
      } else  {
        return key.ts <= ikey.ts ? ikey: key;
      }
    } else {  // no data in imem
      return key;
2657
    }
2658 2659 2660 2661
  } else {
    // no data in mem & imem, return the initial value
    // only imem has data, return ikey
    return ikey;
2662 2663 2664
  }
}

2665
static int32_t moveToNextFile(STsdbReader* pReader, SBlockNumber* pBlockNum) {
H
Haojun Liao 已提交
2666
  SReaderStatus* pStatus = &pReader->status;
2667
  pBlockNum->numOfBlocks = 0;
2668
  pBlockNum->numOfLastFiles = 0;
2669

2670
  size_t  numOfTables = taosHashGetSize(pReader->status.pTableMap);
2671
  SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBlockIdx));
H
Haojun Liao 已提交
2672 2673

  while (1) {
2674
    bool hasNext = filesetIteratorNext(&pStatus->fileIter, pReader);
2675
    if (!hasNext) {  // no data files on disk
H
Haojun Liao 已提交
2676 2677 2678
      break;
    }

H
Haojun Liao 已提交
2679
    taosArrayClear(pIndexList);
H
Haojun Liao 已提交
2680 2681
    int32_t code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
    if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
2682
      taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
2683 2684 2685
      return code;
    }

H
Hongze Cheng 已提交
2686
    if (taosArrayGetSize(pIndexList) > 0 || pReader->pFileReader->pSet->nSttF > 0) {
2687
      code = doLoadFileBlock(pReader, pIndexList, pBlockNum);
H
Haojun Liao 已提交
2688
      if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
2689
        taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
2690 2691 2692
        return code;
      }

2693
      if (pBlockNum->numOfBlocks + pBlockNum->numOfLastFiles > 0) {
H
Haojun Liao 已提交
2694 2695 2696
        break;
      }
    }
2697

H
Haojun Liao 已提交
2698 2699 2700
    // no blocks in current file, try next files
  }

H
Haojun Liao 已提交
2701
  taosArrayDestroy(pIndexList);
2702

H
Haojun Liao 已提交
2703 2704 2705 2706 2707 2708 2709
  if (pReader->pReadSnap != NULL) {
    SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
    if (pReader->pDelFReader == NULL && pDelFile != NULL) {
      int32_t code = tsdbDelFReaderOpen(&pReader->pDelFReader, pDelFile, pReader->pTsdb);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
2710

H
Haojun Liao 已提交
2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721
      pReader->pDelIdx = taosArrayInit(4, sizeof(SDelIdx));
      if (pReader->pDelIdx == NULL) {
        code = TSDB_CODE_OUT_OF_MEMORY;
        return code;
      }

      code = tsdbReadDelIdx(pReader->pDelFReader, pReader->pDelIdx);
      if (code != TSDB_CODE_SUCCESS) {
        taosArrayDestroy(pReader->pDelIdx);
        return code;
      }
2722 2723 2724
    }
  }

H
Haojun Liao 已提交
2725 2726 2727
  return TSDB_CODE_SUCCESS;
}

2728
static int32_t uidComparFunc(const void* p1, const void* p2) {
2729 2730
  uint64_t pu1 = *(uint64_t*)p1;
  uint64_t pu2 = *(uint64_t*)p2;
2731 2732 2733
  if (pu1 == pu2) {
    return 0;
  } else {
2734
    return (pu1 < pu2) ? -1 : 1;
2735 2736
  }
}
2737

2738
static void extractOrderedTableUidList(SUidOrderCheckInfo* pOrderCheckInfo, SReaderStatus* pStatus, int32_t order) {
2739 2740 2741 2742
  int32_t index = 0;
  int32_t total = taosHashGetSize(pStatus->pTableMap);

  void* p = taosHashIterate(pStatus->pTableMap, NULL);
2743
  while (p != NULL) {
H
Hongze Cheng 已提交
2744
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)p;
2745 2746 2747 2748 2749 2750 2751
    pOrderCheckInfo->tableUidList[index++] = pScanInfo->uid;
    p = taosHashIterate(pStatus->pTableMap, p);
  }

  taosSort(pOrderCheckInfo->tableUidList, total, sizeof(uint64_t), uidComparFunc);
}

2752 2753 2754
static int32_t initOrderCheckInfo(SUidOrderCheckInfo* pOrderCheckInfo, STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;

2755 2756 2757 2758
  int32_t total = taosHashGetSize(pStatus->pTableMap);
  if (total == 0) {
    return TSDB_CODE_SUCCESS;
  }
2759

2760
  if (pOrderCheckInfo->tableUidList == NULL) {
2761 2762 2763 2764 2765 2766
    pOrderCheckInfo->currentIndex = 0;
    pOrderCheckInfo->tableUidList = taosMemoryMalloc(total * sizeof(uint64_t));
    if (pOrderCheckInfo->tableUidList == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

2767
    extractOrderedTableUidList(pOrderCheckInfo, pStatus, pReader->order);
2768 2769 2770
    uint64_t uid = pOrderCheckInfo->tableUidList[0];
    pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
  } else {
2771 2772
    if (pStatus->pTableIter == NULL) {  // it is the last block of a new file
      pOrderCheckInfo->currentIndex = 0;
2773 2774
      uint64_t uid = pOrderCheckInfo->tableUidList[pOrderCheckInfo->currentIndex];
      pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
2775 2776

      // the tableMap has already updated
2777
      if (pStatus->pTableIter == NULL) {
2778
        void* p = taosMemoryRealloc(pOrderCheckInfo->tableUidList, total * sizeof(uint64_t));
2779 2780 2781 2782 2783
        if (p == NULL) {
          return TSDB_CODE_OUT_OF_MEMORY;
        }

        pOrderCheckInfo->tableUidList = p;
2784
        extractOrderedTableUidList(pOrderCheckInfo, pStatus, pReader->order);
2785 2786 2787

        uid = pOrderCheckInfo->tableUidList[0];
        pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
2788
      }
2789
    }
2790
  }
2791

2792 2793 2794
  return TSDB_CODE_SUCCESS;
}

2795
static bool moveToNextTable(SUidOrderCheckInfo* pOrderedCheckInfo, SReaderStatus* pStatus) {
2796 2797 2798 2799 2800 2801 2802 2803
  pOrderedCheckInfo->currentIndex += 1;
  if (pOrderedCheckInfo->currentIndex >= taosHashGetSize(pStatus->pTableMap)) {
    pStatus->pTableIter = NULL;
    return false;
  }

  uint64_t uid = pOrderedCheckInfo->tableUidList[pOrderedCheckInfo->currentIndex];
  pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
2804
  return (pStatus->pTableIter != NULL);
2805 2806
}

2807
static int32_t doLoadLastBlockSequentially(STsdbReader* pReader) {
2808
  SReaderStatus*    pStatus = &pReader->status;
2809 2810
  SLastBlockReader* pLastBlockReader = pStatus->fileIter.pLastBlockReader;

2811
  SUidOrderCheckInfo* pOrderedCheckInfo = &pStatus->uidCheckInfo;
2812
  int32_t             code = initOrderCheckInfo(pOrderedCheckInfo, pReader);
2813
  if (code != TSDB_CODE_SUCCESS || (taosHashGetSize(pStatus->pTableMap) == 0)) {
2814 2815
    return code;
  }
2816

2817 2818
  SSDataBlock* pResBlock = pReader->pResBlock;

2819
  while (1) {
2820
    // load the last data block of current table
H
Hongze Cheng 已提交
2821
    STableBlockScanInfo* pScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;
2822 2823

    bool hasVal = initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
2824
    if (!hasVal) {
2825 2826
      bool hasNexTable = moveToNextTable(pOrderedCheckInfo, pStatus);
      if (!hasNexTable) {
2827 2828
        return TSDB_CODE_SUCCESS;
      }
2829

2830
      continue;
2831 2832
    }

2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845
    int64_t st = taosGetTimestampUs();
    while (1) {
      bool hasBlockLData = hasDataInLastBlock(pLastBlockReader);

      // no data in last block and block, no need to proceed.
      if (hasBlockLData == false) {
        break;
      }

      buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
      if (pResBlock->info.rows >= pReader->capacity) {
        break;
      }
2846 2847
    }

2848 2849 2850 2851 2852 2853 2854 2855
    double el = (taosGetTimestampUs() - st) / 1000.0;
    updateComposedBlockInfo(pReader, el, pScanInfo);

    if (pResBlock->info.rows > 0) {
      tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64
                " rows:%d, elapsed time:%.2f ms %s",
                pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                pResBlock->info.rows, el, pReader->idStr);
2856 2857
      return TSDB_CODE_SUCCESS;
    }
2858

2859
    // current table is exhausted, let's try next table
2860 2861
    bool hasNexTable = moveToNextTable(pOrderedCheckInfo, pStatus);
    if (!hasNexTable) {
2862 2863
      return TSDB_CODE_SUCCESS;
    }
2864 2865 2866
  }
}

2867
static int32_t doBuildDataBlock(STsdbReader* pReader) {
H
Hongze Cheng 已提交
2868 2869
  int32_t   code = TSDB_CODE_SUCCESS;
  SDataBlk* pBlock = NULL;
2870 2871 2872

  SReaderStatus*       pStatus = &pReader->status;
  SDataBlockIter*      pBlockIter = &pStatus->blockIter;
2873 2874 2875
  STableBlockScanInfo* pScanInfo = NULL;
  SFileDataBlockInfo*  pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SLastBlockReader*    pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
2876

H
Haojun Liao 已提交
2877 2878
  ASSERT(pBlockInfo != NULL);

2879
  pScanInfo = *(STableBlockScanInfo**)taosHashGet(pReader->status.pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
H
Haojun Liao 已提交
2880
  if (pScanInfo == NULL) {
H
Haojun Liao 已提交
2881
    tsdbError("failed to get table scan-info, %s", pReader->idStr);
H
Haojun Liao 已提交
2882 2883 2884 2885
    code = TSDB_CODE_INVALID_PARA;
    return code;
  }

2886
  pBlock = getCurrentBlock(pBlockIter);
2887

2888
  initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
H
Haojun Liao 已提交
2889
  TSDBKEY keyInBuf = getCurrentKeyInBuf(pScanInfo, pReader);
2890

2891
  if (fileBlockShouldLoad(pReader, pBlockInfo, pBlock, pScanInfo, keyInBuf, pLastBlockReader)) {
2892
    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData, pScanInfo->uid);
2893 2894
    if (code != TSDB_CODE_SUCCESS) {
      return code;
2895 2896 2897
    }

    // build composed data block
2898
    code = buildComposedDataBlock(pReader);
H
Haojun Liao 已提交
2899
  } else if (bufferDataInFileBlockGap(pReader->order, keyInBuf, pBlock)) {
2900
    // data in memory that are earlier than current file block
H
Haojun Liao 已提交
2901
    // rows in buffer should be less than the file block in asc, greater than file block in desc
2902
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? pBlock->minKey.ts : pBlock->maxKey.ts;
2903
    code = buildDataBlockFromBuf(pReader, pScanInfo, endKey);
2904 2905 2906 2907
  } else {
    if (hasDataInLastBlock(pLastBlockReader) && !ASCENDING_TRAVERSE(pReader->order)) {
      // only return the rows in last block
      int64_t tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
H
Hongze Cheng 已提交
2908
      ASSERT(tsLast >= pBlock->maxKey.ts);
2909

2910 2911 2912 2913
      SBlockData* pBData = &pReader->status.fileBlockData;
      tBlockDataReset(pBData);

      SSDataBlock* pResBlock = pReader->pResBlock;
2914
      tsdbDebug("load data in last block firstly, due to desc scan data, %s", pReader->idStr);
2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936

      int64_t st = taosGetTimestampUs();

      while (1) {
        bool hasBlockLData = hasDataInLastBlock(pLastBlockReader);

        // no data in last block and block, no need to proceed.
        if (hasBlockLData == false) {
          break;
        }

        buildComposedDataBlockImpl(pReader, pScanInfo, &pReader->status.fileBlockData, pLastBlockReader);
        if (pResBlock->info.rows >= pReader->capacity) {
          break;
        }
      }

      double el = (taosGetTimestampUs() - st) / 1000.0;
      updateComposedBlockInfo(pReader, el, pScanInfo);

      if (pResBlock->info.rows > 0) {
        tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64
H
Haojun Liao 已提交
2937
                  " rows:%d, elapsed time:%.2f ms %s",
2938 2939 2940
                  pReader, pResBlock->info.id.uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
                  pResBlock->info.rows, el, pReader->idStr);
      }
H
Hongze Cheng 已提交
2941
    } else {  // whole block is required, return it directly
2942 2943
      SDataBlockInfo* pInfo = &pReader->pResBlock->info;
      pInfo->rows = pBlock->nRow;
H
Haojun Liao 已提交
2944
      pInfo->id.uid = pScanInfo->uid;
2945
      pInfo->dataLoad = 0;
2946 2947 2948
      pInfo->window = (STimeWindow){.skey = pBlock->minKey.ts, .ekey = pBlock->maxKey.ts};
      setComposedBlockFlag(pReader, false);
      setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);
2949

2950
      // update the last key for the corresponding table
H
Hongze Cheng 已提交
2951
      pScanInfo->lastKey = ASCENDING_TRAVERSE(pReader->order) ? pInfo->window.ekey : pInfo->window.skey;
2952
      tsdbDebug("%p uid:%" PRIu64 " clean file block retrieved from file, global index:%d, "
H
Haojun Liao 已提交
2953 2954 2955
                "table index:%d, rows:%d, brange:%" PRId64 "-%" PRId64 ", %s",
                pReader, pScanInfo->uid, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->nRow, pBlock->minKey.ts,
                pBlock->maxKey.ts, pReader->idStr);
2956
    }
2957 2958 2959 2960 2961
  }

  return code;
}

H
Haojun Liao 已提交
2962
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
2963 2964
  SReaderStatus* pStatus = &pReader->status;

2965
  while (1) {
2966 2967 2968
    if (pStatus->pTableIter == NULL) {
      pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
      if (pStatus->pTableIter == NULL) {
H
Haojun Liao 已提交
2969
        return TSDB_CODE_SUCCESS;
2970 2971 2972
      }
    }

2973 2974
    STableBlockScanInfo** pBlockScanInfo = pStatus->pTableIter;
    initMemDataIterator(*pBlockScanInfo, pReader);
2975

2976
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? INT64_MAX : INT64_MIN;
2977
    int32_t code = buildDataBlockFromBuf(pReader, *pBlockScanInfo, endKey);
H
Haojun Liao 已提交
2978 2979 2980 2981
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2982
    if (pReader->pResBlock->info.rows > 0) {
H
Haojun Liao 已提交
2983
      return TSDB_CODE_SUCCESS;
2984 2985 2986 2987 2988
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
    if (pStatus->pTableIter == NULL) {
H
Haojun Liao 已提交
2989
      return TSDB_CODE_SUCCESS;
2990 2991 2992 2993
    }
  }
}

2994
// set the correct start position in case of the first/last file block, according to the query time window
2995
void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
H
Hongze Cheng 已提交
2996
  SDataBlk* pBlock = getCurrentBlock(pBlockIter);
2997

2998 2999 3000
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
3001 3002 3003

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
3004
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
3005 3006
}

3007
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
3008 3009
  SBlockNumber num = {0};

3010
  int32_t code = moveToNextFile(pReader, &num);
3011 3012 3013 3014 3015
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // all data files are consumed, try data in buffer
3016
  if (num.numOfBlocks + num.numOfLastFiles == 0) {
3017 3018 3019 3020 3021
    pReader->status.loadFromFile = false;
    return code;
  }

  // initialize the block iterator for a new fileset
3022 3023
  if (num.numOfBlocks > 0) {
    code = initBlockIterator(pReader, pBlockIter, num.numOfBlocks);
H
Hongze Cheng 已提交
3024
  } else {  // no block data, only last block exists
3025
    tBlockDataReset(&pReader->status.fileBlockData);
3026
    resetDataBlockIterator(pBlockIter, pReader->order);
3027
  }
3028 3029

  // set the correct start position according to the query time window
3030
  initBlockDumpInfo(pReader, pBlockIter);
3031 3032 3033
  return code;
}

3034
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
3035 3036
  return (!pDumpInfo->allDumped) &&
         ((pDumpInfo->rowIndex > 0 && asc) || (pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1) && (!asc)));
3037 3038
}

3039
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
H
Haojun Liao 已提交
3040
  int32_t code = TSDB_CODE_SUCCESS;
3041 3042
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

3043 3044
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

3045
  if (pBlockIter->numOfBlocks == 0) {
H
Haojun Liao 已提交
3046
  _begin:
3047 3048 3049 3050 3051
    code = doLoadLastBlockSequentially(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

3052 3053 3054 3055
    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }

3056
    // all data blocks are checked in this last block file, now let's try the next file
3057 3058 3059 3060 3061 3062 3063 3064
    if (pReader->status.pTableIter == NULL) {
      code = initForFirstBlockInFile(pReader, pBlockIter);

      // error happens or all the data files are completely checked
      if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
        return code;
      }

3065
      // this file does not have data files, let's start check the last block file if exists
3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080
      if (pBlockIter->numOfBlocks == 0) {
        goto _begin;
      }
    }

    code = doBuildDataBlock(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }
  }

3081
  while (1) {
3082 3083
    SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

3084
    if (fileBlockPartiallyRead(pDumpInfo, asc)) {  // file data block is partially loaded
3085
      code = buildComposedDataBlock(pReader);
3086 3087 3088 3089
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        // try next data block in current file
H
Haojun Liao 已提交
3090
        bool hasNext = blockIteratorNext(&pReader->status.blockIter, pReader->idStr);
3091 3092
        if (hasNext) {  // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
3093
        } else {
H
Haojun Liao 已提交
3094
          if (pReader->status.pCurrentFileset->nSttF > 0) {
3095 3096 3097 3098 3099 3100
            // data blocks in current file are exhausted, let's try the next file now
            tBlockDataReset(&pReader->status.fileBlockData);
            resetDataBlockIterator(pBlockIter, pReader->order);
            goto _begin;
          } else {
            code = initForFirstBlockInFile(pReader, pBlockIter);
3101

3102 3103 3104 3105
            // error happens or all the data files are completely checked
            if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
              return code;
            }
3106

3107 3108 3109 3110
            // this file does not have blocks, let's start check the last block file
            if (pBlockIter->numOfBlocks == 0) {
              goto _begin;
            }
3111
          }
3112
        }
H
Haojun Liao 已提交
3113
      }
3114 3115

      code = doBuildDataBlock(pReader);
3116 3117
    }

3118 3119 3120 3121 3122 3123 3124 3125
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }
  }
3126
}
H
refact  
Hongze Cheng 已提交
3127

3128 3129
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
3130
  if (VND_IS_RSMA(pVnode)) {
3131
    int8_t  level = 0;
3132 3133
    int8_t  precision = pVnode->config.tsdbCfg.precision;
    int64_t now = taosGetTimestamp(precision);
H
Haojun Liao 已提交
3134
    int64_t offset = tsQueryRsmaTolerance * ((precision == TSDB_TIME_PRECISION_MILLI)   ? 1L
H
Hongze Cheng 已提交
3135 3136
                                             : (precision == TSDB_TIME_PRECISION_MICRO) ? 1000L
                                                                                        : 1000000L);
3137

3138
    for (int8_t i = 0; i < TSDB_RETENTION_MAX; ++i) {
3139 3140 3141 3142 3143 3144 3145
      SRetention* pRetention = retentions + level;
      if (pRetention->keep <= 0) {
        if (level > 0) {
          --level;
        }
        break;
      }
3146
      if ((now - pRetention->keep) <= (winSKey + offset)) {
3147 3148 3149 3150 3151
        break;
      }
      ++level;
    }

3152
    const char* str = (idStr != NULL) ? idStr : "";
3153 3154

    if (level == TSDB_RETENTION_L0) {
3155
      *pLevel = TSDB_RETENTION_L0;
C
Cary Xu 已提交
3156
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L0, str);
3157 3158
      return VND_RSMA0(pVnode);
    } else if (level == TSDB_RETENTION_L1) {
3159
      *pLevel = TSDB_RETENTION_L1;
C
Cary Xu 已提交
3160
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L1, str);
3161 3162
      return VND_RSMA1(pVnode);
    } else {
3163
      *pLevel = TSDB_RETENTION_L2;
C
Cary Xu 已提交
3164
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L2, str);
3165 3166 3167 3168 3169 3170 3171
      return VND_RSMA2(pVnode);
    }
  }

  return VND_TSDB(pVnode);
}

H
Haojun Liao 已提交
3172
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
L
Liu Jicong 已提交
3173
  int64_t startVer = (pCond->startVersion == -1) ? 0 : pCond->startVersion;
H
Haojun Liao 已提交
3174 3175

  int64_t endVer = 0;
L
Liu Jicong 已提交
3176 3177
  if (pCond->endVersion ==
      -1) {  // user not specified end version, set current maximum version of vnode as the endVersion
H
Haojun Liao 已提交
3178 3179
    endVer = pVnode->state.applied;
  } else {
L
Liu Jicong 已提交
3180
    endVer = (pCond->endVersion > pVnode->state.applied) ? pVnode->state.applied : pCond->endVersion;
3181 3182
  }

H
Haojun Liao 已提交
3183
  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
3184 3185
}

3186
bool hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order, SVersionRange* pVerRange) {
3187 3188 3189
  if (pDelList == NULL) {
    return false;
  }
H
Haojun Liao 已提交
3190

L
Liu Jicong 已提交
3191 3192 3193
  size_t  num = taosArrayGetSize(pDelList);
  bool    asc = ASCENDING_TRAVERSE(order);
  int32_t step = asc ? 1 : -1;
3194

3195 3196 3197 3198 3199 3200
  if (asc) {
    if (*index >= num - 1) {
      TSDBKEY* last = taosArrayGetLast(pDelList);
      ASSERT(pKey->ts >= last->ts);

      if (pKey->ts > last->ts) {
3201
        return false;
3202 3203
      } else if (pKey->ts == last->ts) {
        TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
3204 3205
        return (prev->version >= pKey->version && prev->version <= pVerRange->maxVer &&
                prev->version >= pVerRange->minVer);
3206 3207
      }
    } else {
3208 3209 3210 3211 3212 3213 3214
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);

      if (pKey->ts < pCurrent->ts) {
        return false;
      }

3215 3216
      if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version &&
          pVerRange->maxVer >= pCurrent->version) {
3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231
        return true;
      }

      while (pNext->ts <= pKey->ts && (*index) < num - 1) {
        (*index) += 1;

        if ((*index) < num - 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pNext = taosArrayGet(pDelList, (*index) + 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version == 0 && pNext->version > 0) {
            continue;
          }

3232 3233
          if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version &&
              pVerRange->maxVer >= pCurrent->version) {
3234 3235 3236 3237 3238 3239
            return true;
          }
        }
      }

      return false;
3240 3241
    }
  } else {
3242 3243
    if (*index <= 0) {
      TSDBKEY* pFirst = taosArrayGet(pDelList, 0);
3244

3245 3246 3247 3248 3249 3250 3251
      if (pKey->ts < pFirst->ts) {
        return false;
      } else if (pKey->ts == pFirst->ts) {
        return pFirst->version >= pKey->version;
      } else {
        ASSERT(0);
      }
3252
    } else {
3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);

      if (pKey->ts > pCurrent->ts) {
        return false;
      }

      if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
        return true;
      }

      while (pPrev->ts >= pKey->ts && (*index) > 1) {
        (*index) += step;

        if ((*index) >= 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pPrev = taosArrayGet(pDelList, (*index) - 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version > 0 && pPrev->version == 0) {
            continue;
          }

          if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
            return true;
          }
        }
3280 3281 3282 3283 3284
      }

      return false;
    }
  }
3285 3286

  return false;
3287 3288
}

3289
TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
3290
  if (!pIter->hasVal) {
H
Haojun Liao 已提交
3291 3292
    return NULL;
  }
H
Hongze Cheng 已提交
3293

3294
  TSDBROW* pRow = tsdbTbDataIterGet(pIter->iter);
3295
  TSDBKEY  key = {.ts = pRow->pTSRow->ts, .version = pRow->version};
3296
  if (outOfTimeWindow(key.ts, &pReader->window)) {
3297
    pIter->hasVal = false;
H
Haojun Liao 已提交
3298 3299
    return NULL;
  }
H
Hongze Cheng 已提交
3300

3301
  // it is a valid data version
dengyihao's avatar
dengyihao 已提交
3302
  if ((key.version <= pReader->verRange.maxVer && key.version >= pReader->verRange.minVer) &&
3303
      (!hasBeenDropped(pDelList, &pIter->index, &key, pReader->order, &pReader->verRange))) {
H
Haojun Liao 已提交
3304 3305
    return pRow;
  }
H
Hongze Cheng 已提交
3306

3307
  while (1) {
3308 3309
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
H
Haojun Liao 已提交
3310 3311
      return NULL;
    }
H
Hongze Cheng 已提交
3312

3313
    pRow = tsdbTbDataIterGet(pIter->iter);
H
Hongze Cheng 已提交
3314

H
Haojun Liao 已提交
3315
    key = TSDBROW_KEY(pRow);
3316
    if (outOfTimeWindow(key.ts, &pReader->window)) {
3317
      pIter->hasVal = false;
H
Haojun Liao 已提交
3318 3319
      return NULL;
    }
H
Hongze Cheng 已提交
3320

dengyihao's avatar
dengyihao 已提交
3321
    if (key.version <= pReader->verRange.maxVer && key.version >= pReader->verRange.minVer &&
3322
        (!hasBeenDropped(pDelList, &pIter->index, &key, pReader->order, &pReader->verRange))) {
H
Haojun Liao 已提交
3323 3324 3325 3326
      return pRow;
    }
  }
}
H
Hongze Cheng 已提交
3327

3328 3329
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
                         STsdbReader* pReader) {
H
Haojun Liao 已提交
3330
  while (1) {
3331 3332
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
H
Haojun Liao 已提交
3333 3334
      break;
    }
H
Hongze Cheng 已提交
3335

3336
    // data exists but not valid
3337
    TSDBROW* pRow = getValidMemRow(pIter, pDelList, pReader);
3338 3339 3340 3341 3342
    if (pRow == NULL) {
      break;
    }

    // ts is not identical, quit
H
Haojun Liao 已提交
3343
    TSDBKEY k = TSDBROW_KEY(pRow);
3344
    if (k.ts != ts) {
H
Haojun Liao 已提交
3345 3346 3347
      break;
    }

H
Haojun Liao 已提交
3348
    STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, uid);
H
Haojun Liao 已提交
3349 3350 3351 3352
    if (pTSchema == NULL) {
      return terrno;
    }

3353
    tRowMergerAdd(pMerger, pRow, pTSchema);
H
Haojun Liao 已提交
3354 3355 3356 3357 3358
  }

  return TSDB_CODE_SUCCESS;
}

3359
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
3360
                                          SVersionRange* pVerRange, int32_t step) {
3361
  while (rowIndex < pBlockData->nRow && rowIndex >= 0 && pBlockData->aTSKEY[rowIndex] == key) {
3362
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
3363
      rowIndex += step;
3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
    tRowMerge(pMerger, &fRow);
    rowIndex += step;
  }

  return rowIndex;
}

typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,
  CHECK_FILEBLOCK_QUIT = 0x2,
} CHECK_FILEBLOCK_STATE;

H
Hongze Cheng 已提交
3380
static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SDataBlk* pBlock,
3381 3382
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
3383
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
3384
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
3385

3386
  *state = CHECK_FILEBLOCK_QUIT;
3387
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
3388

3389
  bool    loadNeighbor = true;
H
Haojun Liao 已提交
3390
  int32_t code = loadNeighborIfOverlap(pFBlock, pScanInfo, pReader, &loadNeighbor);
3391

H
Haojun Liao 已提交
3392
  if (loadNeighbor && (code == TSDB_CODE_SUCCESS)) {
3393 3394
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
H
Haojun Liao 已提交
3395
    if (pDumpInfo->rowIndex >= pDumpInfo->totalRows) {
3396 3397 3398 3399
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

H
Haojun Liao 已提交
3400
  return code;
3401 3402
}

3403 3404
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                SRowMerger* pMerger) {
3405 3406
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

3407
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
3408
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
3409
  int32_t step = asc ? 1 : -1;
3410

3411
  pDumpInfo->rowIndex += step;
3412
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
3413 3414 3415
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }
3416

3417 3418 3419 3420
  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      CHECK_FILEBLOCK_STATE st;
3421

3422
      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
H
Hongze Cheng 已提交
3423
      SDataBlk*           pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);
3424 3425 3426
      checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (st == CHECK_FILEBLOCK_QUIT) {
        break;
3427
      }
3428
    }
H
Haojun Liao 已提交
3429
  }
3430

H
Haojun Liao 已提交
3431 3432 3433
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
3434
int32_t doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
3435 3436
                               SRowMerger* pMerger, SVersionRange* pVerRange) {
  while (nextRowFromLastBlocks(pLastBlockReader, pScanInfo, pVerRange)) {
3437 3438
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 == ts) {
3439
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
3440 3441 3442 3443 3444 3445 3446 3447 3448
      tRowMerge(pMerger, &fRow1);
    } else {
      break;
    }
  }

  return TSDB_CODE_SUCCESS;
}

3449 3450
int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, STSRow** pTSRow,
                                 STsdbReader* pReader, bool* freeTSRow) {
H
Haojun Liao 已提交
3451
  TSDBROW* pNextRow = NULL;
3452
  TSDBROW  current = *pRow;
3453

3454 3455
  {  // if the timestamp of the next valid row has a different ts, return current row directly
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
3456

3457 3458 3459
    if (!pIter->hasVal) {
      *pTSRow = current.pTSRow;
      *freeTSRow = false;
3460
      return TSDB_CODE_SUCCESS;
3461
    } else {  // has next point in mem/imem
3462
      pNextRow = getValidMemRow(pIter, pDelList, pReader);
3463 3464 3465
      if (pNextRow == NULL) {
        *pTSRow = current.pTSRow;
        *freeTSRow = false;
3466
        return TSDB_CODE_SUCCESS;
3467 3468
      }

H
Haojun Liao 已提交
3469
      if (current.pTSRow->ts != pNextRow->pTSRow->ts) {
3470 3471
        *pTSRow = current.pTSRow;
        *freeTSRow = false;
3472
        return TSDB_CODE_SUCCESS;
3473
      }
3474
    }
3475 3476
  }

3477 3478
  SRowMerger merge = {0};

3479
  // get the correct schema for data in memory
H
Haojun Liao 已提交
3480
  terrno = 0;
H
Haojun Liao 已提交
3481
  STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(&current), pReader, uid);
H
Haojun Liao 已提交
3482 3483 3484
  if (pTSchema == NULL) {
    return terrno;
  }
H
Haojun Liao 已提交
3485

3486 3487
  if (pReader->pSchema == NULL) {
    pReader->pSchema = pTSchema;
3488
  }
H
Haojun Liao 已提交
3489

H
Haojun Liao 已提交
3490 3491 3492 3493
  int32_t code = tRowMergerInit2(&merge, pReader->pSchema, &current, pTSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
H
Haojun Liao 已提交
3494 3495

  STSchema* pTSchema1 = doGetSchemaForTSRow(TSDBROW_SVERSION(pNextRow), pReader, uid);
H
Haojun Liao 已提交
3496
  if (pTSchema1 == NULL) {
H
Haojun Liao 已提交
3497 3498 3499
    return terrno;
  }

H
Haojun Liao 已提交
3500 3501
  tRowMergerAdd(&merge, pNextRow, pTSchema1);

H
Haojun Liao 已提交
3502 3503 3504 3505 3506
  code = doMergeRowsInBuf(pIter, uid, current.pTSRow->ts, pDelList, &merge, pReader);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

H
Haojun Liao 已提交
3507
  code = tRowMergerGetRow(&merge, pTSRow);
3508 3509 3510
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
M
Minglei Jin 已提交
3511

3512
  tRowMergerClear(&merge);
3513
  *freeTSRow = true;
3514
  return TSDB_CODE_SUCCESS;
3515 3516
}

3517
int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
H
Hongze Cheng 已提交
3518
                           STSRow** pTSRow) {
H
Haojun Liao 已提交
3519 3520
  SRowMerger merge = {0};

3521 3522 3523
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

3524
  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
H
Haojun Liao 已提交
3525
    STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
3526

H
Haojun Liao 已提交
3527 3528 3529 3530 3531 3532 3533 3534 3535 3536
    int32_t code = tRowMergerInit(&merge, piRow, pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge,
                            pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
3537

3538
    tRowMerge(&merge, pRow);
H
Haojun Liao 已提交
3539 3540 3541 3542 3543 3544
    code =
        doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

3545
  } else {
H
Haojun Liao 已提交
3546
    STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
3547

H
Haojun Liao 已提交
3548
    int32_t code = tRowMergerInit(&merge, pRow, pSchema);
3549
    if (code != TSDB_CODE_SUCCESS || merge.pTSchema == NULL) {
H
Haojun Liao 已提交
3550 3551 3552 3553 3554 3555 3556 3557
      return code;
    }

    code =
        doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
3558 3559

    tRowMerge(&merge, piRow);
H
Haojun Liao 已提交
3560 3561 3562 3563 3564
    code = doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge,
                            pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
3565
  }
3566

3567
  int32_t code = tRowMergerGetRow(&merge, pTSRow);
H
Haojun Liao 已提交
3568 3569
  tRowMergerClear(&merge);

3570
  return code;
3571 3572
}

3573 3574
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STSRow** pTSRow, int64_t endKey,
                            bool* freeTSRow) {
3575 3576
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
dengyihao's avatar
dengyihao 已提交
3577
  SArray*  pDelList = pBlockScanInfo->delSkyline;
3578
  uint64_t uid = pBlockScanInfo->uid;
H
Haojun Liao 已提交
3579

3580 3581
  // todo refactor
  bool asc = ASCENDING_TRAVERSE(pReader->order);
3582
  if (pBlockScanInfo->iter.hasVal) {
3583 3584 3585 3586 3587 3588
    TSDBKEY k = TSDBROW_KEY(pRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      pRow = NULL;
    }
  }

3589
  if (pBlockScanInfo->iiter.hasVal) {
3590 3591 3592 3593 3594 3595
    TSDBKEY k = TSDBROW_KEY(piRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      piRow = NULL;
    }
  }

3596
  if (pBlockScanInfo->iter.hasVal && pBlockScanInfo->iiter.hasVal && pRow != NULL && piRow != NULL) {
3597
    TSDBKEY k = TSDBROW_KEY(pRow);
3598
    TSDBKEY ik = TSDBROW_KEY(piRow);
H
Haojun Liao 已提交
3599

3600
    int32_t code = TSDB_CODE_SUCCESS;
3601 3602
    if (ik.ts != k.ts) {
      if (((ik.ts < k.ts) && asc) || ((ik.ts > k.ts) && (!asc))) {  // ik.ts < k.ts
3603
        code = doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader, freeTSRow);
3604
      } else if (((k.ts < ik.ts) && asc) || ((k.ts > ik.ts) && (!asc))) {
3605
        code = doMergeMemTableMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader, freeTSRow);
3606
      }
3607
    } else {  // ik.ts == k.ts
3608
      *freeTSRow = true;
3609 3610 3611 3612
      code = doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, pTSRow);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
H
Haojun Liao 已提交
3613
    }
3614

3615
    return code;
H
Haojun Liao 已提交
3616 3617
  }

3618
  if (pBlockScanInfo->iter.hasVal && pRow != NULL) {
H
Hongze Cheng 已提交
3619 3620
    return doMergeMemTableMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader,
                                    freeTSRow);
H
Haojun Liao 已提交
3621 3622
  }

3623
  if (pBlockScanInfo->iiter.hasVal && piRow != NULL) {
3624
    return doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader, freeTSRow);
H
Haojun Liao 已提交
3625 3626 3627 3628 3629
  }

  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
3630 3631
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow,
                             STableBlockScanInfo* pScanInfo) {
H
Haojun Liao 已提交
3632
  int32_t outputRowIndex = pBlock->info.rows;
3633
  int64_t uid = pScanInfo->uid;
3634

3635
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
3636
  STSchema*           pSchema = doGetSchemaForTSRow(pTSRow->sver, pReader, uid);
3637

3638
  SColVal colVal = {0};
3639
  int32_t i = 0, j = 0;
H
Haojun Liao 已提交
3640

3641
  if (pSupInfo->colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
H
Haojun Liao 已提交
3642
    SColumnInfoData* pColData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
3643
    ((int64_t*)pColData->pData)[outputRowIndex] = pTSRow->ts;
3644 3645 3646
    i += 1;
  }

H
Haojun Liao 已提交
3647
  while (i < pSupInfo->numOfCols && j < pSchema->numOfCols) {
H
Haojun Liao 已提交
3648
    col_id_t colId = pSupInfo->colId[i];
3649 3650

    if (colId == pSchema->columns[j].colId) {
H
Haojun Liao 已提交
3651
      SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
3652

H
Haojun Liao 已提交
3653
      tTSRowGetVal(pTSRow, pSchema, j, &colVal);
H
Haojun Liao 已提交
3654
      doCopyColVal(pColInfoData, outputRowIndex, i, &colVal, pSupInfo);
3655 3656 3657
      i += 1;
      j += 1;
    } else if (colId < pSchema->columns[j].colId) {
H
Haojun Liao 已提交
3658
      SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
3659 3660

      colDataAppendNULL(pColInfoData, outputRowIndex);
3661 3662 3663
      i += 1;
    } else if (colId > pSchema->columns[j].colId) {
      j += 1;
3664
    }
3665 3666
  }

3667
  // set null value since current column does not exist in the "pSchema"
H
Haojun Liao 已提交
3668
  while (i < pSupInfo->numOfCols) {
H
Haojun Liao 已提交
3669
    SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
3670
    colDataAppendNULL(pColInfoData, outputRowIndex);
3671 3672 3673
    i += 1;
  }

3674
  pBlock->info.dataLoad = 1;
3675
  pBlock->info.rows += 1;
3676
  pScanInfo->lastKey = pTSRow->ts;
3677 3678 3679
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
3680 3681
int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
                                 int32_t rowIndex) {
3682 3683 3684 3685
  int32_t i = 0, j = 0;
  int32_t outputRowIndex = pResBlock->info.rows;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
3686
  if (pReader->suppInfo.colId[i] == PRIMARYKEY_TIMESTAMP_COL_ID) {
H
Haojun Liao 已提交
3687
    SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
H
Haojun Liao 已提交
3688
    ((int64_t*)pColData->pData)[outputRowIndex] = pBlockData->aTSKEY[rowIndex];
3689
    i += 1;
3690 3691 3692
  }

  SColVal cv = {0};
H
Hongze Cheng 已提交
3693
  int32_t numOfInputCols = pBlockData->nColData;
H
Haojun Liao 已提交
3694
  int32_t numOfOutputCols = pSupInfo->numOfCols;
3695

3696
  while (i < numOfOutputCols && j < numOfInputCols) {
H
Haojun Liao 已提交
3697
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, j);
H
Haojun Liao 已提交
3698
    if (pData->cid < pSupInfo->colId[i]) {
3699 3700 3701 3702
      j += 1;
      continue;
    }

H
Haojun Liao 已提交
3703 3704
    SColumnInfoData* pCol = TARRAY_GET_ELEM(pResBlock->pDataBlock, pSupInfo->slotId[i]);
    if (pData->cid == pSupInfo->colId[i]) {
3705 3706
      tColDataGetValue(pData, rowIndex, &cv);
      doCopyColVal(pCol, outputRowIndex, i, &cv, pSupInfo);
3707
      j += 1;
H
Haojun Liao 已提交
3708 3709
    } else if (pData->cid > pCol->info.colId) {
      // the specified column does not exist in file block, fill with null data
3710 3711 3712 3713 3714 3715 3716
      colDataAppendNULL(pCol, outputRowIndex);
    }

    i += 1;
  }

  while (i < numOfOutputCols) {
H
Haojun Liao 已提交
3717
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, pSupInfo->slotId[i]);
3718
    colDataAppendNULL(pCol, outputRowIndex);
3719 3720 3721
    i += 1;
  }

3722
  pResBlock->info.dataLoad = 1;
3723 3724 3725 3726
  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

3727 3728
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
H
Haojun Liao 已提交
3729 3730 3731 3732
  SSDataBlock* pBlock = pReader->pResBlock;

  do {
    STSRow* pTSRow = NULL;
3733
    bool    freeTSRow = false;
3734
    tsdbGetNextRowInMem(pBlockScanInfo, pReader, &pTSRow, endKey, &freeTSRow);
3735 3736
    if (pTSRow == NULL) {
      break;
H
Haojun Liao 已提交
3737 3738
    }

3739 3740
    doAppendRowFromTSRow(pBlock, pReader, pTSRow, pBlockScanInfo);

3741 3742 3743
    if (freeTSRow) {
      taosMemoryFree(pTSRow);
    }
H
Haojun Liao 已提交
3744 3745

    // no data in buffer, return immediately
3746
    if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
H
Haojun Liao 已提交
3747 3748 3749
      break;
    }

3750
    if (pBlock->info.rows >= capacity) {
H
Haojun Liao 已提交
3751 3752 3753 3754 3755 3756
      break;
    }
  } while (1);

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
3757

3758 3759
// TODO refactor: with createDataBlockScanInfo
int32_t tsdbSetTableList(STsdbReader* pReader, const void* pTableList, int32_t num) {
3760
  int32_t size = taosHashGetSize(pReader->status.pTableMap);
3761

3762
  STableBlockScanInfo** p = NULL;
3763
  while ((p = taosHashIterate(pReader->status.pTableMap, p)) != NULL) {
3764
    clearBlockScanInfo(*p);
3765 3766
  }

3767 3768 3769
  // todo handle the case where size is less than the value of num
  ASSERT(size >= num);

3770 3771
  taosHashClear(pReader->status.pTableMap);

H
Hongze Cheng 已提交
3772 3773
  STableKeyInfo* pList = (STableKeyInfo*)pTableList;
  for (int32_t i = 0; i < num; ++i) {
3774 3775 3776
    STableBlockScanInfo* pInfo = getPosInBlockInfoBuf(&pReader->blockInfoBuf, i);
    pInfo->uid = pList[i].uid;
    taosHashPut(pReader->status.pTableMap, &pInfo->uid, sizeof(uint64_t), &pInfo, POINTER_BYTES);
3777 3778
  }

H
Hongze Cheng 已提交
3779 3780 3781
  return TDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
3782 3783 3784 3785 3786 3787
void* tsdbGetIdx(SMeta* pMeta) {
  if (pMeta == NULL) {
    return NULL;
  }
  return metaGetIdx(pMeta);
}
dengyihao's avatar
dengyihao 已提交
3788

dengyihao's avatar
dengyihao 已提交
3789 3790 3791 3792 3793 3794
void* tsdbGetIvtIdx(SMeta* pMeta) {
  if (pMeta == NULL) {
    return NULL;
  }
  return metaGetIvtIdx(pMeta);
}
L
Liu Jicong 已提交
3795

H
Hongze Cheng 已提交
3796
uint64_t getReaderMaxVersion(STsdbReader* pReader) { return pReader->verRange.maxVer; }
3797

3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812
static int32_t doOpenReaderImpl(STsdbReader* pReader) {
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

  initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(&pReader->status.blockIter, pReader->order);

  // no data in files, let's try buffer in memory
  if (pReader->status.fileIter.numOfFiles == 0) {
    pReader->status.loadFromFile = false;
    return TSDB_CODE_SUCCESS;
  } else {
    return initForFirstBlockInFile(pReader, pBlockIter);
  }
}

H
refact  
Hongze Cheng 已提交
3813
// ====================================== EXPOSED APIs ======================================
3814
int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, void* pTableList, int32_t numOfTables,
H
Haojun Liao 已提交
3815
                       SSDataBlock* pResBlock, STsdbReader** ppReader, const char* idstr) {
3816 3817 3818 3819 3820 3821
  STimeWindow window = pCond->twindows;
  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    pCond->twindows.skey += 1;
    pCond->twindows.ekey -= 1;
  }

3822 3823 3824
  int32_t capacity = pVnode->config.tsdbCfg.maxRows;
  if (pResBlock != NULL) {
    blockDataEnsureCapacity(pResBlock, capacity);
H
Haojun Liao 已提交
3825 3826 3827
  }

  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, capacity, pResBlock, idstr);
3828
  if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
3829 3830
    goto _err;
  }
H
Hongze Cheng 已提交
3831

3832
  // check for query time window
H
Haojun Liao 已提交
3833
  STsdbReader* pReader = *ppReader;
3834
  if (isEmptyQueryTimeWindow(&pReader->window) && pCond->type == TIMEWINDOW_RANGE_CONTAINED) {
H
Haojun Liao 已提交
3835 3836 3837
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
3838

3839 3840
  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
3841
    int32_t order = pCond->order;
3842
    if (order == TSDB_ORDER_ASC) {
3843
      pCond->twindows.ekey = window.skey;
3844 3845 3846
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
3847
      pCond->twindows.skey = window.ekey;
3848 3849 3850 3851
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

3852
    // here we only need one more row, so the capacity is set to be ONE.
H
Haojun Liao 已提交
3853
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[0], 1, pResBlock, idstr);
3854 3855 3856 3857 3858
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (order == TSDB_ORDER_ASC) {
3859
      pCond->twindows.skey = window.ekey;
3860
      pCond->twindows.ekey = INT64_MAX;
3861
    } else {
3862
      pCond->twindows.skey = INT64_MIN;
3863
      pCond->twindows.ekey = window.ekey;
3864
    }
3865 3866
    pCond->order = order;

H
Haojun Liao 已提交
3867
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[1], 1, pResBlock, idstr);
3868 3869 3870 3871 3872
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

H
Haojun Liao 已提交
3873
  // NOTE: the endVersion in pCond is the data version not schema version, so pCond->endVersion is not correct here.
3874 3875
  //  no valid error code set in metaGetTbTSchema, so let's set the error code here.
  //  we should proceed in case of tmq processing.
3876
  if (pCond->suid != 0) {
3877
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, -1, 1);
H
Haojun Liao 已提交
3878
    if (pReader->pSchema == NULL) {
H
Haojun Liao 已提交
3879
      tsdbError("failed to get table schema, suid:%" PRIu64 ", ver:-1, %s", pReader->suid, pReader->idStr);
H
Haojun Liao 已提交
3880
    }
3881 3882
  } else if (numOfTables > 0) {
    STableKeyInfo* pKey = pTableList;
3883
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, -1, 1);
H
Haojun Liao 已提交
3884
    if (pReader->pSchema == NULL) {
H
Haojun Liao 已提交
3885
      tsdbError("failed to get table schema, uid:%" PRIu64 ", ver:-1, %s", pKey->uid, pReader->idStr);
H
Haojun Liao 已提交
3886
    }
3887 3888
  }

3889
  if (pReader->pSchema != NULL) {
H
Haojun Liao 已提交
3890 3891 3892 3893
    code = updateBlockSMAInfo(pReader->pSchema, &pReader->suppInfo);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
3894
  }
3895

3896
  STsdbReader* p = (pReader->innerReader[0] != NULL) ? pReader->innerReader[0] : pReader;
H
Haojun Liao 已提交
3897
  pReader->status.pTableMap = createDataBlockScanInfo(p, &pReader->blockInfoBuf, pTableList, numOfTables);
H
Haojun Liao 已提交
3898 3899
  if (pReader->status.pTableMap == NULL) {
    *ppReader = NULL;
S
Shengliang Guan 已提交
3900
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
3901 3902
    goto _err;
  }
H
Hongze Cheng 已提交
3903

3904
  if (numOfTables > 0) {
H
Haojun Liao 已提交
3905
    code = tsdbTakeReadSnap(pReader->pTsdb, &pReader->pReadSnap, pReader->idStr);
3906 3907 3908
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
H
Hongze Cheng 已提交
3909

3910
    if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
H
Haojun Liao 已提交
3911 3912
      code = doOpenReaderImpl(pReader);
      if (code != TSDB_CODE_SUCCESS) {
K
kailixu 已提交
3913
        goto _err;
3914
      }
3915
    } else {
H
Haojun Liao 已提交
3916 3917
      STsdbReader* pPrevReader = pReader->innerReader[0];
      STsdbReader* pNextReader = pReader->innerReader[1];
3918

H
Haojun Liao 已提交
3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930
      // we need only one row
      pPrevReader->capacity = 1;
      pPrevReader->status.pTableMap = pReader->status.pTableMap;
      pPrevReader->pSchema = pReader->pSchema;
      pPrevReader->pMemSchema = pReader->pMemSchema;
      pPrevReader->pReadSnap = pReader->pReadSnap;

      pNextReader->capacity = 1;
      pNextReader->status.pTableMap = pReader->status.pTableMap;
      pNextReader->pSchema = pReader->pSchema;
      pNextReader->pMemSchema = pReader->pMemSchema;
      pNextReader->pReadSnap = pReader->pReadSnap;
3931

H
Haojun Liao 已提交
3932
      code = doOpenReaderImpl(pPrevReader);
3933
      if (code != TSDB_CODE_SUCCESS) {
K
kailixu 已提交
3934
        goto _err;
3935
      }
3936 3937 3938
    }
  }

3939
  tsdbDebug("%p total numOfTable:%d in this query %s", pReader, numOfTables, pReader->idStr);
H
Hongze Cheng 已提交
3940
  return code;
H
Hongze Cheng 已提交
3941

3942
_err:
H
Haojun Liao 已提交
3943
  tsdbError("failed to create data reader, code:%s %s", tstrerror(code), idstr);
3944
  tsdbReaderClose(pReader);
H
Hongze Cheng 已提交
3945
  return code;
3946
}
H
refact  
Hongze Cheng 已提交
3947 3948

void tsdbReaderClose(STsdbReader* pReader) {
3949 3950
  if (pReader == NULL) {
    return;
3951
  }
H
refact  
Hongze Cheng 已提交
3952

3953
  {
H
Haojun Liao 已提交
3954
    if (pReader->innerReader[0] != NULL || pReader->innerReader[1] != NULL) {
3955
      STsdbReader* p = pReader->innerReader[0];
3956

3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967
      p->status.pTableMap = NULL;
      p->pReadSnap = NULL;
      p->pSchema = NULL;
      p->pMemSchema = NULL;

      p = pReader->innerReader[1];

      p->status.pTableMap = NULL;
      p->pReadSnap = NULL;
      p->pSchema = NULL;
      p->pMemSchema = NULL;
3968 3969 3970 3971 3972 3973

      tsdbReaderClose(pReader->innerReader[0]);
      tsdbReaderClose(pReader->innerReader[1]);
    }
  }

3974
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
H
Hongze Cheng 已提交
3975

3976
  taosArrayDestroy(pSupInfo->pColAgg);
H
Haojun Liao 已提交
3977
  for (int32_t i = 0; i < pSupInfo->numOfCols; ++i) {
3978 3979 3980 3981
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }
3982

H
Haojun Liao 已提交
3983 3984 3985
  if (pReader->freeBlock) {
    pReader->pResBlock = blockDataDestroy(pReader->pResBlock);
  }
3986

H
Haojun Liao 已提交
3987
  taosMemoryFree(pSupInfo->colId);
H
Haojun Liao 已提交
3988
  tBlockDataDestroy(&pReader->status.fileBlockData, true);
3989
  cleanupDataBlockIterator(&pReader->status.blockIter);
3990 3991

  size_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
H
Haojun Liao 已提交
3992 3993 3994 3995 3996
  if (pReader->status.pTableMap != NULL) {
    destroyAllBlockScanInfo(pReader->status.pTableMap);
    clearBlockScanInfoBuf(&pReader->blockInfoBuf);
  }

H
Haojun Liao 已提交
3997 3998 3999
  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }
H
refact  
Hongze Cheng 已提交
4000

4001 4002 4003 4004 4005 4006 4007 4008 4009
  if (pReader->pDelFReader != NULL) {
    tsdbDelFReaderClose(&pReader->pDelFReader);
  }

  if (pReader->pDelIdx != NULL) {
    taosArrayDestroy(pReader->pDelIdx);
    pReader->pDelIdx = NULL;
  }

H
Haojun Liao 已提交
4010
  tsdbUntakeReadSnap(pReader->pTsdb, pReader->pReadSnap, pReader->idStr);
4011

4012
  taosMemoryFree(pReader->status.uidCheckInfo.tableUidList);
H
Haojun Liao 已提交
4013
  SIOCostSummary* pCost = &pReader->cost;
4014

H
Haojun Liao 已提交
4015 4016
  SFilesetIter* pFilesetIter = &pReader->status.fileIter;
  if (pFilesetIter->pLastBlockReader != NULL) {
H
Haojun Liao 已提交
4017 4018
    SLastBlockReader* pLReader = pFilesetIter->pLastBlockReader;
    tMergeTreeClose(&pLReader->mergeTree);
H
Haojun Liao 已提交
4019

H
Haojun Liao 已提交
4020 4021 4022 4023 4024
    getLastBlockLoadInfo(pLReader->pInfo, &pCost->lastBlockLoad, &pCost->lastBlockLoadTime);

    pLReader->pInfo = destroyLastBlockLoadInfo(pLReader->pInfo);
    taosMemoryFree(pLReader);
  }
H
refact  
Hongze Cheng 已提交
4025

4026 4027 4028 4029 4030
  tsdbDebug(
      "%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
      " SMA-time:%.2f ms, fileBlocks:%" PRId64
      ", fileBlocks-load-time:%.2f ms, "
      "build in-memory-block-time:%.2f ms, lastBlocks:%" PRId64 ", lastBlocks-time:%.2f ms, composed-blocks:%" PRId64
H
Haojun Liao 已提交
4031
      ", composed-blocks-time:%.2fms, STableBlockScanInfo size:%.2f Kb, createTime:%.2f ms,initDelSkylineIterTime:%.2f ms, %s",
4032 4033 4034
      pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaDataLoad, pCost->smaLoadTime, pCost->numOfBlocks,
      pCost->blockLoadTime, pCost->buildmemBlock, pCost->lastBlockLoad, pCost->lastBlockLoadTime, pCost->composedBlocks,
      pCost->buildComposedBlockTime, numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pCost->createScanInfoList,
H
Haojun Liao 已提交
4035
      pCost->initDelSkylineIterTime, pReader->idStr);
H
refact  
Hongze Cheng 已提交
4036

4037 4038
  taosMemoryFree(pReader->idStr);
  taosMemoryFree(pReader->pSchema);
4039

4040 4041 4042
  if (pReader->pMemSchema != pReader->pSchema) {
    taosMemoryFree(pReader->pMemSchema);
  }
4043

4044
  taosMemoryFreeClear(pReader);
H
refact  
Hongze Cheng 已提交
4045 4046
}

4047
static bool doTsdbNextDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
4048
  // cleanup the data that belongs to the previous data block
4049 4050
  SSDataBlock* pBlock = pReader->pResBlock;
  blockDataCleanup(pBlock);
H
Hongze Cheng 已提交
4051

4052
  SReaderStatus* pStatus = &pReader->status;
4053
  if (taosHashGetSize(pStatus->pTableMap) == 0) {
4054 4055
    return false;
  }
H
Haojun Liao 已提交
4056

4057 4058 4059 4060 4061
  if (pStatus->loadFromFile) {
    int32_t code = buildBlockFromFiles(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return false;
    }
4062

4063 4064 4065
    if (pBlock->info.rows > 0) {
      return true;
    } else {
H
Haojun Liao 已提交
4066
      buildBlockFromBufferSequentially(pReader);
4067
      return pBlock->info.rows > 0;
H
Haojun Liao 已提交
4068
    }
4069 4070 4071
  } else {  // no data in files, let's try the buffer
    buildBlockFromBufferSequentially(pReader);
    return pBlock->info.rows > 0;
H
Haojun Liao 已提交
4072
  }
H
refact  
Hongze Cheng 已提交
4073 4074
}

4075 4076 4077 4078 4079
bool tsdbNextDataBlock(STsdbReader* pReader) {
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return false;
  }

4080
  if (pReader->innerReader[0] != NULL && pReader->step == 0) {
4081
    bool ret = doTsdbNextDataBlock(pReader->innerReader[0]);
4082
    pReader->step = EXTERNAL_ROWS_PREV;
4083 4084 4085
    if (ret) {
      return ret;
    }
4086
  }
4087

4088
  if (pReader->step == EXTERNAL_ROWS_PREV) {
4089 4090
    // prepare for the main scan
    int32_t code = doOpenReaderImpl(pReader);
4091
    resetAllDataBlockScanInfo(pReader->status.pTableMap, pReader->innerReader[0]->window.ekey);
4092 4093 4094 4095 4096

    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

4097
    pReader->step = EXTERNAL_ROWS_MAIN;
4098 4099 4100 4101 4102 4103 4104
  }

  bool ret = doTsdbNextDataBlock(pReader);
  if (ret) {
    return ret;
  }

4105
  if (pReader->innerReader[1] != NULL && pReader->step == EXTERNAL_ROWS_MAIN) {
4106 4107
    // prepare for the next row scan
    int32_t code = doOpenReaderImpl(pReader->innerReader[1]);
4108
    resetAllDataBlockScanInfo(pReader->innerReader[1]->status.pTableMap, pReader->window.ekey);
4109 4110 4111 4112
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

4113
    bool ret1 = doTsdbNextDataBlock(pReader->innerReader[1]);
4114
    pReader->step = EXTERNAL_ROWS_NEXT;
4115 4116 4117 4118 4119 4120 4121 4122
    if (ret1) {
      return ret1;
    }
  }

  return false;
}

4123
static void doFillNullColSMA(SBlockLoadSuppInfo* pSup, int32_t numOfRows, int32_t numOfCols, SColumnDataAgg* pTsAgg) {
4124 4125
  // do fill all null column value SMA info
  int32_t i = 0, j = 0;
4126
  int32_t size = (int32_t)taosArrayGetSize(pSup->pColAgg);
4127
  taosArrayInsert(pSup->pColAgg, 0, pTsAgg);
4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138

  while (j < numOfCols && i < size) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, i);
    if (pAgg->colId == pSup->colId[j]) {
      i += 1;
      j += 1;
    } else if (pAgg->colId < pSup->colId[j]) {
      i += 1;
    } else if (pSup->colId[j] < pAgg->colId) {
      if (pSup->colId[j] != PRIMARYKEY_TIMESTAMP_COL_ID) {
        SColumnDataAgg nullColAgg = {.colId = pSup->colId[j], .numOfNull = numOfRows};
4139
        taosArrayInsert(pSup->pColAgg, i, &nullColAgg);
4140 4141 4142 4143 4144 4145
      }
      j += 1;
    }
  }
}

4146
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SSDataBlock* pDataBlock, bool* allHave) {
H
Haojun Liao 已提交
4147 4148
  SColumnDataAgg*** pBlockSMA = &pDataBlock->pBlockAgg;

H
Hongze Cheng 已提交
4149
  int32_t code = 0;
4150
  *allHave = false;
H
Haojun Liao 已提交
4151
  *pBlockSMA = NULL;
H
Hongze Cheng 已提交
4152

4153
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
4154 4155 4156
    return TSDB_CODE_SUCCESS;
  }

4157
  // there is no statistics data for composed block
4158
  if (pReader->status.composedDataBlock || (!pReader->suppInfo.smaValid)) {
4159 4160
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
4161

4162
  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
4163 4164
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

H
Haojun Liao 已提交
4165 4166 4167
  if (pReader->pResBlock->info.id.uid != pFBlock->uid) {
    return TSDB_CODE_SUCCESS;
  }
4168 4169

  SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
H
Hongze Cheng 已提交
4170
  if (tDataBlkHasSma(pBlock)) {
H
Hongze Cheng 已提交
4171
    code = tsdbReadBlockSma(pReader->pFileReader, pBlock, pSup->pColAgg);
4172
    if (code != TSDB_CODE_SUCCESS) {
4173 4174
      tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pFBlock->uid, tstrerror(code),
                pReader->idStr);
4175 4176
      return code;
    }
4177
  } else {
H
Haojun Liao 已提交
4178
    *pBlockSMA = NULL;
4179
    return TSDB_CODE_SUCCESS;
4180
  }
H
Hongze Cheng 已提交
4181

4182
  *allHave = true;
H
Hongze Cheng 已提交
4183

4184 4185
  // always load the first primary timestamp column data
  SColumnDataAgg* pTsAgg = &pSup->tsColAgg;
4186

4187 4188
  pTsAgg->numOfNull = 0;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
4189 4190 4191 4192
  pTsAgg->min = pReader->pResBlock->info.window.skey;
  pTsAgg->max = pReader->pResBlock->info.window.ekey;

  // update the number of NULL data rows
4193
  size_t numOfCols = pSup->numOfCols;
4194

4195
  // ensure capacity
H
Haojun Liao 已提交
4196 4197 4198
  if (pDataBlock->pDataBlock) {
    size_t colsNum = taosArrayGetSize(pDataBlock->pDataBlock);
    taosArrayEnsureCap(pSup->pColAgg, colsNum);
4199 4200
  }

4201 4202 4203 4204 4205
  SSDataBlock* pResBlock = pReader->pResBlock;
  if (pResBlock->pBlockAgg == NULL) {
    size_t num = taosArrayGetSize(pResBlock->pDataBlock);
    pResBlock->pBlockAgg = taosMemoryCalloc(num, sizeof(SColumnDataAgg));
  }
4206

4207
  // do fill all null column value SMA info
4208
  doFillNullColSMA(pSup, pBlock->nRow, numOfCols, pTsAgg);
H
Haojun Liao 已提交
4209
  size_t size = taosArrayGetSize(pSup->pColAgg);
4210

H
Haojun Liao 已提交
4211
  int32_t i = 0, j = 0;
4212 4213
  while (j < numOfCols && i < size) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, i);
H
Haojun Liao 已提交
4214 4215
    if (pAgg->colId == pSup->colId[j]) {
      pResBlock->pBlockAgg[pSup->slotId[j]] = pAgg;
4216 4217
      i += 1;
      j += 1;
H
Haojun Liao 已提交
4218
    } else if (pAgg->colId < pSup->colId[j]) {
4219
      i += 1;
H
Haojun Liao 已提交
4220
    } else if (pSup->colId[j] < pAgg->colId) {
H
Haojun Liao 已提交
4221
      // ASSERT(pSup->colId[j] == PRIMARYKEY_TIMESTAMP_COL_ID);
4222
      pResBlock->pBlockAgg[pSup->slotId[j]] = &pSup->tsColAgg;
4223 4224 4225 4226
      j += 1;
    }
  }

H
Haojun Liao 已提交
4227
  *pBlockSMA = pResBlock->pBlockAgg;
4228
  pReader->cost.smaDataLoad += 1;
4229

4230
  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", %s", 0, pFBlock->uid, pReader->idStr);
H
Hongze Cheng 已提交
4231
  return code;
H
Hongze Cheng 已提交
4232 4233
}

H
Haojun Liao 已提交
4234
static SSDataBlock* doRetrieveDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
4235 4236 4237
  SReaderStatus* pStatus = &pReader->status;

  if (pStatus->composedDataBlock) {
H
Haojun Liao 已提交
4238
    return pReader->pResBlock;
4239
  }
4240

H
Haojun Liao 已提交
4241
  SFileDataBlockInfo*  pBlockInfo = getCurrentBlockInfo(&pStatus->blockIter);
H
Hongze Cheng 已提交
4242 4243
  STableBlockScanInfo* pBlockScanInfo =
      *(STableBlockScanInfo**)taosHashGet(pStatus->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
H
Haojun Liao 已提交
4244 4245 4246 4247 4248 4249
  if (pBlockScanInfo == NULL) {
    terrno = TSDB_CODE_INVALID_PARA;
    tsdbError("failed to locate the uid:%" PRIu64 " in query table uid list, total tables:%d, %s", pBlockInfo->uid,
              taosHashGetSize(pReader->status.pTableMap), pReader->idStr);
    return NULL;
  }
4250

4251
  int32_t code = doLoadFileBlockData(pReader, &pStatus->blockIter, &pStatus->fileBlockData, pBlockScanInfo->uid);
4252
  if (code != TSDB_CODE_SUCCESS) {
H
Hongze Cheng 已提交
4253
    tBlockDataDestroy(&pStatus->fileBlockData, 1);
4254 4255
    terrno = code;
    return NULL;
4256
  }
4257

4258
  copyBlockDataToSDataBlock(pReader);
H
Haojun Liao 已提交
4259
  return pReader->pResBlock;
H
Hongze Cheng 已提交
4260 4261
}

H
Haojun Liao 已提交
4262
SSDataBlock* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    if (pReader->step == EXTERNAL_ROWS_PREV) {
      return doRetrieveDataBlock(pReader->innerReader[0]);
    } else if (pReader->step == EXTERNAL_ROWS_NEXT) {
      return doRetrieveDataBlock(pReader->innerReader[1]);
    }
  }

  return doRetrieveDataBlock(pReader);
}

H
Haojun Liao 已提交
4274
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
H
Haojun Liao 已提交
4275
  if (isEmptyQueryTimeWindow(&pReader->window) || pReader->pReadSnap == NULL) {
4276 4277
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
4278

4279 4280
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

L
Liu Jicong 已提交
4281
  pReader->order = pCond->order;
4282
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
4283
  pReader->status.loadFromFile = true;
dengyihao's avatar
dengyihao 已提交
4284
  pReader->status.pTableIter = NULL;
H
Haojun Liao 已提交
4285
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
H
Hongze Cheng 已提交
4286

4287
  // allocate buffer in order to load data blocks from file
4288
  memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));
4289

4290
  pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
4291
  tsdbDataFReaderClose(&pReader->pFileReader);
4292

4293
  int32_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
L
Liu Jicong 已提交
4294

4295
  initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
4296
  resetDataBlockIterator(pBlockIter, pReader->order);
H
Haojun Liao 已提交
4297

H
Hongze Cheng 已提交
4298
  int64_t ts = ASCENDING_TRAVERSE(pReader->order) ? pReader->window.skey - 1 : pReader->window.ekey + 1;
4299
  resetAllDataBlockScanInfo(pReader->status.pTableMap, ts);
4300

4301
  int32_t code = 0;
4302

4303 4304 4305 4306 4307 4308
  // no data in files, let's try buffer in memory
  if (pReader->status.fileIter.numOfFiles == 0) {
    pReader->status.loadFromFile = false;
  } else {
    code = initForFirstBlockInFile(pReader, pBlockIter);
    if (code != TSDB_CODE_SUCCESS) {
4309 4310
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);
4311 4312 4313
      return code;
    }
  }
H
Hongze Cheng 已提交
4314

H
Hongze Cheng 已提交
4315
  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, skey:%" PRId64 ", query range:%" PRId64 " - %" PRId64
H
Hongze Cheng 已提交
4316
            " in query %s",
H
Hongze Cheng 已提交
4317 4318
            pReader, pReader->suid, numOfTables, pCond->twindows.skey, pReader->window.skey, pReader->window.ekey,
            pReader->idStr);
4319

4320
  return code;
H
Hongze Cheng 已提交
4321
}
H
Hongze Cheng 已提交
4322

4323 4324 4325
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows) {
  return (numOfRows - startRow) / bucketRange;
}
H
Hongze Cheng 已提交
4326

4327 4328 4329 4330
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  int32_t code = TSDB_CODE_SUCCESS;
  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;
H
Hongze Cheng 已提交
4331

4332 4333
  // find the start data block in file
  SReaderStatus* pStatus = &pReader->status;
H
Hongze Cheng 已提交
4334

4335 4336 4337
  STsdbCfg* pc = &pReader->pTsdb->pVnode->config.tsdbCfg;
  pTableBlockInfo->defMinRows = pc->minRows;
  pTableBlockInfo->defMaxRows = pc->maxRows;
H
Hongze Cheng 已提交
4338

4339
  int32_t bucketRange = ceil((pc->maxRows - pc->minRows) / 20.0);
H
Hongze Cheng 已提交
4340

4341
  pTableBlockInfo->numOfFiles += 1;
H
Hongze Cheng 已提交
4342

4343 4344
  int32_t numOfTables = (int32_t)taosHashGetSize(pStatus->pTableMap);
  int     defaultRows = 4096;
H
Hongze Cheng 已提交
4345

4346 4347
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;
H
Haojun Liao 已提交
4348

4349 4350
  if (pBlockIter->numOfBlocks > 0) {
    pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
H
Haojun Liao 已提交
4351
  }
H
Hongze Cheng 已提交
4352

4353
  pTableBlockInfo->numOfTables = numOfTables;
4354
  bool hasNext = (pBlockIter->numOfBlocks > 0);
H
Hongze Cheng 已提交
4355

4356 4357
  while (true) {
    if (hasNext) {
H
Hongze Cheng 已提交
4358
      SDataBlk* pBlock = getCurrentBlock(pBlockIter);
H
Hongze Cheng 已提交
4359

4360 4361
      int32_t numOfRows = pBlock->nRow;
      pTableBlockInfo->totalRows += numOfRows;
H
Hongze Cheng 已提交
4362

4363 4364 4365
      if (numOfRows > pTableBlockInfo->maxRows) {
        pTableBlockInfo->maxRows = numOfRows;
      }
H
refact  
Hongze Cheng 已提交
4366

4367 4368 4369
      if (numOfRows < pTableBlockInfo->minRows) {
        pTableBlockInfo->minRows = numOfRows;
      }
H
refact  
Hongze Cheng 已提交
4370

4371 4372 4373
      if (numOfRows < defaultRows) {
        pTableBlockInfo->numOfSmallBlocks += 1;
      }
H
refact  
Hongze Cheng 已提交
4374

4375
      pTableBlockInfo->totalSize += pBlock->aSubBlock[0].szBlock;
4376

4377 4378
      int32_t bucketIndex = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, numOfRows);
      pTableBlockInfo->blockRowsHisto[bucketIndex]++;
4379

H
Haojun Liao 已提交
4380
      hasNext = blockIteratorNext(&pStatus->blockIter, pReader->idStr);
4381 4382 4383 4384 4385
    } else {
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
        break;
      }
H
refact  
Hongze Cheng 已提交
4386

4387 4388
      pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
      hasNext = (pBlockIter->numOfBlocks > 0);
4389
    }
H
refact  
Hongze Cheng 已提交
4390

H
Hongze Cheng 已提交
4391 4392
    //    tsdbDebug("%p %d blocks found in file for %d table(s), fid:%d, %s", pReader, numOfBlocks, numOfTables,
    //              pReader->pFileGroup->fid, pReader->idStr);
4393
  }
H
Hongze Cheng 已提交
4394

H
refact  
Hongze Cheng 已提交
4395 4396
  return code;
}
H
Hongze Cheng 已提交
4397

H
refact  
Hongze Cheng 已提交
4398
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
4399
  int64_t rows = 0;
H
Hongze Cheng 已提交
4400

4401 4402
  SReaderStatus* pStatus = &pReader->status;
  pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
H
Hongze Cheng 已提交
4403

4404
  while (pStatus->pTableIter != NULL) {
4405
    STableBlockScanInfo* pBlockScanInfo = *(STableBlockScanInfo**)pStatus->pTableIter;
4406 4407

    STbData* d = NULL;
4408
    if (pReader->pReadSnap->pMem != NULL) {
H
Hongze Cheng 已提交
4409
      d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
4410 4411 4412 4413 4414 4415
      if (d != NULL) {
        rows += tsdbGetNRowsInTbData(d);
      }
    }

    STbData* di = NULL;
4416
    if (pReader->pReadSnap->pIMem != NULL) {
H
Hongze Cheng 已提交
4417
      di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
4418 4419 4420 4421 4422 4423 4424 4425
      if (di != NULL) {
        rows += tsdbGetNRowsInTbData(di);
      }
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
  }
H
Hongze Cheng 已提交
4426

H
refact  
Hongze Cheng 已提交
4427
  return rows;
H
Hongze Cheng 已提交
4428
}
D
dapan1121 已提交
4429

L
Liu Jicong 已提交
4430
int32_t tsdbGetTableSchema(SVnode* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
D
dapan1121 已提交
4431 4432 4433 4434
  int32_t sversion = 1;

  SMetaReader mr = {0};
  metaReaderInit(&mr, pVnode->pMeta, 0);
H
Haojun Liao 已提交
4435
  int32_t code = metaGetTableEntryByUidCache(&mr, uid);
D
dapan1121 已提交
4436 4437 4438 4439 4440 4441 4442
  if (code != TSDB_CODE_SUCCESS) {
    terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
    metaReaderClear(&mr);
    return terrno;
  }

  *suid = 0;
L
Liu Jicong 已提交
4443

D
dapan1121 已提交
4444
  if (mr.me.type == TSDB_CHILD_TABLE) {
D
dapan1121 已提交
4445
    tDecoderClear(&mr.coder);
D
dapan1121 已提交
4446
    *suid = mr.me.ctbEntry.suid;
H
Haojun Liao 已提交
4447
    code = metaGetTableEntryByUidCache(&mr, *suid);
D
dapan1121 已提交
4448 4449 4450 4451 4452 4453
    if (code != TSDB_CODE_SUCCESS) {
      terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
      metaReaderClear(&mr);
      return terrno;
    }
    sversion = mr.me.stbEntry.schemaRow.version;
H
Haojun Liao 已提交
4454
  } else if (mr.me.type == TSDB_NORMAL_TABLE) {
D
dapan1121 已提交
4455
    sversion = mr.me.ntbEntry.schemaRow.version;
H
Haojun Liao 已提交
4456 4457 4458 4459
  } else {
    terrno = TSDB_CODE_INVALID_PARA;
    metaReaderClear(&mr);
    return terrno;
D
dapan1121 已提交
4460 4461 4462
  }

  metaReaderClear(&mr);
4463
  *pSchema = metaGetTbTSchema(pVnode->pMeta, uid, sversion, 1);
L
Liu Jicong 已提交
4464

D
dapan1121 已提交
4465 4466
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
4467

H
Haojun Liao 已提交
4468
int32_t tsdbTakeReadSnap(STsdb* pTsdb, STsdbReadSnap** ppSnap, const char* idStr) {
H
Hongze Cheng 已提交
4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496
  int32_t code = 0;

  // alloc
  *ppSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(STsdbReadSnap));
  if (*ppSnap == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _exit;
  }

  // lock
  code = taosThreadRwlockRdlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _exit;
  }

  // take snapshot
  (*ppSnap)->pMem = pTsdb->mem;
  (*ppSnap)->pIMem = pTsdb->imem;

  if ((*ppSnap)->pMem) {
    tsdbRefMemTable((*ppSnap)->pMem);
  }

  if ((*ppSnap)->pIMem) {
    tsdbRefMemTable((*ppSnap)->pIMem);
  }

H
Hongze Cheng 已提交
4497
  // fs
H
Hongze Cheng 已提交
4498 4499 4500 4501 4502
  code = tsdbFSRef(pTsdb, &(*ppSnap)->fs);
  if (code) {
    taosThreadRwlockUnlock(&pTsdb->rwLock);
    goto _exit;
  }
H
Hongze Cheng 已提交
4503 4504 4505 4506 4507 4508 4509 4510

  // unlock
  code = taosThreadRwlockUnlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _exit;
  }

H
Haojun Liao 已提交
4511
  tsdbTrace("vgId:%d, take read snapshot, %s", TD_VID(pTsdb->pVnode), idStr);
H
Hongze Cheng 已提交
4512
_exit:
H
Hongze Cheng 已提交
4513 4514 4515
  return code;
}

H
Haojun Liao 已提交
4516
void tsdbUntakeReadSnap(STsdb* pTsdb, STsdbReadSnap* pSnap, const char* idStr) {
H
Hongze Cheng 已提交
4517 4518 4519 4520 4521 4522 4523 4524 4525
  if (pSnap) {
    if (pSnap->pMem) {
      tsdbUnrefMemTable(pSnap->pMem);
    }

    if (pSnap->pIMem) {
      tsdbUnrefMemTable(pSnap->pIMem);
    }

H
Hongze Cheng 已提交
4526
    tsdbFSUnref(pTsdb, &pSnap->fs);
H
Hongze Cheng 已提交
4527
    taosMemoryFree(pSnap);
H
Hongze Cheng 已提交
4528
  }
H
Haojun Liao 已提交
4529 4530
  tsdbTrace("vgId:%d, untake read snapshot, %s", TD_VID(pTsdb->pVnode), idStr);
}