tsdbRead.c 125.3 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Haojun Liao 已提交
16
#include "osDef.h"
H
Hongze Cheng 已提交
17
#include "tsdb.h"
18

H
Hongze Cheng 已提交
19
#define ASCENDING_TRAVERSE(o)  (o == TSDB_ORDER_ASC)
20
#define ALL_ROWS_CHECKED_INDEX (INT16_MIN)
21
#define INITIAL_ROW_INDEX_VAL  (-1)
H
Hongze Cheng 已提交
22

23 24 25 26 27 28
typedef enum {
  EXTERNAL_ROWS_PREV = 0x1,
  EXTERNAL_ROWS_MAIN = 0x2,
  EXTERNAL_ROWS_NEXT = 0x3,
} EContentData;

29
typedef struct {
dengyihao's avatar
dengyihao 已提交
30
  STbDataIter* iter;
31 32 33 34
  int32_t      index;
  bool         hasVal;
} SIterInfo;

35 36
typedef struct {
  int32_t numOfBlocks;
37
  int32_t numOfLastFiles;
38 39
} SBlockNumber;

H
Haojun Liao 已提交
40
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
41 42
  uint64_t  uid;
  TSKEY     lastKey;
H
Hongze Cheng 已提交
43 44 45 46 47 48 49 50
  SMapData  mapData;            // block info (compressed)
  SArray*   pBlockList;         // block data index list
  SIterInfo iter;               // mem buffer skip list iterator
  SIterInfo iiter;              // imem buffer skip list iterator
  SArray*   delSkyline;         // delete info for this table
  int32_t   fileDelIndex;       // file block delete index
  int32_t   lastBlockDelIndex;  // delete index for last block
  bool      iterInit;           // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
51 52 53
} STableBlockScanInfo;

typedef struct SBlockOrderWrapper {
dengyihao's avatar
dengyihao 已提交
54
  int64_t uid;
55
  int64_t offset;
H
Haojun Liao 已提交
56
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
57 58

typedef struct SBlockOrderSupporter {
59 60 61 62
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
63 64 65
} SBlockOrderSupporter;

typedef struct SIOCostSummary {
66 67 68
  int64_t numOfBlocks;
  double  blockLoadTime;
  double  buildmemBlock;
69
  int64_t headFileLoad;
70
  double  headFileLoadTime;
71
  int64_t smaDataLoad;
72
  double  smaLoadTime;
73 74
  int64_t lastBlockLoad;
  double  lastBlockLoadTime;
H
Hongze Cheng 已提交
75 76 77
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
78
  SArray*          pColAgg;
79
  SColumnDataAgg   tsColAgg;
C
Cary Xu 已提交
80
  SColumnDataAgg** plist;
81 82
  int16_t*         colIds;    // column ids for loading file block data
  char**           buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
H
Hongze Cheng 已提交
83 84
} SBlockLoadSuppInfo;

85 86 87
typedef struct SLastBlockReader {
  STimeWindow   window;
  SVersionRange verRange;
88
  int32_t       order;
89
  uint64_t      uid;
90
  SMergeTree    mergeTree;
91 92
} SLastBlockReader;

93
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
94 95 96
  int32_t           numOfFiles;  // number of total files
  int32_t           index;       // current accessed index in the list
  SArray*           pFileList;   // data file list
97
  int32_t           order;
H
Hongze Cheng 已提交
98
  SLastBlockReader* pLastBlockReader;  // last file block reader
99
} SFilesetIter;
H
Haojun Liao 已提交
100 101

typedef struct SFileDataBlockInfo {
102
  // index position in STableBlockScanInfo in order to check whether neighbor block overlaps with it
dengyihao's avatar
dengyihao 已提交
103
  uint64_t uid;
104
  int32_t  tbBlockIdx;
H
Haojun Liao 已提交
105 106 107
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
108
  int32_t   numOfBlocks;
109
  int32_t   index;
H
Hongze Cheng 已提交
110
  SArray*   blockList;  // SArray<SFileDataBlockInfo>
111
  int32_t   order;
H
Hongze Cheng 已提交
112
  SDataBlk  block;  // current SDataBlk data
113
  SHashObj* pTableMap;
H
Haojun Liao 已提交
114 115 116
} SDataBlockIter;

typedef struct SFileBlockDumpInfo {
dengyihao's avatar
dengyihao 已提交
117 118 119 120
  int32_t totalRows;
  int32_t rowIndex;
  int64_t lastKey;
  bool    allDumped;
H
Haojun Liao 已提交
121 122
} SFileBlockDumpInfo;

123
typedef struct SUidOrderCheckInfo {
124 125
  uint64_t* tableUidList;  // access table uid list in uid ascending order list
  int32_t   currentIndex;  // index in table uid list
126 127
} SUidOrderCheckInfo;

H
Haojun Liao 已提交
128
typedef struct SReaderStatus {
129
  bool                 loadFromFile;       // check file stage
130
  bool                 composedDataBlock;  // the returned data block is a composed block or not
131 132
  SHashObj*            pTableMap;          // SHash<STableBlockScanInfo>
  STableBlockScanInfo* pTableIter;         // table iterator used in building in-memory buffer data blocks.
133
  SUidOrderCheckInfo   uidCheckInfo;       // check all table in uid order
134
  SFileBlockDumpInfo   fBlockDumpInfo;
135 136 137 138
  SDFileSet*           pCurrentFileset;  // current opened file set
  SBlockData           fileBlockData;
  SFilesetIter         fileIter;
  SDataBlockIter       blockIter;
H
Haojun Liao 已提交
139 140
} SReaderStatus;

H
Hongze Cheng 已提交
141
struct STsdbReader {
H
Haojun Liao 已提交
142 143 144 145 146 147 148
  STsdb*             pTsdb;
  uint64_t           suid;
  int16_t            order;
  STimeWindow        window;  // the primary query time window that applies to all queries
  SSDataBlock*       pResBlock;
  int32_t            capacity;
  SReaderStatus      status;
149 150
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
151
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
152
  STsdbReadSnap*     pReadSnap;
153
  SIOCostSummary     cost;
154 155
  STSchema*          pSchema;     // the newest version schema
  STSchema*          pMemSchema;  // the previous schema for in-memory data, to avoid load schema too many times
156 157
  SDataFReader*      pFileReader;
  SVersionRange      verRange;
158

159 160
  int32_t      step;
  STsdbReader* innerReader[2];
H
Hongze Cheng 已提交
161
};
H
Hongze Cheng 已提交
162

H
Haojun Liao 已提交
163
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
164 165
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
166
static TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
167 168
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                        SRowMerger* pMerger);
H
Hongze Cheng 已提交
169 170
static int32_t  doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
                                       SRowMerger* pMerger);
171
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
dengyihao's avatar
dengyihao 已提交
172
                                 STsdbReader* pReader);
H
Haojun Liao 已提交
173
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow, uint64_t uid);
174
static int32_t  doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
H
Hongze Cheng 已提交
175
                                         int32_t rowIndex);
176
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
177
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order);
178

H
Hongze Cheng 已提交
179 180 181 182
static int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
                                        STSRow** pTSRow, STsdbReader* pReader, bool* freeTSRow);
static int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo,
                                  STsdbReader* pReader, STSRow** pTSRow);
183 184
static int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                                     STsdbReader* pReader);
185

dengyihao's avatar
dengyihao 已提交
186 187 188 189
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
190
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Hongze Cheng 已提交
191 192 193
static int64_t       getCurrentKeyInLastBlock(SLastBlockReader* pLastBlockReader);
static bool          hasDataInLastBlock(SLastBlockReader* pLastBlockReader);
static int32_t       doBuildDataBlock(STsdbReader* pReader);
H
Haojun Liao 已提交
194

195 196 197
static int32_t setColumnIdSlotList(STsdbReader* pReader, SSDataBlock* pBlock) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

198
  size_t numOfCols = blockDataGetNumOfCols(pBlock);
199

200
  pSupInfo->colIds = taosMemoryMalloc(numOfCols * sizeof(int16_t));
201
  pSupInfo->buildBuf = taosMemoryCalloc(numOfCols, POINTER_BYTES);
202 203 204
  if (pSupInfo->buildBuf == NULL || pSupInfo->colIds == NULL) {
    taosMemoryFree(pSupInfo->colIds);
    taosMemoryFree(pSupInfo->buildBuf);
H
Haojun Liao 已提交
205 206
    return TSDB_CODE_OUT_OF_MEMORY;
  }
H
Hongze Cheng 已提交
207

H
Haojun Liao 已提交
208 209
  for (int32_t i = 0; i < numOfCols; ++i) {
    SColumnInfoData* pCol = taosArrayGet(pBlock->pDataBlock, i);
210
    pSupInfo->colIds[i] = pCol->info.colId;
211 212 213 214

    if (IS_VAR_DATA_TYPE(pCol->info.type)) {
      pSupInfo->buildBuf[i] = taosMemoryMalloc(pCol->info.bytes);
    }
H
Haojun Liao 已提交
215
  }
H
Hongze Cheng 已提交
216

H
Haojun Liao 已提交
217 218
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
219

220
static SHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, const STableKeyInfo* idList, int32_t numOfTables) {
H
Haojun Liao 已提交
221
  // allocate buffer in order to load data blocks from file
222
  // todo use simple hash instead, optimize the memory consumption
223 224 225
  SHashObj* pTableMap =
      taosHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK);
  if (pTableMap == NULL) {
H
Haojun Liao 已提交
226 227 228
    return NULL;
  }

229
  int32_t step = ASCENDING_TRAVERSE(pTsdbReader->order)? 1:-1;
230
  for (int32_t j = 0; j < numOfTables; ++j) {
231
    STableBlockScanInfo info = {.lastKey = 0, .uid = idList[j].uid};
232
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
233
      info.lastKey = pTsdbReader->window.skey - step;
wmmhello's avatar
wmmhello 已提交
234
    } else {
235
      info.lastKey = pTsdbReader->window.ekey - step;
H
Haojun Liao 已提交
236
    }
wmmhello's avatar
wmmhello 已提交
237

238 239 240
    taosHashPut(pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info));
    tsdbDebug("%p check table uid:%" PRId64 " from lastKey:%" PRId64 " %s", pTsdbReader, info.uid, info.lastKey,
              pTsdbReader->idStr);
H
Haojun Liao 已提交
241 242
  }

243 244
  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->idStr);
245

246
  return pTableMap;
H
Hongze Cheng 已提交
247
}
H
Hongze Cheng 已提交
248

H
Haojun Liao 已提交
249
static void resetDataBlockScanInfo(SHashObj* pTableMap, int64_t ts) {
250 251
  STableBlockScanInfo* p = NULL;

dengyihao's avatar
dengyihao 已提交
252
  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
253 254
    p->iterInit = false;
    p->iiter.hasVal = false;
dengyihao's avatar
dengyihao 已提交
255
    if (p->iter.iter != NULL) {
256
      p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
257 258
    }

259
    p->delSkyline = taosArrayDestroy(p->delSkyline);
H
Haojun Liao 已提交
260
    p->lastKey = ts;
261 262 263
  }
}

264 265 266 267 268 269 270 271
static void destroyBlockScanInfo(SHashObj* pTableMap) {
  STableBlockScanInfo* p = NULL;

  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
    p->iterInit = false;
    p->iiter.hasVal = false;

    if (p->iter.iter != NULL) {
272
      p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
273 274 275
    }

    if (p->iiter.iter != NULL) {
276
      p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
277 278
    }

279 280
    p->delSkyline = taosArrayDestroy(p->delSkyline);
    p->pBlockList = taosArrayDestroy(p->pBlockList);
281
    tMapDataClear(&p->mapData);
282 283 284 285 286
  }

  taosHashCleanup(pTableMap);
}

287
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) {
288 289
  ASSERT(pWindow != NULL);
  return pWindow->skey > pWindow->ekey;
H
Haojun Liao 已提交
290
}
H
Hongze Cheng 已提交
291

292 293 294
// Update the query time window according to the data time to live(TTL) information, in order to avoid to return
// the expired data to client, even it is queried already.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
dengyihao's avatar
dengyihao 已提交
295
  STsdbKeepCfg* pCfg = &pTsdb->keepCfg;
H
Hongze Cheng 已提交
296

297
  int64_t now = taosGetTimestamp(pCfg->precision);
dengyihao's avatar
dengyihao 已提交
298
  int64_t earilyTs = now - (tsTickPerMin[pCfg->precision] * pCfg->keep2) + 1;  // needs to add one tick
299

dengyihao's avatar
dengyihao 已提交
300
  STimeWindow win = *pWindow;
301 302 303 304 305 306
  if (win.skey < earilyTs) {
    win.skey = earilyTs;
  }

  return win;
}
H
Hongze Cheng 已提交
307

H
Haojun Liao 已提交
308
static void limitOutputBufferSize(const SQueryTableDataCond* pCond, int32_t* capacity) {
H
Haojun Liao 已提交
309 310 311 312 313 314
  int32_t rowLen = 0;
  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    rowLen += pCond->colList[i].bytes;
  }

  // make sure the output SSDataBlock size be less than 2MB.
H
Haojun Liao 已提交
315 316 317
  const int32_t TWOMB = 2 * 1024 * 1024;
  if ((*capacity) * rowLen > TWOMB) {
    (*capacity) = TWOMB / rowLen;
H
Haojun Liao 已提交
318 319 320 321
  }
}

// init file iterator
H
Hongze Cheng 已提交
322 323
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet,
                                   STsdbReader* pReader /*int32_t order, const char* idstr*/) {
H
Hongze Cheng 已提交
324
  size_t numOfFileset = taosArrayGetSize(aDFileSet);
325

326 327
  pIter->index = ASCENDING_TRAVERSE(pReader->order) ? -1 : numOfFileset;
  pIter->order = pReader->order;
H
Hongze Cheng 已提交
328
  pIter->pFileList = aDFileSet;
329
  pIter->numOfFiles = numOfFileset;
H
Haojun Liao 已提交
330

331 332 333 334
  if (pIter->pLastBlockReader == NULL) {
    pIter->pLastBlockReader = taosMemoryCalloc(1, sizeof(struct SLastBlockReader));
    if (pIter->pLastBlockReader == NULL) {
      int32_t code = TSDB_CODE_OUT_OF_MEMORY;
335
      tsdbError("failed to prepare the last block iterator, code:%d %s", tstrerror(code), pReader->idStr);
336 337
      return code;
    }
338 339
  }

340 341 342 343 344 345 346 347
  SLastBlockReader* pLReader = pIter->pLastBlockReader;
  pLReader->order = pReader->order;
  pLReader->window = pReader->window;
  pLReader->verRange = pReader->verRange;

  pLReader->uid = 0;
  tMergeTreeClose(&pLReader->mergeTree);

348
  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
H
Haojun Liao 已提交
349 350 351
  return TSDB_CODE_SUCCESS;
}

352
static bool filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader) {
353 354
  bool    asc = ASCENDING_TRAVERSE(pIter->order);
  int32_t step = asc ? 1 : -1;
355 356 357
  pIter->index += step;

  if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
H
Haojun Liao 已提交
358 359 360
    return false;
  }

361 362 363
  pIter->pLastBlockReader->uid = 0;
  tMergeTreeClose(&pIter->pLastBlockReader->mergeTree);

H
Haojun Liao 已提交
364 365
  // check file the time range of coverage
  STimeWindow win = {0};
H
Hongze Cheng 已提交
366

367
  while (1) {
H
Haojun Liao 已提交
368 369 370
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }
371

372
    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);
H
Haojun Liao 已提交
373

374 375 376 377
    int32_t code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
H
Haojun Liao 已提交
378

379 380
    pReader->cost.headFileLoad += 1;

381 382 383 384 385 386 387 388 389 390 391 392
    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);

    // current file are no longer overlapped with query time window, ignore remain files
    if ((asc && win.skey > pReader->window.ekey) || (!asc && win.ekey < pReader->window.skey)) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return false;
    }

    if ((asc && (win.ekey < pReader->window.skey)) || ((!asc) && (win.skey > pReader->window.ekey))) {
      pIter->index += step;
393 394 395
      if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
        return false;
      }
396 397
      continue;
    }
C
Cary Xu 已提交
398

399
    tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
400
              pReader->window.ekey, pReader->idStr);
401 402
    return true;
  }
403

404
_err:
H
Haojun Liao 已提交
405 406 407
  return false;
}

408
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
409 410
  pIter->order = order;
  pIter->index = -1;
411
  pIter->numOfBlocks = 0;
412 413 414 415 416 417 418
  if (pIter->blockList == NULL) {
    pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
  } else {
    taosArrayClear(pIter->blockList);
  }
}

L
Liu Jicong 已提交
419
static void cleanupDataBlockIterator(SDataBlockIter* pIter) { taosArrayDestroy(pIter->blockList); }
H
Haojun Liao 已提交
420

H
Haojun Liao 已提交
421
static void initReaderStatus(SReaderStatus* pStatus) {
dengyihao's avatar
dengyihao 已提交
422 423
  pStatus->pTableIter = NULL;
  pStatus->loadFromFile = true;
H
Haojun Liao 已提交
424 425
}

426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pResBlock = createDataBlock();
  if (pResBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    SColumnInfoData colInfo = {{0}, 0};
    colInfo.info = pCond->colList[i];
    blockDataAppendColInfo(pResBlock, &colInfo);
  }

  int32_t code = blockDataEnsureCapacity(pResBlock, capacity);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    taosMemoryFree(pResBlock);
    return NULL;
  }

  return pResBlock;
}

449 450
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, STsdbReader** ppReader, int32_t capacity,
                                const char* idstr) {
H
Haojun Liao 已提交
451
  int32_t      code = 0;
452
  int8_t       level = 0;
H
Haojun Liao 已提交
453
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
H
Hongze Cheng 已提交
454 455
  if (pReader == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
456
    goto _end;
H
Hongze Cheng 已提交
457 458
  }

C
Cary Xu 已提交
459 460 461 462
  if (VND_IS_TSMA(pVnode)) {
    tsdbDebug("vgId:%d, tsma is selected to query", TD_VID(pVnode));
  }

H
Haojun Liao 已提交
463
  initReaderStatus(&pReader->status);
464

L
Liu Jicong 已提交
465
  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
dengyihao's avatar
dengyihao 已提交
466 467
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
468
  pReader->capacity = 4096;
dengyihao's avatar
dengyihao 已提交
469 470
  pReader->idStr = (idstr != NULL) ? strdup(idstr) : NULL;
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
471
  pReader->type = pCond->type;
472
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
473

474
  ASSERT(pCond->numOfCols > 0);
H
Hongze Cheng 已提交
475

476
  limitOutputBufferSize(pCond, &pReader->capacity);
477

478 479
  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
480
  pSup->pColAgg = taosArrayInit(4, sizeof(SColumnDataAgg));
481
  pSup->plist = taosMemoryCalloc(pCond->numOfCols, POINTER_BYTES);
482
  if (pSup->pColAgg == NULL || pSup->plist == NULL) {
483 484 485
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }
H
Haojun Liao 已提交
486

487 488
  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;

H
Hongze Cheng 已提交
489
  code = tBlockDataCreate(&pReader->status.fileBlockData);
H
Haojun Liao 已提交
490 491 492 493 494
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

495 496 497 498
  pReader->pResBlock = createResBlock(pCond, pReader->capacity);
  if (pReader->pResBlock == NULL) {
    code = terrno;
    goto _end;
H
Hongze Cheng 已提交
499
  }
H
Hongze Cheng 已提交
500

501 502
  setColumnIdSlotList(pReader, pReader->pResBlock);

H
Hongze Cheng 已提交
503 504
  *ppReader = pReader;
  return code;
H
Hongze Cheng 已提交
505

H
Haojun Liao 已提交
506 507
_end:
  tsdbReaderClose(pReader);
H
Hongze Cheng 已提交
508 509 510
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
511

H
Haojun Liao 已提交
512
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
513
  SArray* aBlockIdx = taosArrayInit(8, sizeof(SBlockIdx));
H
Hongze Cheng 已提交
514

515
  int64_t st = taosGetTimestampUs();
H
Hongze Cheng 已提交
516
  int32_t code = tsdbReadBlockIdx(pFileReader, aBlockIdx);
H
Haojun Liao 已提交
517
  if (code != TSDB_CODE_SUCCESS) {
518
    goto _end;
H
Haojun Liao 已提交
519
  }
H
Hongze Cheng 已提交
520

521 522
  size_t num = taosArrayGetSize(aBlockIdx);
  if (num == 0) {
H
Haojun Liao 已提交
523
    taosArrayDestroy(aBlockIdx);
H
Haojun Liao 已提交
524 525
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
526

527 528 529 530
  int64_t et1 = taosGetTimestampUs();

  SBlockIdx* pBlockIdx = NULL;
  for (int32_t i = 0; i < num; ++i) {
531
    pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);
H
Haojun Liao 已提交
532

533
    // uid check
H
Hongze Cheng 已提交
534
    if (pBlockIdx->suid != pReader->suid) {
H
Haojun Liao 已提交
535 536 537 538
      continue;
    }

    // this block belongs to a table that is not queried.
H
Hongze Cheng 已提交
539
    void* p = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(uint64_t));
H
Haojun Liao 已提交
540 541 542 543 544 545
    if (p == NULL) {
      continue;
    }

    STableBlockScanInfo* pScanInfo = p;
    if (pScanInfo->pBlockList == NULL) {
546
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(int32_t));
H
Haojun Liao 已提交
547 548
    }

H
Hongze Cheng 已提交
549
    taosArrayPush(pIndexList, pBlockIdx);
H
Haojun Liao 已提交
550
  }
H
Hongze Cheng 已提交
551

552
  int64_t et2 = taosGetTimestampUs();
553
  tsdbDebug("load block index for %d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
554
            (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0, pReader->idStr);
555 556 557

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

558
_end:
H
Hongze Cheng 已提交
559
  taosArrayDestroy(aBlockIdx);
H
Haojun Liao 已提交
560 561
  return code;
}
H
Hongze Cheng 已提交
562

563
static void cleanupTableScanInfo(SHashObj* pTableMap) {
564
  STableBlockScanInfo* px = NULL;
dengyihao's avatar
dengyihao 已提交
565
  while (1) {
566
    px = taosHashIterate(pTableMap, px);
567 568 569 570
    if (px == NULL) {
      break;
    }

571
    // reset the index in last block when handing a new file
572
    tMapDataClear(&px->mapData);
573 574
    taosArrayClear(px->pBlockList);
  }
575 576
}

577
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, SBlockNumber* pBlockNum) {
578 579 580 581 582 583
  int32_t numOfQTable = 0;
  size_t  sizeInDisk = 0;
  size_t  numOfTables = taosArrayGetSize(pIndexList);

  int64_t st = taosGetTimestampUs();
  cleanupTableScanInfo(pReader->status.pTableMap);
584

dengyihao's avatar
dengyihao 已提交
585
  for (int32_t i = 0; i < numOfTables; ++i) {
H
Haojun Liao 已提交
586
    SBlockIdx* pBlockIdx = taosArrayGet(pIndexList, i);
H
Hongze Cheng 已提交
587

588
    STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(int64_t));
H
Hongze Cheng 已提交
589

590
    tMapDataReset(&pScanInfo->mapData);
H
Hongze Cheng 已提交
591
    tsdbReadBlock(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
592

593
    sizeInDisk += pScanInfo->mapData.nData;
594
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
H
Hongze Cheng 已提交
595 596
      SDataBlk block = {0};
      tMapDataGetItemByIdx(&pScanInfo->mapData, j, &block, tGetDataBlk);
H
Hongze Cheng 已提交
597

598
      // 1. time range check
599
      if (block.minKey.ts > pReader->window.ekey || block.maxKey.ts < pReader->window.skey) {
H
Haojun Liao 已提交
600 601
        continue;
      }
H
Hongze Cheng 已提交
602

603
      // 2. version range check
H
Hongze Cheng 已提交
604
      if (block.minVer > pReader->verRange.maxVer || block.maxVer < pReader->verRange.minVer) {
605 606
        continue;
      }
607

608
      void* p = taosArrayPush(pScanInfo->pBlockList, &j);
H
Haojun Liao 已提交
609
      if (p == NULL) {
610
        tMapDataClear(&pScanInfo->mapData);
H
Haojun Liao 已提交
611 612
        return TSDB_CODE_OUT_OF_MEMORY;
      }
613

614
      pBlockNum->numOfBlocks += 1;
H
Haojun Liao 已提交
615
    }
H
Hongze Cheng 已提交
616

H
Haojun Liao 已提交
617
    if (pScanInfo->pBlockList != NULL && taosArrayGetSize(pScanInfo->pBlockList) > 0) {
618 619 620 621
      numOfQTable += 1;
    }
  }

H
Hongze Cheng 已提交
622
  pBlockNum->numOfLastFiles = pReader->pFileReader->pSet->nSttF;
623
  int32_t total = pBlockNum->numOfLastFiles + pBlockNum->numOfBlocks;
624

625
  double el = (taosGetTimestampUs() - st) / 1000.0;
H
Hongze Cheng 已提交
626
  tsdbDebug(
627
      "load block of %d tables completed, blocks:%d in %d tables, last-files:%d, block-info-size:%.2f Kb, elapsed "
628
      "time:%.2f ms %s",
629
      numOfTables, pBlockNum->numOfBlocks, numOfQTable, pBlockNum->numOfLastFiles, sizeInDisk / 1000.0, el,
H
Hongze Cheng 已提交
630
      pReader->idStr);
631

632
  pReader->cost.numOfBlocks += total;
633
  pReader->cost.headFileLoadTime += el;
634

H
Haojun Liao 已提交
635 636
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
637

638
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, int64_t maxKey, int32_t order) {
639
  int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;
640
  pDumpInfo->allDumped = true;
641
  pDumpInfo->lastKey = maxKey + step;
H
Haojun Liao 已提交
642 643
}

644 645
static void doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
                         SBlockLoadSuppInfo* pSup) {
H
Haojun Liao 已提交
646
  if (IS_VAR_DATA_TYPE(pColVal->type)) {
647
    if (pColVal->isNull || pColVal->isNone) {
H
Haojun Liao 已提交
648 649 650
      colDataAppendNULL(pColInfoData, rowIndex);
    } else {
      varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
H
Haojun Liao 已提交
651
      ASSERT(pColVal->value.nData <= pColInfoData->info.bytes);
H
Haojun Liao 已提交
652 653 654 655
      memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
      colDataAppend(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
    }
  } else {
656
    colDataAppend(pColInfoData, rowIndex, (const char*)&pColVal->value, pColVal->isNull || pColVal->isNone);
H
Haojun Liao 已提交
657
  }
H
Haojun Liao 已提交
658 659
}

660
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
661 662
  if (taosArrayGetSize(pBlockIter->blockList) == 0) {
    ASSERT(pBlockIter->numOfBlocks == taosArrayGetSize(pBlockIter->blockList));
663 664
    return NULL;
  }
665 666 667

  SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
  return pBlockInfo;
668 669
}

H
Hongze Cheng 已提交
670
static SDataBlk* getCurrentBlock(SDataBlockIter* pBlockIter) { return &pBlockIter->block; }
671

672
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
673
  SReaderStatus*  pStatus = &pReader->status;
674
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
H
Hongze Cheng 已提交
675

676
  SBlockData*         pBlockData = &pStatus->fileBlockData;
H
Haojun Liao 已提交
677
  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
H
Hongze Cheng 已提交
678
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
H
Haojun Liao 已提交
679
  SSDataBlock*        pResBlock = pReader->pResBlock;
680
  int32_t             numOfOutputCols = blockDataGetNumOfCols(pResBlock);
H
Haojun Liao 已提交
681

H
Haojun Liao 已提交
682
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
683
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
H
Haojun Liao 已提交
684

H
Haojun Liao 已提交
685
  SColVal cv = {0};
686
  int64_t st = taosGetTimestampUs();
687 688
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;
689

690
  int32_t rowIndex = 0;
691 692
  int32_t remain = asc ? (pBlockData->nRow - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex + 1);

693 694 695 696 697 698 699 700
  int32_t endIndex = 0;
  if (remain <= pReader->capacity) {
    endIndex = pBlockData->nRow;
  } else {
    endIndex = pDumpInfo->rowIndex + step * pReader->capacity;
    remain = pReader->capacity;
  }

701
  int32_t          i = 0;
702 703
  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
704
    for (int32_t j = pDumpInfo->rowIndex; j < endIndex && j >= 0; j += step) {
705 706 707 708 709
      colDataAppend(pColData, rowIndex++, (const char*)&pBlockData->aTSKEY[j], false);
    }
    i += 1;
  }

710 711 712
  int32_t colIndex = 0;
  int32_t num = taosArrayGetSize(pBlockData->aIdx);
  while (i < numOfOutputCols && colIndex < num) {
713 714 715
    rowIndex = 0;
    pColData = taosArrayGet(pResBlock->pDataBlock, i);

H
Hongze Cheng 已提交
716
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);
717 718 719
    if (pData->cid < pColData->info.colId) {
      colIndex += 1;
    } else if (pData->cid == pColData->info.colId) {
720
      for (int32_t j = pDumpInfo->rowIndex; j < endIndex && j >= 0; j += step) {
721 722
        tColDataGetValue(pData, j, &cv);
        doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
H
Haojun Liao 已提交
723
      }
724
      colIndex += 1;
725
      i += 1;
726
      ASSERT(rowIndex == remain);
727 728
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNNULL(pColData, 0, remain);
729
      i += 1;
H
Haojun Liao 已提交
730
    }
731 732
  }

733
  while (i < numOfOutputCols) {
734 735 736
    pColData = taosArrayGet(pResBlock->pDataBlock, i);
    colDataAppendNNULL(pColData, 0, remain);
    i += 1;
H
Haojun Liao 已提交
737
  }
H
Haojun Liao 已提交
738

739
  pResBlock->info.rows = remain;
740
  pDumpInfo->rowIndex += step * remain;
741

742
  setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
H
Haojun Liao 已提交
743

744
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
H
Haojun Liao 已提交
745
  pReader->cost.blockLoadTime += elapsedTime;
H
Haojun Liao 已提交
746

747
  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
H
Haojun Liao 已提交
748
  tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
749
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
750
            pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, remain, unDumpedRows,
H
Hongze Cheng 已提交
751
            pBlock->minVer, pBlock->maxVer, elapsedTime, pReader->idStr);
752 753 754 755

  return TSDB_CODE_SUCCESS;
}

756
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter, SBlockData* pBlockData) {
757 758
  int64_t st = taosGetTimestampUs();

759
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
760
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
761
  ASSERT(pBlockInfo != NULL);
762

H
Hongze Cheng 已提交
763 764
  SDataBlk* pBlock = getCurrentBlock(pBlockIter);
  int32_t   code = tsdbReadDataBlock(pReader->pFileReader, pBlock, pBlockData);
765 766 767
  if (code != TSDB_CODE_SUCCESS) {
    tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
              ", rows:%d, code:%s %s",
768
              pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
769 770 771
              tstrerror(code), pReader->idStr);
    return code;
  }
772

773
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
774

775 776 777 778
  tsdbDebug("%p load file block into buffer, global index:%d, index in table block list:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pBlockInfo->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
            pBlock->minVer, pBlock->maxVer, elapsedTime, pReader->idStr);
779 780 781

  pReader->cost.blockLoadTime += elapsedTime;
  pDumpInfo->allDumped = false;
782

H
Haojun Liao 已提交
783
  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
784
}
H
Hongze Cheng 已提交
785

H
Haojun Liao 已提交
786 787 788
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);
H
Hongze Cheng 已提交
789

H
Haojun Liao 已提交
790 791 792 793
  for (int32_t i = 0; i < pSup->numOfTables; ++i) {
    SBlockOrderWrapper* pBlockInfo = pSup->pDataBlockInfo[i];
    taosMemoryFreeClear(pBlockInfo);
  }
H
Hongze Cheng 已提交
794

H
Haojun Liao 已提交
795 796
  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
797

H
Haojun Liao 已提交
798 799
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  ASSERT(numOfTables >= 1);
H
Hongze Cheng 已提交
800

H
Haojun Liao 已提交
801
  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
802 803
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);
H
Hongze Cheng 已提交
804

H
Haojun Liao 已提交
805 806 807 808
  if (pSup->numOfBlocksPerTable == NULL || pSup->indexPerTable == NULL || pSup->pDataBlockInfo == NULL) {
    cleanupBlockOrderSupporter(pSup);
    return TSDB_CODE_OUT_OF_MEMORY;
  }
H
Hongze Cheng 已提交
809

H
Haojun Liao 已提交
810 811
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
812

H
Haojun Liao 已提交
813
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
814
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
815
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
816

H
Haojun Liao 已提交
817
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
818

H
Haojun Liao 已提交
819 820
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
821

H
Haojun Liao 已提交
822 823 824 825 826 827 828
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
829

830
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
831
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
832

833 834 835 836
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter) {
837 838 839
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  if (pBlockInfo != NULL) {
    STableBlockScanInfo* pScanInfo = taosHashGet(pBlockIter->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
840
    int32_t*             mapDataIndex = taosArrayGet(pScanInfo->pBlockList, pBlockInfo->tbBlockIdx);
H
Hongze Cheng 已提交
841
    tMapDataGetItemByIdx(&pScanInfo->mapData, *mapDataIndex, &pBlockIter->block, tGetDataBlk);
842
  }
843 844 845 846 847 848

#if 0
  qDebug("check file block, table uid:%"PRIu64" index:%d offset:%"PRId64", ", pScanInfo->uid, *mapDataIndex, pBlockIter->block.aSubBlock[0].offset);
#endif

  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
849
}
H
Hongze Cheng 已提交
850

851
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks) {
852
  bool asc = ASCENDING_TRAVERSE(pReader->order);
H
Haojun Liao 已提交
853

854
  pBlockIter->numOfBlocks = numOfBlocks;
855
  taosArrayClear(pBlockIter->blockList);
856
  pBlockIter->pTableMap = pReader->status.pTableMap;
857

858 859
  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = (int32_t)taosHashGetSize(pReader->status.pTableMap);
H
Haojun Liao 已提交
860

861
  int64_t st = taosGetTimestampUs();
H
Haojun Liao 已提交
862

863
  SBlockOrderSupporter sup = {0};
864
  int32_t              code = initBlockOrderSupporter(&sup, numOfTables);
865 866 867
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
H
Haojun Liao 已提交
868

869 870 871 872 873 874 875
  int32_t cnt = 0;
  void*   ptr = NULL;
  while (1) {
    ptr = taosHashIterate(pReader->status.pTableMap, ptr);
    if (ptr == NULL) {
      break;
    }
H
Haojun Liao 已提交
876

877 878 879 880
    STableBlockScanInfo* pTableScanInfo = (STableBlockScanInfo*)ptr;
    if (pTableScanInfo->pBlockList == NULL || taosArrayGetSize(pTableScanInfo->pBlockList) == 0) {
      continue;
    }
H
Haojun Liao 已提交
881

882 883
    size_t num = taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = num;
H
Haojun Liao 已提交
884

885 886 887 888 889
    char* buf = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * num);
    if (buf == NULL) {
      cleanupBlockOrderSupporter(&sup);
      return TSDB_CODE_TDB_OUT_OF_MEMORY;
    }
H
Haojun Liao 已提交
890

891
    sup.pDataBlockInfo[sup.numOfTables] = (SBlockOrderWrapper*)buf;
H
Hongze Cheng 已提交
892
    SDataBlk block = {0};
893 894
    for (int32_t k = 0; k < num; ++k) {
      SBlockOrderWrapper wrapper = {0};
895 896

      int32_t* mapDataIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
H
Hongze Cheng 已提交
897
      tMapDataGetItemByIdx(&pTableScanInfo->mapData, *mapDataIndex, &block, tGetDataBlk);
898

899
      wrapper.uid = pTableScanInfo->uid;
900
      wrapper.offset = block.aSubBlock[0].offset;
H
Haojun Liao 已提交
901

902 903 904 905 906 907
      sup.pDataBlockInfo[sup.numOfTables][k] = wrapper;
      cnt++;
    }

    sup.numOfTables += 1;
  }
H
Haojun Liao 已提交
908

909
  ASSERT(numOfBlocks == cnt);
H
Haojun Liao 已提交
910

911
  // since there is only one table qualified, blocks are not sorted
912 913
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
914 915
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      taosArrayPush(pBlockIter->blockList, &blockInfo);
916
    }
917

918
    int64_t et = taosGetTimestampUs();
919
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
920
              pReader, numOfBlocks, (et - st) / 1000.0, pReader->idStr);
H
Haojun Liao 已提交
921

922
    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
H
Haojun Liao 已提交
923
    cleanupBlockOrderSupporter(&sup);
924
    doSetCurrentBlock(pBlockIter);
925
    return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
926
  }
H
Haojun Liao 已提交
927

928 929
  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);
930

931
  ASSERT(cnt <= numOfBlocks && sup.numOfTables <= numOfTables);
H
Haojun Liao 已提交
932

933 934 935 936 937
  SMultiwayMergeTreeInfo* pTree = NULL;
  uint8_t                 ret = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
  if (ret != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_TDB_OUT_OF_MEMORY;
H
Haojun Liao 已提交
938
  }
H
Haojun Liao 已提交
939

940 941 942 943
  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;
H
Haojun Liao 已提交
944

945 946
    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    taosArrayPush(pBlockIter->blockList, &blockInfo);
H
Haojun Liao 已提交
947

948 949 950 951
    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }
H
Haojun Liao 已提交
952

953 954
    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
H
Haojun Liao 已提交
955
  }
H
Haojun Liao 已提交
956

957
  int64_t et = taosGetTimestampUs();
H
Hongze Cheng 已提交
958 959
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, numOfBlocks,
            (et - st) / 1000.0, pReader->idStr);
960 961
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);
H
Haojun Liao 已提交
962

963
  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
964 965
  doSetCurrentBlock(pBlockIter);

966
  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
967
}
H
Hongze Cheng 已提交
968

H
Haojun Liao 已提交
969
static bool blockIteratorNext(SDataBlockIter* pBlockIter) {
970 971
  bool asc = ASCENDING_TRAVERSE(pBlockIter->order);

972
  int32_t step = asc ? 1 : -1;
973
  if ((pBlockIter->index >= pBlockIter->numOfBlocks - 1 && asc) || (pBlockIter->index <= 0 && (!asc))) {
974 975 976
    return false;
  }

977
  pBlockIter->index += step;
978 979
  doSetCurrentBlock(pBlockIter);

980 981 982
  return true;
}

983 984 985
/**
 * This is an two rectangles overlap cases.
 */
H
Hongze Cheng 已提交
986
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SDataBlk* pBlock) {
987 988
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
H
Hongze Cheng 已提交
989 990
         (pVerRange->minVer > pBlock->minVer && pVerRange->minVer <= pBlock->maxVer) ||
         (pVerRange->maxVer < pBlock->maxVer && pVerRange->maxVer >= pBlock->minVer);
H
Haojun Liao 已提交
991
}
H
Hongze Cheng 已提交
992

H
Hongze Cheng 已提交
993 994
static SDataBlk* getNeighborBlockOfSameTable(SFileDataBlockInfo* pFBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                             int32_t* nextIndex, int32_t order) {
995 996 997
  bool asc = ASCENDING_TRAVERSE(order);
  if (asc && pFBlockInfo->tbBlockIdx >= taosArrayGetSize(pTableBlockScanInfo->pBlockList) - 1) {
    return NULL;
998 999
  }

1000
  if (!asc && pFBlockInfo->tbBlockIdx == 0) {
1001 1002 1003
    return NULL;
  }

1004
  int32_t step = asc ? 1 : -1;
1005
  *nextIndex = pFBlockInfo->tbBlockIdx + step;
1006

H
Hongze Cheng 已提交
1007 1008
  SDataBlk* pBlock = taosMemoryCalloc(1, sizeof(SDataBlk));
  int32_t*  indexInMapdata = taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex);
1009

H
Hongze Cheng 已提交
1010
  tMapDataGetItemByIdx(&pTableBlockScanInfo->mapData, *indexInMapdata, pBlock, tGetDataBlk);
1011
  return pBlock;
1012 1013 1014 1015 1016
}

static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
  ASSERT(pBlockIter != NULL && pFBlockInfo != NULL);

1017
  int32_t step = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;
1018 1019
  int32_t index = pBlockIter->index;

1020
  while (index < pBlockIter->numOfBlocks && index >= 0) {
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032
    SFileDataBlockInfo* pFBlock = taosArrayGet(pBlockIter->blockList, index);
    if (pFBlock->uid == pFBlockInfo->uid && pFBlock->tbBlockIdx == pFBlockInfo->tbBlockIdx) {
      return index;
    }

    index += step;
  }

  ASSERT(0);
  return -1;
}

1033
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
1034
  if (index < 0 || index >= pBlockIter->numOfBlocks) {
1035 1036 1037 1038
    return -1;
  }

  SFileDataBlockInfo fblock = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);
1039 1040 1041 1042 1043
  pBlockIter->index += step;

  if (index != pBlockIter->index) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, pBlockIter->index, &fblock);
1044

1045 1046 1047
    SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
    ASSERT(pBlockInfo->uid == fblock.uid && pBlockInfo->tbBlockIdx == fblock.tbBlockIdx);
  }
1048

1049
  doSetCurrentBlock(pBlockIter);
1050 1051 1052
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
1053
static bool overlapWithNeighborBlock(SDataBlk* pBlock, SDataBlk* pNeighbor, int32_t order) {
1054 1055 1056 1057 1058 1059
  // it is the last block in current file, no chance to overlap with neighbor blocks.
  if (ASCENDING_TRAVERSE(order)) {
    return pBlock->maxKey.ts == pNeighbor->minKey.ts;
  } else {
    return pBlock->minKey.ts == pNeighbor->maxKey.ts;
  }
H
Haojun Liao 已提交
1060
}
H
Hongze Cheng 已提交
1061

H
Hongze Cheng 已提交
1062
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SDataBlk* pBlock) {
H
Haojun Liao 已提交
1063
  bool ascScan = ASCENDING_TRAVERSE(order);
H
Hongze Cheng 已提交
1064

1065
  return (ascScan && (key.ts != TSKEY_INITIAL_VAL && key.ts <= pBlock->minKey.ts)) ||
1066
         (!ascScan && (key.ts != TSKEY_INITIAL_VAL && key.ts >= pBlock->maxKey.ts));
H
Haojun Liao 已提交
1067
}
H
Hongze Cheng 已提交
1068

H
Hongze Cheng 已提交
1069
static bool keyOverlapFileBlock(TSDBKEY key, SDataBlk* pBlock, SVersionRange* pVerRange) {
H
Hongze Cheng 已提交
1070 1071
  return (key.ts >= pBlock->minKey.ts && key.ts <= pBlock->maxKey.ts) && (pBlock->maxVer >= pVerRange->minVer) &&
         (pBlock->minVer <= pVerRange->maxVer);
H
Haojun Liao 已提交
1072 1073
}

H
Hongze Cheng 已提交
1074
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock) {
1075 1076 1077 1078 1079
  size_t num = taosArrayGetSize(pBlockScanInfo->delSkyline);

  for (int32_t i = pBlockScanInfo->fileDelIndex; i < num; i += 1) {
    TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, i);
    if (p->ts >= pBlock->minKey.ts && p->ts <= pBlock->maxKey.ts) {
H
Hongze Cheng 已提交
1080
      if (p->version >= pBlock->minVer) {
1081 1082 1083
        return true;
      }
    } else if (p->ts < pBlock->minKey.ts) {  // p->ts < pBlock->minKey.ts
H
Hongze Cheng 已提交
1084
      if (p->version >= pBlock->minVer) {
1085 1086 1087 1088 1089 1090 1091
        if (i < num - 1) {
          TSDBKEY* pnext = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
          if (i + 1 == num - 1) {  // pnext is the last point
            if (pnext->ts >= pBlock->minKey.ts) {
              return true;
            }
          } else {
H
Hongze Cheng 已提交
1092
            if (pnext->ts >= pBlock->minKey.ts && pnext->version >= pBlock->minVer) {
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
              return true;
            }
          }
        } else {  // it must be the last point
          ASSERT(p->version == 0);
        }
      }
    } else {  // (p->ts > pBlock->maxKey.ts) {
      return false;
    }
  }

  return false;
}

H
Hongze Cheng 已提交
1108
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock, int32_t order) {
1109 1110 1111 1112
  if (pBlockScanInfo->delSkyline == NULL) {
    return false;
  }

1113
  // ts is not overlap
1114
  TSDBKEY* pFirst = taosArrayGet(pBlockScanInfo->delSkyline, 0);
L
Liu Jicong 已提交
1115
  TSDBKEY* pLast = taosArrayGetLast(pBlockScanInfo->delSkyline);
1116 1117 1118 1119 1120
  if (pBlock->minKey.ts > pLast->ts || pBlock->maxKey.ts < pFirst->ts) {
    return false;
  }

  // version is not overlap
1121 1122 1123 1124
  if (ASCENDING_TRAVERSE(order)) {
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock);
  } else {
    int32_t index = pBlockScanInfo->fileDelIndex;
1125
    while (1) {
1126 1127 1128 1129 1130
      TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, index);
      if (p->ts > pBlock->minKey.ts && index > 0) {
        index -= 1;
      } else {  // find the first point that is smaller than the minKey.ts of dataBlock.
        break;
1131 1132 1133
      }
    }

1134 1135
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock);
  }
1136 1137
}

1138 1139 1140 1141
// 1. the version of all rows should be less than the endVersion
// 2. current block should not overlap with next neighbor block
// 3. current timestamp should not be overlap with each other
// 4. output buffer should be large enough to hold all rows in current block
1142
// 5. delete info should not overlap with current block data
H
Hongze Cheng 已提交
1143
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pFBlock, SDataBlk* pBlock,
1144
                                STableBlockScanInfo* pScanInfo, TSDBKEY key, SLastBlockReader* pLastBlockReader) {
H
Hongze Cheng 已提交
1145 1146
  int32_t   neighborIndex = 0;
  SDataBlk* pNeighbor = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &neighborIndex, pReader->order);
1147

1148
  // overlap with neighbor
1149 1150 1151
  bool overlapWithNeighbor = false;
  if (pNeighbor) {
    overlapWithNeighbor = overlapWithNeighborBlock(pBlock, pNeighbor, pReader->order);
1152
    taosMemoryFree(pNeighbor);
1153 1154
  }

1155
  // has duplicated ts of different version in this block
L
Liu Jicong 已提交
1156 1157
  bool hasDup = (pBlock->nSubBlock == 1) ? pBlock->hasDup : true;
  bool overlapWithDel = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);
1158

1159
  // todo here we need to each key in the last files to identify if it is really overlapped with last block
1160
  // todo
1161
  bool overlapWithlastBlock = false;
1162
#if 0
H
Hongze Cheng 已提交
1163
  if (taosArrayGetSize(pLastBlockReader->pSstBlk) > 0 && (pLastBlockReader->currentBlockIndex != -1)) {
H
Hongze Cheng 已提交
1164
    SSttBlk* pSstBlk = taosArrayGet(pLastBlockReader->pSstBlk, pLastBlockReader->currentBlockIndex);
H
Hongze Cheng 已提交
1165
    overlapWithlastBlock = !(pBlock->maxKey.ts < pSstBlk->minKey || pBlock->minKey.ts > pSstBlk->maxKey);
1166
  }
1167
#endif
1168

1169 1170 1171 1172 1173 1174 1175 1176 1177 1178
  bool moreThanOutputCapacity = pBlock->nRow > pReader->capacity;
  bool partiallyRequired = dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock);
  bool overlapWithKey = keyOverlapFileBlock(key, pBlock, &pReader->verRange);

  bool loadDataBlock = (overlapWithNeighbor || hasDup || partiallyRequired || overlapWithKey ||
                        moreThanOutputCapacity || overlapWithDel || overlapWithlastBlock);

  // log the reason why load the datablock for profile
  if (loadDataBlock) {
    tsdbDebug("%p uid:%" PRIu64
1179
              " need to load the datablock, overlapwithneighborblock:%d, hasDup:%d, partiallyRequired:%d, "
1180 1181 1182 1183 1184 1185
              "overlapWithKey:%d, greaterThanBuf:%d, overlapWithDel:%d, overlapWithlastBlock:%d, %s",
              pReader, pFBlock->uid, overlapWithNeighbor, hasDup, partiallyRequired, overlapWithKey,
              moreThanOutputCapacity, overlapWithDel, overlapWithlastBlock, pReader->idStr);
  }

  return loadDataBlock;
H
Haojun Liao 已提交
1186 1187
}

1188
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
1189
  if (!(pBlockScanInfo->iiter.hasVal || pBlockScanInfo->iter.hasVal)) {
1190 1191
    return TSDB_CODE_SUCCESS;
  }
H
Haojun Liao 已提交
1192

1193 1194 1195
  SSDataBlock* pBlock = pReader->pResBlock;

  int64_t st = taosGetTimestampUs();
1196
  int32_t code = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->capacity, pReader);
H
Haojun Liao 已提交
1197

1198
  blockDataUpdateTsWindow(pBlock, 0);
1199
  pBlock->info.uid = pBlockScanInfo->uid;
1200

1201
  setComposedBlockFlag(pReader, true);
1202

1203
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
S
Shengliang Guan 已提交
1204
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%d, brange:%" PRId64
1205 1206 1207
            " - %" PRId64 " %s",
            pReader, elapsedTime, pBlock->info.rows, pBlock->info.window.skey, pBlock->info.window.ekey,
            pReader->idStr);
1208 1209

  pReader->cost.buildmemBlock += elapsedTime;
H
Haojun Liao 已提交
1210 1211 1212
  return code;
}

1213 1214
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, int64_t key,
                                            SFileBlockDumpInfo* pDumpInfo) {
1215 1216 1217 1218 1219
  // opt version
  // 1. it is not a border point
  // 2. the direct next point is not an duplicated timestamp
  if ((pDumpInfo->rowIndex < pDumpInfo->totalRows - 1 && pReader->order == TSDB_ORDER_ASC) ||
      (pDumpInfo->rowIndex > 0 && pReader->order == TSDB_ORDER_DESC)) {
1220
    int32_t step = pReader->order == TSDB_ORDER_ASC ? 1 : -1;
1221 1222

    int64_t nextKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
1223
    if (nextKey != key) {  // merge is not needed
1224
      doAppendRowFromFileBlock(pReader->pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
1225 1226 1227 1228 1229 1230 1231 1232
      pDumpInfo->rowIndex += step;
      return true;
    }
  }

  return false;
}

H
Haojun Liao 已提交
1233 1234 1235 1236 1237 1238
static FORCE_INLINE STSchema* doGetSchemaForTSRow(int32_t sversion, STsdbReader* pReader, uint64_t uid) {
  // always set the newest schema version in pReader->pSchema
  if (pReader->pSchema == NULL) {
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, uid, -1);
  }

1239
  if (pReader->pSchema && sversion == pReader->pSchema->version) {
H
Haojun Liao 已提交
1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257
    return pReader->pSchema;
  }

  if (pReader->pMemSchema == NULL) {
    int32_t code =
        metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pMemSchema);
    return pReader->pMemSchema;
  }

  if (pReader->pMemSchema->version == sversion) {
    return pReader->pMemSchema;
  }

  taosMemoryFree(pReader->pMemSchema);
  int32_t code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pMemSchema);
  return pReader->pMemSchema;
}

1258
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
1259 1260 1261 1262 1263 1264
                                     SIterInfo* pIter, int64_t key, SLastBlockReader* pLastBlockReader) {
  SRowMerger          merge = {0};
  STSRow*             pTSRow = NULL;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

1265
  int64_t tsLast = INT64_MIN;
1266
  if (hasDataInLastBlock(pLastBlockReader)) {
1267 1268
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
1269

H
Hongze Cheng 已提交
1270 1271
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
1272

1273 1274
  int64_t minKey = 0;
  if (pReader->order == TSDB_ORDER_ASC) {
H
Hongze Cheng 已提交
1275
    minKey = INT64_MAX;  // chosen the minimum value
1276
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
1277 1278
      minKey = tsLast;
    }
1279

1280 1281 1282
    if (minKey > k.ts) {
      minKey = k.ts;
    }
1283

1284 1285 1286 1287 1288
    if (minKey > key && pBlockData->nRow > 0) {
      minKey = key;
    }
  } else {
    minKey = INT64_MIN;
1289
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
1290 1291 1292 1293 1294 1295 1296 1297 1298 1299
      minKey = tsLast;
    }

    if (minKey < k.ts) {
      minKey = k.ts;
    }

    if (minKey < key && pBlockData->nRow > 0) {
      minKey = key;
    }
1300 1301 1302 1303
  }

  bool init = false;

1304
  // ASC: file block ---> last block -----> imem -----> mem
H
Hongze Cheng 已提交
1305
  // DESC: mem -----> imem -----> last block -----> file block
1306 1307
  if (pReader->order == TSDB_ORDER_ASC) {
    if (minKey == key) {
1308
      init = true;
1309 1310
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1311 1312
    }

1313
    if (minKey == tsLast) {
1314
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1315 1316 1317
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1318 1319 1320
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
H
Haojun Liao 已提交
1321
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
1322
    }
1323

1324
    if (minKey == k.ts) {
H
Haojun Liao 已提交
1325 1326 1327
      if (init) {
        tRowMerge(&merge, pRow);
      } else {
1328 1329 1330 1331 1332 1333 1334 1335 1336
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, pRow, pSchema);
      }
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }
  } else {
    if (minKey == k.ts) {
      init = true;
1337 1338
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      tRowMergerInit(&merge, pRow, pSchema);
1339
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1340 1341
    }

1342
    if (minKey == tsLast) {
1343
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1344 1345 1346
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1347 1348 1349
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
H
Haojun Liao 已提交
1350
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
1351 1352 1353
    }

    if (minKey == key) {
H
Haojun Liao 已提交
1354 1355 1356
      if (init) {
        tRowMerge(&merge, &fRow);
      } else {
1357 1358 1359 1360 1361
        init = true;
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
      }
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
    }
1362 1363
  }

1364 1365 1366 1367 1368
  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

1369 1370 1371 1372 1373 1374 1375
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return TSDB_CODE_SUCCESS;
}

1376 1377 1378
static int32_t doMergeFileBlockAndLastBlock(SLastBlockReader* pLastBlockReader, STsdbReader* pReader,
                                            STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                            bool mergeBlockData) {
1379
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
S
Shengliang Guan 已提交
1380
  // SBlockData* pLastBlockData = &pLastBlockReader->lastBlockData;
H
Hongze Cheng 已提交
1381
  int64_t tsLastBlock = getCurrentKeyInLastBlock(pLastBlockReader);
1382 1383 1384 1385

  STSRow*    pTSRow = NULL;
  SRowMerger merge = {0};

1386
  TSDBROW fRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
1387 1388 1389 1390
  tRowMergerInit(&merge, &fRow, pReader->pSchema);
  doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, &merge);

  // merge with block data if ts == key
1391
  if (mergeBlockData && (tsLastBlock == pBlockData->aTSKEY[pDumpInfo->rowIndex])) {
1392 1393 1394
    doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
  }

1395 1396 1397 1398 1399
  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

1400 1401 1402 1403 1404 1405 1406
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return TSDB_CODE_SUCCESS;
}

1407 1408
static int32_t mergeFileBlockAndLastBlock(STsdbReader* pReader, SLastBlockReader* pLastBlockReader, int64_t key,
                                          STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
1409 1410
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

1411 1412
  if (pBlockData->nRow > 0) {
    // no last block available, only data block exists
1413
    if (!hasDataInLastBlock(pLastBlockReader)) {
1414 1415 1416 1417 1418 1419 1420 1421 1422
      return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
    }

    // row in last file block
    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
    int64_t ts = getCurrentKeyInLastBlock(pLastBlockReader);
    ASSERT(ts >= key);

    if (ASCENDING_TRAVERSE(pReader->order)) {
1423
      if (key < ts) {  // imem, mem are all empty, file blocks (data blocks and last block) exist
1424 1425 1426 1427
        return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
      } else if (key == ts) {
        STSRow*    pTSRow = NULL;
        SRowMerger merge = {0};
1428

1429 1430
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1431 1432 1433 1434

        TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
        tRowMerge(&merge, &fRow1);

1435
        doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, ts, &merge);
1436

1437 1438 1439 1440 1441
        int32_t code = tRowMergerGetRow(&merge, &pTSRow);
        if (code != TSDB_CODE_SUCCESS) {
          return code;
        }

1442
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);
1443

1444 1445
        taosMemoryFree(pTSRow);
        tRowMergerClear(&merge);
1446
        return code;
1447
      } else {
1448 1449
        ASSERT(0);
        return TSDB_CODE_SUCCESS;
1450
      }
1451
    } else {  // desc order
1452
      return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, pBlockData, true);
1453
    }
1454
  } else {  // only last block exists
1455
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, NULL, false);
H
Haojun Liao 已提交
1456
  }
1457 1458
}

1459 1460
static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                     SLastBlockReader* pLastBlockReader) {
1461 1462 1463 1464 1465 1466
  SRowMerger merge = {0};
  STSRow*    pTSRow = NULL;

  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SArray*             pDelList = pBlockScanInfo->delSkyline;

1467 1468
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);
1469 1470
  ASSERT(pRow != NULL && piRow != NULL);

1471
  int64_t tsLast = INT64_MIN;
1472 1473 1474
  if (hasDataInLastBlock(pLastBlockReader)) {
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
1475 1476 1477 1478 1479 1480

  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

1481
  int64_t minKey = 0;
1482 1483 1484 1485 1486
  if (ASCENDING_TRAVERSE(pReader->order)) {
    minKey = INT64_MAX;  // let's find the minimum
    if (minKey > k.ts) {
      minKey = k.ts;
    }
1487

1488 1489 1490
    if (minKey > ik.ts) {
      minKey = ik.ts;
    }
1491

1492 1493 1494
    if (minKey > key && pBlockData->nRow > 0) {
      minKey = key;
    }
1495

1496
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
1497 1498 1499
      minKey = tsLast;
    }
  } else {
H
Hongze Cheng 已提交
1500
    minKey = INT64_MIN;  // let find the maximum ts value
1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512
    if (minKey < k.ts) {
      minKey = k.ts;
    }

    if (minKey < ik.ts) {
      minKey = ik.ts;
    }

    if (minKey < key && pBlockData->nRow > 0) {
      minKey = key;
    }

1513
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
1514 1515
      minKey = tsLast;
    }
1516 1517 1518 1519
  }

  bool init = false;

1520 1521 1522 1523
  // ASC: file block -----> last block -----> imem -----> mem
  // DESC: mem -----> imem -----> last block -----> file block
  if (ASCENDING_TRAVERSE(pReader->order)) {
    if (minKey == key) {
1524
      init = true;
1525 1526 1527
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1528 1529
    }

1530
    if (minKey == tsLast) {
1531
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1532 1533 1534
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1535 1536 1537
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
H
Haojun Liao 已提交
1538
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
1539 1540 1541
    }

    if (minKey == ik.ts) {
H
Haojun Liao 已提交
1542 1543 1544
      if (init) {
        tRowMerge(&merge, piRow);
      } else {
1545 1546 1547 1548 1549
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, piRow, pSchema);
      }
      doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1550 1551
    }

1552
    if (minKey == k.ts) {
H
Haojun Liao 已提交
1553 1554 1555
      if (init) {
        tRowMerge(&merge, pRow);
      } else {
1556 1557 1558 1559 1560 1561 1562 1563
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, pRow, pSchema);
      }
      doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }
  } else {
    if (minKey == k.ts) {
      init = true;
1564 1565
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      tRowMergerInit(&merge, pRow, pSchema);
1566 1567 1568 1569
      doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }

    if (minKey == ik.ts) {
H
Haojun Liao 已提交
1570 1571 1572
      if (init) {
        tRowMerge(&merge, piRow);
      } else {
1573 1574 1575 1576 1577 1578 1579 1580
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, piRow, pSchema);
      }
      doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }

    if (minKey == tsLast) {
1581
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1582 1583 1584
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1585 1586 1587
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
H
Haojun Liao 已提交
1588
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
1589 1590 1591
    }

    if (minKey == key) {
H
Haojun Liao 已提交
1592
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
1593 1594
      if (!init) {
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
H
Haojun Liao 已提交
1595 1596
      } else {
        tRowMerge(&merge, &fRow);
1597 1598
      }
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1599 1600 1601
    }
  }

1602 1603 1604 1605 1606
  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

1607 1608 1609 1610
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
1611
  return code;
1612 1613
}

1614
#if 0
1615
static int32_t doMergeThreeLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
1616 1617 1618
  SRowMerger merge = {0};
  STSRow*    pTSRow = NULL;

1619
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
dengyihao's avatar
dengyihao 已提交
1620
  SArray*             pDelList = pBlockScanInfo->delSkyline;
1621

1622 1623
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);
1624
  ASSERT(pRow != NULL && piRow != NULL);
H
Haojun Liao 已提交
1625

1626
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
1627
  bool    freeTSRow = false;
H
Haojun Liao 已提交
1628

1629
  uint64_t uid = pBlockScanInfo->uid;
H
Haojun Liao 已提交
1630

1631 1632 1633
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);
  if (ASCENDING_TRAVERSE(pReader->order)) {
1634 1635
    // [1&2] key <= [k.ts && ik.ts]
    if (key <= k.ts && key <= ik.ts) {
1636 1637 1638
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMergerInit(&merge, &fRow, pReader->pSchema);

1639
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
H
Haojun Liao 已提交
1640

1641 1642
      if (ik.ts == key) {
        tRowMerge(&merge, piRow);
1643
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1644 1645
      }

1646 1647
      if (k.ts == key) {
        tRowMerge(&merge, pRow);
1648
        doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1649 1650 1651
      }

      tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1652
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1653
      return TSDB_CODE_SUCCESS;
1654
    } else {  // key > ik.ts || key > k.ts
1655 1656
      ASSERT(key != ik.ts);

1657
      // [3] ik.ts < key <= k.ts
1658
      // [4] ik.ts < k.ts <= key
1659
      if (ik.ts < k.ts) {
1660
        doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader, &freeTSRow);
H
Haojun Liao 已提交
1661
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1662 1663 1664
        if (freeTSRow) {
          taosMemoryFree(pTSRow);
        }
1665 1666 1667
        return TSDB_CODE_SUCCESS;
      }

1668 1669
      // [5] k.ts < key   <= ik.ts
      // [6] k.ts < ik.ts <= key
1670
      if (k.ts < ik.ts) {
1671
        doMergeMemTableMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, &pTSRow, pReader, &freeTSRow);
H
Haojun Liao 已提交
1672
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1673 1674 1675
        if (freeTSRow) {
          taosMemoryFree(pTSRow);
        }
1676 1677 1678
        return TSDB_CODE_SUCCESS;
      }

1679
      // [7] k.ts == ik.ts < key
1680
      if (k.ts == ik.ts) {
1681 1682
        ASSERT(key > ik.ts && key > k.ts);

1683
        doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, &pTSRow);
H
Haojun Liao 已提交
1684
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1685
        taosMemoryFree(pTSRow);
1686 1687 1688
        return TSDB_CODE_SUCCESS;
      }
    }
1689 1690 1691
  } else {  // descending order scan
    // [1/2] k.ts >= ik.ts && k.ts >= key
    if (k.ts >= ik.ts && k.ts >= key) {
H
Haojun Liao 已提交
1692
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
1693

H
Haojun Liao 已提交
1694
      tRowMergerInit(&merge, pRow, pSchema);
1695
      doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1696 1697 1698

      if (ik.ts == k.ts) {
        tRowMerge(&merge, piRow);
1699
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1700 1701 1702 1703 1704 1705 1706 1707 1708
      }

      if (k.ts == key) {
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMerge(&merge, &fRow);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      }

      tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1709
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1710 1711
      return TSDB_CODE_SUCCESS;
    } else {
1712
      ASSERT(ik.ts != k.ts);  // this case has been included in the previous if branch
1713 1714 1715 1716

      // [3] ik.ts > k.ts >= Key
      // [4] ik.ts > key >= k.ts
      if (ik.ts > key) {
1717
        doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader, &freeTSRow);
H
Haojun Liao 已提交
1718
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1719 1720 1721
        if (freeTSRow) {
          taosMemoryFree(pTSRow);
        }
1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732
        return TSDB_CODE_SUCCESS;
      }

      // [5] key > ik.ts > k.ts
      // [6] key > k.ts > ik.ts
      if (key > ik.ts) {
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMergerInit(&merge, &fRow, pReader->pSchema);

        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1733
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1734
        taosMemoryFree(pTSRow);
1735 1736 1737 1738 1739
        return TSDB_CODE_SUCCESS;
      }

      //[7] key = ik.ts > k.ts
      if (key == ik.ts) {
1740
        doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader, &freeTSRow);
1741 1742 1743 1744 1745

        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMerge(&merge, &fRow);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1746
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1747 1748

        taosMemoryFree(pTSRow);
1749 1750 1751 1752 1753 1754
        return TSDB_CODE_SUCCESS;
      }
    }
  }

  ASSERT(0);
S
Shengliang Guan 已提交
1755
  return -1;
1756
}
1757
#endif
1758

1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  TSDBKEY startKey = {0};
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = (TSDBKEY){.ts = pReader->window.skey, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pReader->window.ekey, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!ASCENDING_TRAVERSE(pReader->order));

  STbData* d = NULL;
  if (pReader->pReadSnap->pMem != NULL) {
    d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
    if (d != NULL) {
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);

        tsdbDebug("%p uid:%" PRId64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
H
Hongze Cheng 已提交
1784
                  "-%" PRId64 " %s",
1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRId64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRId64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  STbData* di = NULL;
  if (pReader->pReadSnap->pIMem != NULL) {
    di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
    if (di != NULL) {
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);

        tsdbDebug("%p uid:%" PRId64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
H
Hongze Cheng 已提交
1805
                  "-%" PRId64 " %s",
1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRId64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRId64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  initDelSkylineIterator(pBlockScanInfo, pReader, d, di);

  pBlockScanInfo->iterInit = true;
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
1823 1824
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
1825 1826 1827 1828 1829 1830 1831 1832
  // it is an multi-table data block
  if (pBlockData->aUid != NULL) {
    uint64_t uid = pBlockData->aUid[pDumpInfo->rowIndex];
    if (uid != pBlockScanInfo->uid) {  // move to next row
      return false;
    }
  }

1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843
  // check for version and time range
  int64_t ver = pBlockData->aVersion[pDumpInfo->rowIndex];
  if (ver > pReader->verRange.maxVer || ver < pReader->verRange.minVer) {
    return false;
  }

  int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  if (ts > pReader->window.ekey || ts < pReader->window.skey) {
    return false;
  }

1844
  TSDBKEY k = {.ts = ts, .version = ver};
1845
  if (hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, &k, pReader->order)) {
1846 1847 1848
    return false;
  }

1849 1850 1851
  return true;
}

1852
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) { return (ts > pWindow->ekey) || (ts < pWindow->skey); }
1853

1854
static bool nextRowFromLastBlocks(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pBlockScanInfo) {
H
Hongze Cheng 已提交
1855
  while (1) {
1856 1857 1858 1859
    bool hasVal = tMergeTreeNext(&pLastBlockReader->mergeTree);
    if (!hasVal) {
      return false;
    }
1860

1861 1862 1863 1864 1865
    TSDBROW row = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
    TSDBKEY k = TSDBROW_KEY(&row);
    if (!hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->lastBlockDelIndex, &k, pLastBlockReader->order)) {
      return true;
    }
1866
  }
1867 1868
}

1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881
static bool initLastBlockReader(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pBlockScanInfo,
                                STsdbReader* pReader) {
  // the last block reader has been initialized for this table.
  if (pLastBlockReader->uid == pBlockScanInfo->uid) {
    return true;
  }

  if (pLastBlockReader->uid != 0) {
    tMergeTreeClose(&pLastBlockReader->mergeTree);
  }

  initMemDataIterator(pBlockScanInfo, pReader);
  pLastBlockReader->uid = pBlockScanInfo->uid;
1882

H
Haojun Liao 已提交
1883
  int32_t step = ASCENDING_TRAVERSE(pLastBlockReader->order)? 1:-1;
1884 1885
  STimeWindow w = pLastBlockReader->window;
  if (ASCENDING_TRAVERSE(pLastBlockReader->order)) {
H
Haojun Liao 已提交
1886
    w.skey = pBlockScanInfo->lastKey + step;
1887
  } else {
H
Haojun Liao 已提交
1888
    w.ekey = pBlockScanInfo->lastKey + step;
1889 1890
  }

1891 1892
  int32_t code =
      tMergeTreeOpen(&pLastBlockReader->mergeTree, (pLastBlockReader->order == TSDB_ORDER_DESC), pReader->pFileReader,
1893
          pReader->suid, pBlockScanInfo->uid, &w, &pLastBlockReader->verRange);
1894 1895 1896 1897 1898 1899 1900
  if (code != TSDB_CODE_SUCCESS) {
    return false;
  }

  return nextRowFromLastBlocks(pLastBlockReader, pBlockScanInfo);
}

1901
static int64_t getCurrentKeyInLastBlock(SLastBlockReader* pLastBlockReader) {
1902 1903 1904
  TSDBROW row = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
  TSDBKEY key = TSDBROW_KEY(&row);
  return key.ts;
1905 1906
}

H
Hongze Cheng 已提交
1907
static bool hasDataInLastBlock(SLastBlockReader* pLastBlockReader) { return pLastBlockReader->mergeTree.pIter != NULL; }
1908

1909 1910
int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                              STsdbReader* pReader) {
1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);

  if (tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo)) {
    return TSDB_CODE_SUCCESS;
  } else {
    STSRow*    pTSRow = NULL;
    SRowMerger merge = {0};

    tRowMergerInit(&merge, &fRow, pReader->pSchema);
    doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1923 1924 1925 1926 1927
    int32_t code = tRowMergerGetRow(&merge, &pTSRow);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

1928 1929 1930 1931 1932 1933
    doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

    taosMemoryFree(pTSRow);
    tRowMergerClear(&merge);
    return TSDB_CODE_SUCCESS;
  }
1934

1935 1936 1937
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
1938 1939
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo,
                                          SBlockData* pBlockData, SLastBlockReader* pLastBlockReader) {
1940 1941
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

H
Hongze Cheng 已提交
1942
  int64_t  key = (pBlockData->nRow > 0) ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN;
1943 1944
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
1945

1946
  if (pBlockScanInfo->iter.hasVal && pBlockScanInfo->iiter.hasVal) {
1947
    return doMergeMultiLevelRows(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
1948
  } else {
1949
    // imem + file + last block
1950
    if (pBlockScanInfo->iiter.hasVal) {
1951
      return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, key, pLastBlockReader);
1952 1953
    }

1954
    // mem + file + last block
1955
    if (pBlockScanInfo->iter.hasVal) {
1956
      return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, key, pLastBlockReader);
H
Haojun Liao 已提交
1957
    }
1958

1959 1960
    // files data blocks + last block
    return mergeFileBlockAndLastBlock(pReader, pLastBlockReader, key, pBlockScanInfo, pBlockData);
1961 1962 1963
  }
}

1964
static int32_t buildComposedDataBlock(STsdbReader* pReader) {
1965 1966
  SSDataBlock* pResBlock = pReader->pResBlock;

H
Hongze Cheng 已提交
1967
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
1968 1969 1970 1971 1972 1973 1974 1975

  STableBlockScanInfo* pBlockScanInfo = NULL;
  if (pBlockInfo != NULL) {
    pBlockScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
  } else {
    pBlockScanInfo = pReader->status.pTableIter;
  }

H
Haojun Liao 已提交
1976
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
1977
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
1978 1979
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  int32_t             step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
1980

1981 1982
  int64_t st = taosGetTimestampUs();

1983
  while (1) {
1984
    // todo check the validate of row in file block
1985
    bool hasBlockData = false;
1986
    {
H
Haojun Liao 已提交
1987
      while (pBlockData->nRow > 0) {  // find the first qualified row in data block
1988 1989 1990 1991 1992
        if (isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
          hasBlockData = true;
          break;
        }

1993 1994
        pDumpInfo->rowIndex += step;

H
Hongze Cheng 已提交
1995
        SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
1996
        if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
1997
          setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
1998 1999 2000
          break;
        }
      }
2001
    }
2002

2003
    bool hasBlockLData = hasDataInLastBlock(pLastBlockReader);
2004

2005 2006 2007
    // no data in last block and block, no need to proceed.
    if ((hasBlockData == false) && (hasBlockLData == false)) {
      break;
2008 2009
    }

2010
    buildComposedDataBlockImpl(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
2011

2012
    // currently loaded file data block is consumed
2013
    if ((pBlockData->nRow > 0) && (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0)) {
H
Hongze Cheng 已提交
2014
      SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
2015
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
2016 2017 2018 2019 2020
      break;
    }

    if (pResBlock->info.rows >= pReader->capacity) {
      break;
2021 2022 2023 2024
    }
  }

  pResBlock->info.uid = pBlockScanInfo->uid;
2025 2026
  blockDataUpdateTsWindow(pResBlock, 0);

2027
  setComposedBlockFlag(pReader, true);
2028
  int64_t et = taosGetTimestampUs();
2029

2030 2031 2032 2033 2034 2035
  if (pResBlock->info.rows > 0) {
    tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64
              " rows:%d, elapsed time:%.2f ms %s",
              pReader, pBlockScanInfo->uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
              pResBlock->info.rows, (et - st) / 1000.0, pReader->idStr);
  }
2036

2037 2038 2039 2040 2041
  return TSDB_CODE_SUCCESS;
}

void setComposedBlockFlag(STsdbReader* pReader, bool composed) { pReader->status.composedDataBlock = composed; }

dengyihao's avatar
dengyihao 已提交
2042 2043
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
2044 2045 2046
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }
2047

2048 2049 2050
  int32_t code = 0;
  STsdb*  pTsdb = pReader->pTsdb;

2051 2052
  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));

H
Hongze Cheng 已提交
2053
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
2054 2055
  if (pDelFile) {
    SDelFReader* pDelFReader = NULL;
H
more  
Hongze Cheng 已提交
2056
    code = tsdbDelFReaderOpen(&pDelFReader, pDelFile, pTsdb);
2057
    if (code != TSDB_CODE_SUCCESS) {
2058 2059 2060 2061 2062
      goto _err;
    }

    SArray* aDelIdx = taosArrayInit(4, sizeof(SDelIdx));
    if (aDelIdx == NULL) {
2063
      tsdbDelFReaderClose(&pDelFReader);
2064 2065 2066
      goto _err;
    }

H
Hongze Cheng 已提交
2067
    code = tsdbReadDelIdx(pDelFReader, aDelIdx);
2068 2069 2070
    if (code != TSDB_CODE_SUCCESS) {
      taosArrayDestroy(aDelIdx);
      tsdbDelFReaderClose(&pDelFReader);
2071 2072
      goto _err;
    }
2073

2074 2075 2076
    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
    SDelIdx* pIdx = taosArraySearch(aDelIdx, &idx, tCmprDelIdx, TD_EQ);

H
Haojun Liao 已提交
2077
    if (pIdx != NULL) {
H
Hongze Cheng 已提交
2078
      code = tsdbReadDelData(pDelFReader, pIdx, pDelData);
2079 2080 2081 2082 2083 2084 2085
    }

    taosArrayDestroy(aDelIdx);
    tsdbDelFReaderClose(&pDelFReader);

    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
2086
    }
2087
  }
2088

2089 2090 2091 2092 2093 2094 2095
  SDelData* p = NULL;
  if (pMemTbData != NULL) {
    p = pMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
2096 2097
  }

2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111
  if (piMemTbData != NULL) {
    p = piMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
  }

  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);
dengyihao's avatar
dengyihao 已提交
2112 2113
  pBlockScanInfo->iter.index =
      ASCENDING_TRAVERSE(pReader->order) ? 0 : taosArrayGetSize(pBlockScanInfo->delSkyline) - 1;
2114 2115
  pBlockScanInfo->iiter.index = pBlockScanInfo->iter.index;
  pBlockScanInfo->fileDelIndex = pBlockScanInfo->iter.index;
2116
  pBlockScanInfo->lastBlockDelIndex = pBlockScanInfo->iter.index;
2117 2118
  return code;

2119 2120 2121
_err:
  taosArrayDestroy(pDelData);
  return code;
2122 2123
}

2124
static TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
H
Hongze Cheng 已提交
2125
  TSDBKEY  key = {.ts = TSKEY_INITIAL_VAL};
2126
  TSDBROW* pRow = getValidMemRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
2127
  if (pRow != NULL) {
2128 2129 2130
    key = TSDBROW_KEY(pRow);
  }

2131
  pRow = getValidMemRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
2132
  if (pRow != NULL) {
2133 2134 2135 2136 2137 2138 2139 2140 2141
    TSDBKEY k = TSDBROW_KEY(pRow);
    if (key.ts > k.ts) {
      key = k;
    }
  }

  return key;
}

2142
static int32_t moveToNextFile(STsdbReader* pReader, SBlockNumber* pBlockNum) {
H
Haojun Liao 已提交
2143
  SReaderStatus* pStatus = &pReader->status;
2144
  pBlockNum->numOfBlocks = 0;
2145
  pBlockNum->numOfLastFiles = 0;
2146

2147
  size_t  numOfTables = taosHashGetSize(pReader->status.pTableMap);
2148
  SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBlockIdx));
H
Haojun Liao 已提交
2149 2150

  while (1) {
2151
    bool hasNext = filesetIteratorNext(&pStatus->fileIter, pReader);
2152
    if (!hasNext) {  // no data files on disk
H
Haojun Liao 已提交
2153 2154 2155
      break;
    }

H
Haojun Liao 已提交
2156
    taosArrayClear(pIndexList);
H
Haojun Liao 已提交
2157 2158
    int32_t code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
    if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
2159
      taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
2160 2161 2162
      return code;
    }

H
Hongze Cheng 已提交
2163
    if (taosArrayGetSize(pIndexList) > 0 || pReader->pFileReader->pSet->nSttF > 0) {
2164
      code = doLoadFileBlock(pReader, pIndexList, pBlockNum);
H
Haojun Liao 已提交
2165
      if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
2166
        taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
2167 2168 2169
        return code;
      }

2170
      if (pBlockNum->numOfBlocks + pBlockNum->numOfLastFiles > 0) {
H
Haojun Liao 已提交
2171 2172 2173
        break;
      }
    }
2174

H
Haojun Liao 已提交
2175 2176 2177
    // no blocks in current file, try next files
  }

H
Haojun Liao 已提交
2178
  taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
2179 2180 2181
  return TSDB_CODE_SUCCESS;
}

2182
static int32_t uidComparFunc(const void* p1, const void* p2) {
2183 2184
  uint64_t pu1 = *(uint64_t*)p1;
  uint64_t pu2 = *(uint64_t*)p2;
2185 2186 2187
  if (pu1 == pu2) {
    return 0;
  } else {
2188
    return (pu1 < pu2) ? -1 : 1;
2189 2190
  }
}
2191

2192
static void extractOrderedTableUidList(SUidOrderCheckInfo* pOrderCheckInfo, SReaderStatus* pStatus) {
2193 2194 2195 2196
  int32_t index = 0;
  int32_t total = taosHashGetSize(pStatus->pTableMap);

  void* p = taosHashIterate(pStatus->pTableMap, NULL);
2197
  while (p != NULL) {
2198 2199 2200 2201 2202 2203 2204 2205
    STableBlockScanInfo* pScanInfo = p;
    pOrderCheckInfo->tableUidList[index++] = pScanInfo->uid;
    p = taosHashIterate(pStatus->pTableMap, p);
  }

  taosSort(pOrderCheckInfo->tableUidList, total, sizeof(uint64_t), uidComparFunc);
}

2206
static int32_t initOrderCheckInfo(SUidOrderCheckInfo* pOrderCheckInfo, SReaderStatus* pStatus) {
2207 2208 2209 2210
  int32_t total = taosHashGetSize(pStatus->pTableMap);
  if (total == 0) {
    return TSDB_CODE_SUCCESS;
  }
2211

2212
  if (pOrderCheckInfo->tableUidList == NULL) {
2213 2214 2215 2216 2217 2218
    pOrderCheckInfo->currentIndex = 0;
    pOrderCheckInfo->tableUidList = taosMemoryMalloc(total * sizeof(uint64_t));
    if (pOrderCheckInfo->tableUidList == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

2219
    extractOrderedTableUidList(pOrderCheckInfo, pStatus);
2220 2221 2222
    uint64_t uid = pOrderCheckInfo->tableUidList[0];
    pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
  } else {
2223 2224
    if (pStatus->pTableIter == NULL) {  // it is the last block of a new file
      pOrderCheckInfo->currentIndex = 0;
2225 2226
      uint64_t uid = pOrderCheckInfo->tableUidList[pOrderCheckInfo->currentIndex];
      pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
2227 2228

      // the tableMap has already updated
2229
      if (pStatus->pTableIter == NULL) {
2230
        void* p = taosMemoryRealloc(pOrderCheckInfo->tableUidList, total * sizeof(uint64_t));
2231 2232 2233 2234 2235 2236 2237 2238 2239
        if (p == NULL) {
          return TSDB_CODE_OUT_OF_MEMORY;
        }

        pOrderCheckInfo->tableUidList = p;
        extractOrderedTableUidList(pOrderCheckInfo, pStatus);

        uid = pOrderCheckInfo->tableUidList[0];
        pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
2240
      }
2241
    }
2242
  }
2243

2244 2245 2246
  return TSDB_CODE_SUCCESS;
}

2247
static bool moveToNextTable(SUidOrderCheckInfo* pOrderedCheckInfo, SReaderStatus* pStatus) {
2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259
  pOrderedCheckInfo->currentIndex += 1;
  if (pOrderedCheckInfo->currentIndex >= taosHashGetSize(pStatus->pTableMap)) {
    pStatus->pTableIter = NULL;
    return false;
  }

  uint64_t uid = pOrderedCheckInfo->tableUidList[pOrderedCheckInfo->currentIndex];
  pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
  ASSERT(pStatus->pTableIter != NULL);
  return true;
}

2260
static int32_t doLoadLastBlockSequentially(STsdbReader* pReader) {
2261
  SReaderStatus*    pStatus = &pReader->status;
2262 2263
  SLastBlockReader* pLastBlockReader = pStatus->fileIter.pLastBlockReader;

2264 2265
  SUidOrderCheckInfo* pOrderedCheckInfo = &pStatus->uidCheckInfo;
  int32_t             code = initOrderCheckInfo(pOrderedCheckInfo, pStatus);
2266
  if (code != TSDB_CODE_SUCCESS || (taosHashGetSize(pStatus->pTableMap) == 0)) {
2267 2268
    return code;
  }
2269

2270
  while (1) {
2271 2272
    // load the last data block of current table
    STableBlockScanInfo* pScanInfo = pStatus->pTableIter;
H
Hongze Cheng 已提交
2273
    bool                 hasVal = initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
2274
    if (!hasVal) {
2275 2276
      bool hasNexTable = moveToNextTable(pOrderedCheckInfo, pStatus);
      if (!hasNexTable) {
2277 2278 2279
        return TSDB_CODE_SUCCESS;
      }
      continue;
2280 2281
    }

2282 2283 2284 2285 2286 2287 2288 2289
    code = doBuildDataBlock(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }
2290

2291
    // current table is exhausted, let's try next table
2292 2293
    bool hasNexTable = moveToNextTable(pOrderedCheckInfo, pStatus);
    if (!hasNexTable) {
2294 2295
      return TSDB_CODE_SUCCESS;
    }
2296 2297 2298
  }
}

2299
static int32_t doBuildDataBlock(STsdbReader* pReader) {
H
Hongze Cheng 已提交
2300 2301
  int32_t   code = TSDB_CODE_SUCCESS;
  SDataBlk* pBlock = NULL;
2302 2303 2304

  SReaderStatus*       pStatus = &pReader->status;
  SDataBlockIter*      pBlockIter = &pStatus->blockIter;
2305 2306 2307
  STableBlockScanInfo* pScanInfo = NULL;
  SFileDataBlockInfo*  pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SLastBlockReader*    pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
2308

2309
  if (pBlockInfo != NULL) {
2310 2311 2312 2313 2314 2315
    pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
  } else {
    pScanInfo = pReader->status.pTableIter;
  }

  if (pBlockInfo != NULL) {
2316
    pBlock = getCurrentBlock(pBlockIter);
2317 2318
  }

2319
  initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
2320
  TSDBKEY key = getCurrentKeyInBuf(pScanInfo, pReader);
2321

2322 2323 2324 2325 2326 2327 2328
  if (pBlockInfo == NULL) {  // build data block from last data file
    ASSERT(pBlockIter->numOfBlocks == 0);
    code = buildComposedDataBlock(pReader);
  } else if (fileBlockShouldLoad(pReader, pBlockInfo, pBlock, pScanInfo, key, pLastBlockReader)) {
    tBlockDataReset(&pStatus->fileBlockData);
    code = tBlockDataInit(&pStatus->fileBlockData, pReader->suid, pScanInfo->uid, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
2329
      return code;
2330
    }
2331

2332 2333 2334
    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
2335 2336 2337
    }

    // build composed data block
2338
    code = buildComposedDataBlock(pReader);
2339 2340
  } else if (bufferDataInFileBlockGap(pReader->order, key, pBlock)) {
    // data in memory that are earlier than current file block
2341
    // todo rows in buffer should be less than the file block in asc, greater than file block in desc
2342
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? pBlock->minKey.ts : pBlock->maxKey.ts;
2343
    code = buildDataBlockFromBuf(pReader, pScanInfo, endKey);
2344 2345 2346 2347
  } else {
    if (hasDataInLastBlock(pLastBlockReader) && !ASCENDING_TRAVERSE(pReader->order)) {
      // only return the rows in last block
      int64_t tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
H
Hongze Cheng 已提交
2348
      ASSERT(tsLast >= pBlock->maxKey.ts);
2349 2350 2351
      tBlockDataReset(&pReader->status.fileBlockData);

      code = buildComposedDataBlock(pReader);
H
Hongze Cheng 已提交
2352
    } else {  // whole block is required, return it directly
2353 2354 2355 2356 2357 2358 2359
      SDataBlockInfo* pInfo = &pReader->pResBlock->info;
      pInfo->rows = pBlock->nRow;
      pInfo->uid = pScanInfo->uid;
      pInfo->window = (STimeWindow){.skey = pBlock->minKey.ts, .ekey = pBlock->maxKey.ts};
      setComposedBlockFlag(pReader, false);
      setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);
    }
2360 2361 2362 2363 2364
  }

  return code;
}

H
Haojun Liao 已提交
2365
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
2366 2367
  SReaderStatus* pStatus = &pReader->status;

2368
  while (1) {
2369 2370 2371
    if (pStatus->pTableIter == NULL) {
      pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
      if (pStatus->pTableIter == NULL) {
H
Haojun Liao 已提交
2372
        return TSDB_CODE_SUCCESS;
2373 2374 2375 2376
      }
    }

    STableBlockScanInfo* pBlockScanInfo = pStatus->pTableIter;
2377
    initMemDataIterator(pBlockScanInfo, pReader);
2378

2379
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? INT64_MAX : INT64_MIN;
2380
    int32_t code = buildDataBlockFromBuf(pReader, pBlockScanInfo, endKey);
H
Haojun Liao 已提交
2381 2382 2383 2384
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2385
    if (pReader->pResBlock->info.rows > 0) {
H
Haojun Liao 已提交
2386
      return TSDB_CODE_SUCCESS;
2387 2388 2389 2390 2391
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
    if (pStatus->pTableIter == NULL) {
H
Haojun Liao 已提交
2392
      return TSDB_CODE_SUCCESS;
2393 2394 2395 2396
    }
  }
}

2397
// set the correct start position in case of the first/last file block, according to the query time window
2398
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
H
Hongze Cheng 已提交
2399
  SDataBlk* pBlock = getCurrentBlock(pBlockIter);
2400

2401 2402 2403
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
2404 2405 2406

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
2407
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
2408 2409
}

2410
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
2411 2412
  SBlockNumber num = {0};

2413
  int32_t code = moveToNextFile(pReader, &num);
2414 2415 2416 2417 2418
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // all data files are consumed, try data in buffer
2419
  if (num.numOfBlocks + num.numOfLastFiles == 0) {
2420 2421 2422 2423 2424
    pReader->status.loadFromFile = false;
    return code;
  }

  // initialize the block iterator for a new fileset
2425 2426
  if (num.numOfBlocks > 0) {
    code = initBlockIterator(pReader, pBlockIter, num.numOfBlocks);
H
Hongze Cheng 已提交
2427
  } else {  // no block data, only last block exists
2428
    tBlockDataReset(&pReader->status.fileBlockData);
2429
    resetDataBlockIterator(pBlockIter, pReader->order);
2430
  }
2431 2432

  // set the correct start position according to the query time window
2433
  initBlockDumpInfo(pReader, pBlockIter);
2434 2435 2436
  return code;
}

2437
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
2438 2439
  return (!pDumpInfo->allDumped) &&
         ((pDumpInfo->rowIndex > 0 && asc) || (pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1) && (!asc)));
2440 2441
}

2442
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
H
Haojun Liao 已提交
2443
  int32_t code = TSDB_CODE_SUCCESS;
2444 2445
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

2446 2447
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

2448
  if (pBlockIter->numOfBlocks == 0) {
H
Hongze Cheng 已提交
2449
  _begin:
2450 2451 2452 2453 2454
    code = doLoadLastBlockSequentially(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2455 2456 2457 2458
    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }

2459
    // all data blocks are checked in this last block file, now let's try the next file
2460 2461 2462 2463 2464 2465 2466 2467
    if (pReader->status.pTableIter == NULL) {
      code = initForFirstBlockInFile(pReader, pBlockIter);

      // error happens or all the data files are completely checked
      if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
        return code;
      }

2468
      // this file does not have data files, let's start check the last block file if exists
2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483
      if (pBlockIter->numOfBlocks == 0) {
        goto _begin;
      }
    }

    code = doBuildDataBlock(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }
  }

2484
  while (1) {
2485 2486
    SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

2487
    if (fileBlockPartiallyRead(pDumpInfo, asc)) {  // file data block is partially loaded
2488
      code = buildComposedDataBlock(pReader);
2489 2490 2491 2492 2493 2494 2495
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        // try next data block in current file
        bool hasNext = blockIteratorNext(&pReader->status.blockIter);
        if (hasNext) {  // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
2496
        } else {
H
Haojun Liao 已提交
2497
          if (pReader->status.pCurrentFileset->nSttF > 0) {
2498 2499 2500 2501 2502 2503
            // data blocks in current file are exhausted, let's try the next file now
            tBlockDataReset(&pReader->status.fileBlockData);
            resetDataBlockIterator(pBlockIter, pReader->order);
            goto _begin;
          } else {
            code = initForFirstBlockInFile(pReader, pBlockIter);
2504

2505 2506 2507 2508
            // error happens or all the data files are completely checked
            if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
              return code;
            }
2509

2510 2511 2512 2513
            // this file does not have blocks, let's start check the last block file
            if (pBlockIter->numOfBlocks == 0) {
              goto _begin;
            }
2514
          }
2515
        }
H
Haojun Liao 已提交
2516
      }
2517 2518

      code = doBuildDataBlock(pReader);
2519 2520
    }

2521 2522 2523 2524 2525 2526 2527 2528
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }
  }
2529
}
H
refact  
Hongze Cheng 已提交
2530

2531 2532
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
2533
  if (VND_IS_RSMA(pVnode)) {
2534
    int8_t  level = 0;
2535 2536
    int64_t now = taosGetTimestamp(pVnode->config.tsdbCfg.precision);

2537
    for (int8_t i = 0; i < TSDB_RETENTION_MAX; ++i) {
2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550
      SRetention* pRetention = retentions + level;
      if (pRetention->keep <= 0) {
        if (level > 0) {
          --level;
        }
        break;
      }
      if ((now - pRetention->keep) <= winSKey) {
        break;
      }
      ++level;
    }

2551
    const char* str = (idStr != NULL) ? idStr : "";
2552 2553

    if (level == TSDB_RETENTION_L0) {
2554
      *pLevel = TSDB_RETENTION_L0;
C
Cary Xu 已提交
2555
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L0, str);
2556 2557
      return VND_RSMA0(pVnode);
    } else if (level == TSDB_RETENTION_L1) {
2558
      *pLevel = TSDB_RETENTION_L1;
C
Cary Xu 已提交
2559
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L1, str);
2560 2561
      return VND_RSMA1(pVnode);
    } else {
2562
      *pLevel = TSDB_RETENTION_L2;
C
Cary Xu 已提交
2563
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L2, str);
2564 2565 2566 2567 2568 2569 2570
      return VND_RSMA2(pVnode);
    }
  }

  return VND_TSDB(pVnode);
}

H
Haojun Liao 已提交
2571
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
L
Liu Jicong 已提交
2572
  int64_t startVer = (pCond->startVersion == -1) ? 0 : pCond->startVersion;
H
Haojun Liao 已提交
2573 2574

  int64_t endVer = 0;
L
Liu Jicong 已提交
2575 2576
  if (pCond->endVersion ==
      -1) {  // user not specified end version, set current maximum version of vnode as the endVersion
H
Haojun Liao 已提交
2577 2578
    endVer = pVnode->state.applied;
  } else {
L
Liu Jicong 已提交
2579
    endVer = (pCond->endVersion > pVnode->state.applied) ? pVnode->state.applied : pCond->endVersion;
2580 2581
  }

H
Haojun Liao 已提交
2582
  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
2583 2584
}

2585
bool hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order) {
2586 2587 2588 2589
  ASSERT(pKey != NULL);
  if (pDelList == NULL) {
    return false;
  }
L
Liu Jicong 已提交
2590 2591 2592
  size_t  num = taosArrayGetSize(pDelList);
  bool    asc = ASCENDING_TRAVERSE(order);
  int32_t step = asc ? 1 : -1;
2593

2594 2595 2596 2597 2598 2599
  if (asc) {
    if (*index >= num - 1) {
      TSDBKEY* last = taosArrayGetLast(pDelList);
      ASSERT(pKey->ts >= last->ts);

      if (pKey->ts > last->ts) {
2600
        return false;
2601 2602 2603
      } else if (pKey->ts == last->ts) {
        TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
        return (prev->version >= pKey->version);
2604 2605
      }
    } else {
2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);

      if (pKey->ts < pCurrent->ts) {
        return false;
      }

      if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version) {
        return true;
      }

      while (pNext->ts <= pKey->ts && (*index) < num - 1) {
        (*index) += 1;

        if ((*index) < num - 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pNext = taosArrayGet(pDelList, (*index) + 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version == 0 && pNext->version > 0) {
            continue;
          }

          if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version) {
            return true;
          }
        }
      }

      return false;
2636 2637
    }
  } else {
2638 2639
    if (*index <= 0) {
      TSDBKEY* pFirst = taosArrayGet(pDelList, 0);
2640

2641 2642 2643 2644 2645 2646 2647
      if (pKey->ts < pFirst->ts) {
        return false;
      } else if (pKey->ts == pFirst->ts) {
        return pFirst->version >= pKey->version;
      } else {
        ASSERT(0);
      }
2648
    } else {
2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);

      if (pKey->ts > pCurrent->ts) {
        return false;
      }

      if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
        return true;
      }

      while (pPrev->ts >= pKey->ts && (*index) > 1) {
        (*index) += step;

        if ((*index) >= 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pPrev = taosArrayGet(pDelList, (*index) - 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version > 0 && pPrev->version == 0) {
            continue;
          }

          if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
            return true;
          }
        }
2676 2677 2678 2679 2680
      }

      return false;
    }
  }
2681 2682

  return false;
2683 2684
}

2685
TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
2686
  if (!pIter->hasVal) {
H
Haojun Liao 已提交
2687 2688
    return NULL;
  }
H
Hongze Cheng 已提交
2689

2690
  TSDBROW* pRow = tsdbTbDataIterGet(pIter->iter);
2691
  TSDBKEY  key = {.ts = pRow->pTSRow->ts, .version = pRow->version};
2692
  if (outOfTimeWindow(key.ts, &pReader->window)) {
2693
    pIter->hasVal = false;
H
Haojun Liao 已提交
2694 2695
    return NULL;
  }
H
Hongze Cheng 已提交
2696

2697
  // it is a valid data version
dengyihao's avatar
dengyihao 已提交
2698
  if ((key.version <= pReader->verRange.maxVer && key.version >= pReader->verRange.minVer) &&
2699
      (!hasBeenDropped(pDelList, &pIter->index, &key, pReader->order))) {
H
Haojun Liao 已提交
2700 2701
    return pRow;
  }
H
Hongze Cheng 已提交
2702

2703
  while (1) {
2704 2705
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
H
Haojun Liao 已提交
2706 2707
      return NULL;
    }
H
Hongze Cheng 已提交
2708

2709
    pRow = tsdbTbDataIterGet(pIter->iter);
H
Hongze Cheng 已提交
2710

H
Haojun Liao 已提交
2711
    key = TSDBROW_KEY(pRow);
2712
    if (outOfTimeWindow(key.ts, &pReader->window)) {
2713
      pIter->hasVal = false;
H
Haojun Liao 已提交
2714 2715
      return NULL;
    }
H
Hongze Cheng 已提交
2716

dengyihao's avatar
dengyihao 已提交
2717
    if (key.version <= pReader->verRange.maxVer && key.version >= pReader->verRange.minVer &&
2718
        (!hasBeenDropped(pDelList, &pIter->index, &key, pReader->order))) {
H
Haojun Liao 已提交
2719 2720 2721 2722
      return pRow;
    }
  }
}
H
Hongze Cheng 已提交
2723

2724 2725
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
                         STsdbReader* pReader) {
H
Haojun Liao 已提交
2726
  while (1) {
2727 2728
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
H
Haojun Liao 已提交
2729 2730
      break;
    }
H
Hongze Cheng 已提交
2731

2732
    // data exists but not valid
2733
    TSDBROW* pRow = getValidMemRow(pIter, pDelList, pReader);
2734 2735 2736 2737 2738
    if (pRow == NULL) {
      break;
    }

    // ts is not identical, quit
H
Haojun Liao 已提交
2739
    TSDBKEY k = TSDBROW_KEY(pRow);
2740
    if (k.ts != ts) {
H
Haojun Liao 已提交
2741 2742 2743
      break;
    }

H
Haojun Liao 已提交
2744
    STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, uid);
2745
    tRowMergerAdd(pMerger, pRow, pTSchema);
H
Haojun Liao 已提交
2746 2747 2748 2749 2750
  }

  return TSDB_CODE_SUCCESS;
}

2751
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
2752
                                          SVersionRange* pVerRange, int32_t step) {
2753 2754
  while (pBlockData->aTSKEY[rowIndex] == key && rowIndex < pBlockData->nRow && rowIndex >= 0) {
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
2755
      rowIndex += step;
2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
    tRowMerge(pMerger, &fRow);
    rowIndex += step;
  }

  return rowIndex;
}

typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,
  CHECK_FILEBLOCK_QUIT = 0x2,
} CHECK_FILEBLOCK_STATE;

H
Hongze Cheng 已提交
2772
static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SDataBlk* pBlock,
2773 2774
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
2775
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
2776
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
2777

2778
  *state = CHECK_FILEBLOCK_QUIT;
2779
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
2780

H
Hongze Cheng 已提交
2781 2782
  int32_t   nextIndex = -1;
  SDataBlk* pNeighborBlock = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &nextIndex, pReader->order);
2783
  if (pNeighborBlock == NULL) {  // do nothing
2784 2785 2786 2787
    return 0;
  }

  bool overlap = overlapWithNeighborBlock(pBlock, pNeighborBlock, pReader->order);
2788 2789
  taosMemoryFree(pNeighborBlock);

2790
  if (overlap) {  // load next block
2791
    SReaderStatus*  pStatus = &pReader->status;
2792 2793
    SDataBlockIter* pBlockIter = &pStatus->blockIter;

2794
    // 1. find the next neighbor block in the scan block list
2795
    SFileDataBlockInfo fb = {.uid = pFBlock->uid, .tbBlockIdx = nextIndex};
2796
    int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &fb);
2797

2798
    // 2. remove it from the scan block list
2799
    setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);
2800

2801
    // 3. load the neighbor block, and set it to be the currently accessed file data block
H
Haojun Liao 已提交
2802
    tBlockDataReset(&pStatus->fileBlockData);
2803 2804 2805 2806 2807 2808
    int32_t code = tBlockDataInit(&pStatus->fileBlockData, pReader->suid, pFBlock->uid, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData);
2809 2810 2811 2812
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2813
    // 4. check the data values
2814 2815 2816 2817
    initBlockDumpInfo(pReader, pBlockIter);

    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
H
Haojun Liao 已提交
2818
    if (pDumpInfo->rowIndex >= pDumpInfo->totalRows) {
2819 2820 2821 2822 2823 2824 2825
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

  return TSDB_CODE_SUCCESS;
}

2826 2827
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                SRowMerger* pMerger) {
2828 2829
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

2830
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
2831
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
2832
  int32_t step = asc ? 1 : -1;
2833

2834
  pDumpInfo->rowIndex += step;
2835
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
2836 2837 2838
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }
2839

2840 2841 2842 2843
  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      CHECK_FILEBLOCK_STATE st;
2844

2845
      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
H
Hongze Cheng 已提交
2846
      SDataBlk*           pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);
2847 2848 2849
      checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (st == CHECK_FILEBLOCK_QUIT) {
        break;
2850
      }
2851
    }
H
Haojun Liao 已提交
2852
  }
2853

H
Haojun Liao 已提交
2854 2855 2856
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
2857 2858
int32_t doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
                               SRowMerger* pMerger) {
H
Haojun Liao 已提交
2859
  pScanInfo->lastKey = ts;
2860
  while (nextRowFromLastBlocks(pLastBlockReader, pScanInfo)) {
2861 2862
    int64_t next1 = getCurrentKeyInLastBlock(pLastBlockReader);
    if (next1 == ts) {
2863
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
2864 2865 2866 2867 2868 2869 2870 2871 2872
      tRowMerge(pMerger, &fRow1);
    } else {
      break;
    }
  }

  return TSDB_CODE_SUCCESS;
}

2873 2874
int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, STSRow** pTSRow,
                                 STsdbReader* pReader, bool* freeTSRow) {
H
Haojun Liao 已提交
2875
  TSDBROW* pNextRow = NULL;
2876
  TSDBROW  current = *pRow;
2877

2878 2879
  {  // if the timestamp of the next valid row has a different ts, return current row directly
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
2880

2881 2882 2883
    if (!pIter->hasVal) {
      *pTSRow = current.pTSRow;
      *freeTSRow = false;
2884
      return TSDB_CODE_SUCCESS;
2885
    } else {  // has next point in mem/imem
2886
      pNextRow = getValidMemRow(pIter, pDelList, pReader);
2887 2888 2889
      if (pNextRow == NULL) {
        *pTSRow = current.pTSRow;
        *freeTSRow = false;
2890
        return TSDB_CODE_SUCCESS;
2891 2892
      }

H
Haojun Liao 已提交
2893
      if (current.pTSRow->ts != pNextRow->pTSRow->ts) {
2894 2895
        *pTSRow = current.pTSRow;
        *freeTSRow = false;
2896
        return TSDB_CODE_SUCCESS;
2897
      }
2898
    }
2899 2900
  }

2901 2902
  SRowMerger merge = {0};

2903
  // get the correct schema for data in memory
H
Haojun Liao 已提交
2904
  STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(&current), pReader, uid);
H
Haojun Liao 已提交
2905

2906 2907
  if (pReader->pSchema == NULL) {
    pReader->pSchema = pTSchema;
2908
  }
H
Haojun Liao 已提交
2909

H
Haojun Liao 已提交
2910 2911 2912 2913 2914 2915
  tRowMergerInit2(&merge, pReader->pSchema, &current, pTSchema);

  STSchema* pTSchema1 = doGetSchemaForTSRow(TSDBROW_SVERSION(pNextRow), pReader, uid);
  tRowMergerAdd(&merge, pNextRow, pTSchema1);

  doMergeRowsInBuf(pIter, uid, current.pTSRow->ts, pDelList, &merge, pReader);
2916 2917 2918 2919
  int32_t code = tRowMergerGetRow(&merge, pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
M
Minglei Jin 已提交
2920

2921
  tRowMergerClear(&merge);
2922
  *freeTSRow = true;
2923
  return TSDB_CODE_SUCCESS;
2924 2925
}

2926
int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
H
Hongze Cheng 已提交
2927
                           STSRow** pTSRow) {
H
Haojun Liao 已提交
2928 2929
  SRowMerger merge = {0};

2930 2931 2932
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

2933
  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
H
Haojun Liao 已提交
2934
    STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
2935

H
Haojun Liao 已提交
2936
    tRowMergerInit(&merge, piRow, pSchema);
2937
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2938

2939
    tRowMerge(&merge, pRow);
2940
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2941
  } else {
H
Haojun Liao 已提交
2942
    STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
2943

H
Haojun Liao 已提交
2944
    tRowMergerInit(&merge, pRow, pSchema);
2945
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2946 2947

    tRowMerge(&merge, piRow);
2948
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2949
  }
2950

2951 2952
  int32_t code = tRowMergerGetRow(&merge, pTSRow);
  return code;
2953 2954
}

2955 2956
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STSRow** pTSRow, int64_t endKey,
                            bool* freeTSRow) {
2957 2958
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
dengyihao's avatar
dengyihao 已提交
2959
  SArray*  pDelList = pBlockScanInfo->delSkyline;
2960
  uint64_t uid = pBlockScanInfo->uid;
H
Haojun Liao 已提交
2961

2962 2963
  // todo refactor
  bool asc = ASCENDING_TRAVERSE(pReader->order);
2964
  if (pBlockScanInfo->iter.hasVal) {
2965 2966 2967 2968 2969 2970
    TSDBKEY k = TSDBROW_KEY(pRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      pRow = NULL;
    }
  }

2971
  if (pBlockScanInfo->iiter.hasVal) {
2972 2973 2974 2975 2976 2977
    TSDBKEY k = TSDBROW_KEY(piRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      piRow = NULL;
    }
  }

2978
  if (pBlockScanInfo->iter.hasVal && pBlockScanInfo->iiter.hasVal && pRow != NULL && piRow != NULL) {
2979
    TSDBKEY k = TSDBROW_KEY(pRow);
2980
    TSDBKEY ik = TSDBROW_KEY(piRow);
H
Haojun Liao 已提交
2981

2982
    int32_t code = TSDB_CODE_SUCCESS;
2983 2984
    if (ik.ts != k.ts) {
      if (((ik.ts < k.ts) && asc) || ((ik.ts > k.ts) && (!asc))) {  // ik.ts < k.ts
2985
        code = doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader, freeTSRow);
2986
      } else if (((k.ts < ik.ts) && asc) || ((k.ts > ik.ts) && (!asc))) {
2987
        code = doMergeMemTableMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader, freeTSRow);
2988
      }
2989
    } else {  // ik.ts == k.ts
2990
      *freeTSRow = true;
2991 2992 2993 2994
      code = doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, pTSRow);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
H
Haojun Liao 已提交
2995
    }
2996

2997
    return code;
H
Haojun Liao 已提交
2998 2999
  }

3000
  if (pBlockScanInfo->iter.hasVal && pRow != NULL) {
H
Hongze Cheng 已提交
3001 3002
    return doMergeMemTableMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader,
                                    freeTSRow);
H
Haojun Liao 已提交
3003 3004
  }

3005
  if (pBlockScanInfo->iiter.hasVal && piRow != NULL) {
3006
    return doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader, freeTSRow);
H
Haojun Liao 已提交
3007 3008 3009 3010 3011
  }

  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
3012
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow, uint64_t uid) {
3013 3014 3015
  int32_t numOfRows = pBlock->info.rows;
  int32_t numOfCols = (int32_t)taosArrayGetSize(pBlock->pDataBlock);

3016
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
3017
  STSchema*           pSchema = doGetSchemaForTSRow(pTSRow->sver, pReader, uid);
3018

3019
  SColVal colVal = {0};
3020
  int32_t i = 0, j = 0;
H
Haojun Liao 已提交
3021

3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032
  SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, i);
  if (pColInfoData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppend(pColInfoData, numOfRows, (const char*)&pTSRow->ts, false);
    i += 1;
  }

  while (i < numOfCols && j < pSchema->numOfCols) {
    pColInfoData = taosArrayGet(pBlock->pDataBlock, i);
    col_id_t colId = pColInfoData->info.colId;

    if (colId == pSchema->columns[j].colId) {
H
Haojun Liao 已提交
3033
      tTSRowGetVal(pTSRow, pSchema, j, &colVal);
3034 3035 3036 3037 3038 3039 3040 3041
      doCopyColVal(pColInfoData, numOfRows, i, &colVal, pSupInfo);
      i += 1;
      j += 1;
    } else if (colId < pSchema->columns[j].colId) {
      colDataAppendNULL(pColInfoData, numOfRows);
      i += 1;
    } else if (colId > pSchema->columns[j].colId) {
      j += 1;
3042
    }
3043 3044
  }

3045
  // set null value since current column does not exist in the "pSchema"
3046
  while (i < numOfCols) {
3047 3048 3049 3050 3051
    pColInfoData = taosArrayGet(pBlock->pDataBlock, i);
    colDataAppendNULL(pColInfoData, numOfRows);
    i += 1;
  }

3052 3053 3054 3055
  pBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
3056 3057
int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
                                 int32_t rowIndex) {
3058 3059 3060 3061 3062 3063 3064 3065
  int32_t i = 0, j = 0;
  int32_t outputRowIndex = pResBlock->info.rows;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppendInt64(pColData, outputRowIndex, &pBlockData->aTSKEY[rowIndex]);
3066
    i += 1;
3067 3068 3069 3070 3071 3072
  }

  SColVal cv = {0};
  int32_t numOfInputCols = taosArrayGetSize(pBlockData->aIdx);
  int32_t numOfOutputCols = blockDataGetNumOfCols(pResBlock);

3073
  while (i < numOfOutputCols && j < numOfInputCols) {
3074
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
3075
    SColData*        pData = tBlockDataGetColDataByIdx(pBlockData, j);
3076 3077

    if (pData->cid == pCol->info.colId) {
3078 3079
      tColDataGetValue(pData, rowIndex, &cv);
      doCopyColVal(pCol, outputRowIndex, i, &cv, pSupInfo);
3080 3081 3082 3083 3084 3085 3086 3087 3088 3089
      j += 1;
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNULL(pCol, outputRowIndex);
    }

    i += 1;
  }

  while (i < numOfOutputCols) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
3090
    colDataAppendNULL(pCol, outputRowIndex);
3091 3092 3093 3094 3095 3096 3097
    i += 1;
  }

  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

3098 3099
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
H
Haojun Liao 已提交
3100 3101 3102 3103
  SSDataBlock* pBlock = pReader->pResBlock;

  do {
    STSRow* pTSRow = NULL;
3104
    bool    freeTSRow = false;
3105
    tsdbGetNextRowInMem(pBlockScanInfo, pReader, &pTSRow, endKey, &freeTSRow);
3106 3107
    if (pTSRow == NULL) {
      break;
H
Haojun Liao 已提交
3108 3109
    }

H
Haojun Liao 已提交
3110
    doAppendRowFromTSRow(pBlock, pReader, pTSRow, pBlockScanInfo->uid);
3111 3112 3113
    if (freeTSRow) {
      taosMemoryFree(pTSRow);
    }
H
Haojun Liao 已提交
3114 3115

    // no data in buffer, return immediately
3116
    if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
H
Haojun Liao 已提交
3117 3118 3119
      break;
    }

3120
    if (pBlock->info.rows >= capacity) {
H
Haojun Liao 已提交
3121 3122 3123 3124
      break;
    }
  } while (1);

3125
  ASSERT(pBlock->info.rows <= capacity);
H
Haojun Liao 已提交
3126 3127
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
3128

3129
// todo refactor, use arraylist instead
H
Hongze Cheng 已提交
3130
int32_t tsdbSetTableId(STsdbReader* pReader, int64_t uid) {
3131 3132 3133
  ASSERT(pReader != NULL);
  taosHashClear(pReader->status.pTableMap);

3134
  STableBlockScanInfo info = {.lastKey = 0, .uid = uid};
3135
  taosHashPut(pReader->status.pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info));
H
Hongze Cheng 已提交
3136 3137 3138
  return TDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
3139 3140 3141 3142 3143 3144
void* tsdbGetIdx(SMeta* pMeta) {
  if (pMeta == NULL) {
    return NULL;
  }
  return metaGetIdx(pMeta);
}
dengyihao's avatar
dengyihao 已提交
3145

dengyihao's avatar
dengyihao 已提交
3146 3147 3148 3149 3150 3151
void* tsdbGetIvtIdx(SMeta* pMeta) {
  if (pMeta == NULL) {
    return NULL;
  }
  return metaGetIvtIdx(pMeta);
}
L
Liu Jicong 已提交
3152

H
Hongze Cheng 已提交
3153
uint64_t getReaderMaxVersion(STsdbReader* pReader) { return pReader->verRange.maxVer; }
3154

H
refact  
Hongze Cheng 已提交
3155
// ====================================== EXPOSED APIs ======================================
3156 3157
int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, SArray* pTableList, STsdbReader** ppReader,
                       const char* idstr) {
3158 3159
  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, 4096, idstr);
  if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
3160 3161
    goto _err;
  }
H
Hongze Cheng 已提交
3162

3163
  // check for query time window
H
Haojun Liao 已提交
3164
  STsdbReader* pReader = *ppReader;
3165
  if (isEmptyQueryTimeWindow(&pReader->window)) {
H
Haojun Liao 已提交
3166 3167 3168
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
3169

3170 3171 3172
  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
    STimeWindow w = pCond->twindows;
3173
    int32_t     order = pCond->order;
3174 3175 3176 3177 3178 3179 3180 3181 3182 3183
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.ekey = pCond->twindows.skey;
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
      pCond->twindows.skey = pCond->twindows.ekey;
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

3184
    // here we only need one more row, so the capacity is set to be ONE.
3185 3186 3187 3188 3189 3190 3191 3192
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[0], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.skey = w.ekey;
      pCond->twindows.ekey = INT64_MAX;
3193
    } else {
3194 3195 3196 3197 3198 3199 3200 3201 3202 3203
      pCond->twindows.skey = INT64_MIN;
      pCond->twindows.ekey = w.ekey;
    }
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[1], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  if (pCond->suid != 0) {
3204
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, pCond->endVersion);
3205 3206
  } else if (taosArrayGetSize(pTableList) > 0) {
    STableKeyInfo* pKey = taosArrayGet(pTableList, 0);
3207
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, pCond->endVersion);
3208 3209
  }

3210 3211
  int32_t numOfTables = taosArrayGetSize(pTableList);
  pReader->status.pTableMap = createDataBlockScanInfo(pReader, pTableList->pData, numOfTables);
H
Haojun Liao 已提交
3212 3213 3214
  if (pReader->status.pTableMap == NULL) {
    tsdbReaderClose(pReader);
    *ppReader = NULL;
H
Haojun Liao 已提交
3215

H
Haojun Liao 已提交
3216 3217 3218
    code = TSDB_CODE_TDB_OUT_OF_MEMORY;
    goto _err;
  }
H
Hongze Cheng 已提交
3219

H
Hongze Cheng 已提交
3220
  code = tsdbTakeReadSnap(pReader->pTsdb, &pReader->pReadSnap);
3221 3222 3223
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }
H
Hongze Cheng 已提交
3224

3225 3226
  if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
    SDataBlockIter* pBlockIter = &pReader->status.blockIter;
3227

3228
    initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
3229
    resetDataBlockIterator(&pReader->status.blockIter, pReader->order);
3230 3231 3232 3233 3234 3235 3236 3237 3238 3239

    // no data in files, let's try buffer in memory
    if (pReader->status.fileIter.numOfFiles == 0) {
      pReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
3240
  } else {
3241
    STsdbReader*    pPrevReader = pReader->innerReader[0];
3242 3243
    SDataBlockIter* pBlockIter = &pPrevReader->status.blockIter;

3244 3245 3246 3247 3248
    code = tsdbTakeReadSnap(pPrevReader->pTsdb, &pPrevReader->pReadSnap);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

3249
    initFilesetIterator(&pPrevReader->status.fileIter, pPrevReader->pReadSnap->fs.aDFileSet, pPrevReader);
3250
    resetDataBlockIterator(&pPrevReader->status.blockIter, pPrevReader->order);
3251 3252 3253 3254 3255 3256 3257 3258 3259

    // no data in files, let's try buffer in memory
    if (pPrevReader->status.fileIter.numOfFiles == 0) {
      pPrevReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pPrevReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
3260 3261 3262
    }
  }

3263
  tsdbDebug("%p total numOfTable:%d in this query %s", pReader, numOfTables, pReader->idStr);
H
Hongze Cheng 已提交
3264
  return code;
H
Hongze Cheng 已提交
3265 3266

_err:
S
Shengliang Guan 已提交
3267
  tsdbError("failed to create data reader, code:%s %s", tstrerror(code), pReader->idStr);
H
Hongze Cheng 已提交
3268
  return code;
H
refact  
Hongze Cheng 已提交
3269 3270 3271
}

void tsdbReaderClose(STsdbReader* pReader) {
3272 3273
  if (pReader == NULL) {
    return;
3274
  }
H
refact  
Hongze Cheng 已提交
3275

3276
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
H
Hongze Cheng 已提交
3277

3278 3279 3280 3281
  taosMemoryFreeClear(pSupInfo->plist);
  taosMemoryFree(pSupInfo->colIds);

  taosArrayDestroy(pSupInfo->pColAgg);
L
Liu Jicong 已提交
3282
  for (int32_t i = 0; i < blockDataGetNumOfCols(pReader->pResBlock); ++i) {
3283 3284 3285 3286
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }
3287

3288
  taosMemoryFree(pSupInfo->buildBuf);
H
Hongze Cheng 已提交
3289
  tBlockDataDestroy(&pReader->status.fileBlockData, true);
3290 3291

  cleanupDataBlockIterator(&pReader->status.blockIter);
3292 3293

  size_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
3294
  destroyBlockScanInfo(pReader->status.pTableMap);
3295
  blockDataDestroy(pReader->pResBlock);
3296

H
Haojun Liao 已提交
3297 3298 3299
  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }
H
refact  
Hongze Cheng 已提交
3300

3301 3302
  tsdbUntakeReadSnap(pReader->pTsdb, pReader->pReadSnap);

3303 3304
  taosMemoryFree(pReader->status.uidCheckInfo.tableUidList);

H
Haojun Liao 已提交
3305 3306
  SFilesetIter* pFilesetIter = &pReader->status.fileIter;
  if (pFilesetIter->pLastBlockReader != NULL) {
3307
    tMergeTreeClose(&pFilesetIter->pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
3308 3309 3310
    taosMemoryFree(pFilesetIter->pLastBlockReader);
  }

3311
  SIOCostSummary* pCost = &pReader->cost;
H
refact  
Hongze Cheng 已提交
3312

3313
  tsdbDebug("%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
3314 3315
            " SMA-time:%.2f ms, fileBlocks:%" PRId64
            ", fileBlocks-time:%.2f ms, "
3316 3317 3318 3319 3320
            "build in-memory-block-time:%.2f ms, lastBlocks:%" PRId64
            ", lastBlocks-time:%.2f ms, STableBlockScanInfo size:%.2f Kb %s",
            pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaDataLoad, pCost->smaLoadTime,
            pCost->numOfBlocks, pCost->blockLoadTime, pCost->buildmemBlock, pCost->lastBlockLoad,
            pCost->lastBlockLoadTime, numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pReader->idStr);
H
refact  
Hongze Cheng 已提交
3321

3322 3323
  taosMemoryFree(pReader->idStr);
  taosMemoryFree(pReader->pSchema);
3324 3325 3326
  if (pReader->pMemSchema != pReader->pSchema) {
    taosMemoryFree(pReader->pMemSchema);
  }
3327
  taosMemoryFreeClear(pReader);
H
refact  
Hongze Cheng 已提交
3328 3329
}

3330
static bool doTsdbNextDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
3331
  // cleanup the data that belongs to the previous data block
3332 3333
  SSDataBlock* pBlock = pReader->pResBlock;
  blockDataCleanup(pBlock);
H
Hongze Cheng 已提交
3334

3335
  SReaderStatus* pStatus = &pReader->status;
H
Haojun Liao 已提交
3336

3337 3338 3339 3340 3341
  if (pStatus->loadFromFile) {
    int32_t code = buildBlockFromFiles(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return false;
    }
3342

3343 3344 3345
    if (pBlock->info.rows > 0) {
      return true;
    } else {
H
Haojun Liao 已提交
3346
      buildBlockFromBufferSequentially(pReader);
3347
      return pBlock->info.rows > 0;
H
Haojun Liao 已提交
3348
    }
3349 3350 3351
  } else {  // no data in files, let's try the buffer
    buildBlockFromBufferSequentially(pReader);
    return pBlock->info.rows > 0;
H
Haojun Liao 已提交
3352
  }
3353

3354
  return false;
H
refact  
Hongze Cheng 已提交
3355 3356
}

3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393
bool tsdbNextDataBlock(STsdbReader* pReader) {
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return false;
  }

  if (pReader->innerReader[0] != NULL) {
    bool ret = doTsdbNextDataBlock(pReader->innerReader[0]);
    if (ret) {
      pReader->step = EXTERNAL_ROWS_PREV;
      return ret;
    }

    tsdbReaderClose(pReader->innerReader[0]);
    pReader->innerReader[0] = NULL;
  }

  pReader->step = EXTERNAL_ROWS_MAIN;
  bool ret = doTsdbNextDataBlock(pReader);
  if (ret) {
    return ret;
  }

  if (pReader->innerReader[1] != NULL) {
    bool ret1 = doTsdbNextDataBlock(pReader->innerReader[1]);
    if (ret1) {
      pReader->step = EXTERNAL_ROWS_NEXT;
      return ret1;
    }

    tsdbReaderClose(pReader->innerReader[1]);
    pReader->innerReader[1] = NULL;
  }

  return false;
}

static void setBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
3394 3395 3396 3397
  ASSERT(pDataBlockInfo != NULL && pReader != NULL);
  pDataBlockInfo->rows = pReader->pResBlock->info.rows;
  pDataBlockInfo->uid = pReader->pResBlock->info.uid;
  pDataBlockInfo->window = pReader->pResBlock->info.window;
H
Hongze Cheng 已提交
3398 3399
}

3400 3401
void tsdbRetrieveDataBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
3402
    if (pReader->step == EXTERNAL_ROWS_MAIN) {
3403
      setBlockInfo(pReader, pDataBlockInfo);
3404
    } else if (pReader->step == EXTERNAL_ROWS_PREV) {
3405 3406 3407 3408 3409 3410 3411 3412 3413
      setBlockInfo(pReader->innerReader[0], pDataBlockInfo);
    } else {
      setBlockInfo(pReader->innerReader[1], pDataBlockInfo);
    }
  } else {
    setBlockInfo(pReader, pDataBlockInfo);
  }
}

3414
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SColumnDataAgg*** pBlockStatis, bool* allHave) {
H
Hongze Cheng 已提交
3415
  int32_t code = 0;
3416
  *allHave = false;
H
Hongze Cheng 已提交
3417

3418
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
3419 3420 3421 3422
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }

3423
  // there is no statistics data for composed block
3424 3425 3426 3427
  if (pReader->status.composedDataBlock) {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
3428

3429
  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
H
Hongze Cheng 已提交
3430

H
Hongze Cheng 已提交
3431 3432
  SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
  int64_t   stime = taosGetTimestampUs();
H
Hongze Cheng 已提交
3433

3434 3435
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

H
Hongze Cheng 已提交
3436
  if (tDataBlkHasSma(pBlock)) {
H
Hongze Cheng 已提交
3437
    code = tsdbReadBlockSma(pReader->pFileReader, pBlock, pSup->pColAgg);
3438
    if (code != TSDB_CODE_SUCCESS) {
3439 3440
      tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pFBlock->uid, tstrerror(code),
                pReader->idStr);
3441 3442
      return code;
    }
3443 3444 3445
  } else {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
3446
  }
H
Hongze Cheng 已提交
3447

3448
  *allHave = true;
H
Hongze Cheng 已提交
3449

3450 3451
  // always load the first primary timestamp column data
  SColumnDataAgg* pTsAgg = &pSup->tsColAgg;
3452

3453 3454
  pTsAgg->numOfNull = 0;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470
  pTsAgg->min = pReader->pResBlock->info.window.skey;
  pTsAgg->max = pReader->pResBlock->info.window.ekey;
  pSup->plist[0] = pTsAgg;

  // update the number of NULL data rows
  size_t numOfCols = blockDataGetNumOfCols(pReader->pResBlock);

  int32_t i = 0, j = 0;
  while (j < numOfCols && i < taosArrayGetSize(pSup->pColAgg)) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, i);
    if (pAgg->colId == pSup->colIds[j]) {
      if (IS_BSMA_ON(&(pReader->pSchema->columns[i]))) {
        pSup->plist[j] = pAgg;
      } else {
        *allHave = false;
      }
3471 3472
      i += 1;
      j += 1;
3473 3474 3475 3476 3477 3478 3479
    } else if (pAgg->colId < pSup->colIds[j]) {
      i += 1;
    } else if (pSup->colIds[j] < pAgg->colId) {
      j += 1;
    }
  }

3480
  double elapsed = (taosGetTimestampUs() - stime) / 1000.0;
3481
  pReader->cost.smaLoadTime += elapsed;
3482
  pReader->cost.smaDataLoad += 1;
3483 3484 3485

  *pBlockStatis = pSup->plist;

3486
  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", elapsed time:%.2f ms, %s", 0, pFBlock->uid,
3487 3488
            elapsed, pReader->idStr);

H
Hongze Cheng 已提交
3489
  return code;
H
Hongze Cheng 已提交
3490 3491
}

3492
static SArray* doRetrieveDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
3493 3494 3495
  SReaderStatus* pStatus = &pReader->status;

  if (pStatus->composedDataBlock) {
3496
    return pReader->pResBlock->pDataBlock;
3497
  }
3498

3499
  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(&pStatus->blockIter);
3500
  STableBlockScanInfo* pBlockScanInfo = taosHashGet(pStatus->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
3501

H
Haojun Liao 已提交
3502
  tBlockDataReset(&pStatus->fileBlockData);
3503 3504
  int32_t code = tBlockDataInit(&pStatus->fileBlockData, pReader->suid, pBlockScanInfo->uid, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
3505 3506
    terrno = code;
    return NULL;
3507 3508 3509
  }

  code = doLoadFileBlockData(pReader, &pStatus->blockIter, &pStatus->fileBlockData);
3510
  if (code != TSDB_CODE_SUCCESS) {
H
Hongze Cheng 已提交
3511
    tBlockDataDestroy(&pStatus->fileBlockData, 1);
3512 3513
    terrno = code;
    return NULL;
3514
  }
3515 3516 3517

  copyBlockDataToSDataBlock(pReader, pBlockScanInfo);
  return pReader->pResBlock->pDataBlock;
H
Hongze Cheng 已提交
3518 3519
}

3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531
SArray* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    if (pReader->step == EXTERNAL_ROWS_PREV) {
      return doRetrieveDataBlock(pReader->innerReader[0]);
    } else if (pReader->step == EXTERNAL_ROWS_NEXT) {
      return doRetrieveDataBlock(pReader->innerReader[1]);
    }
  }

  return doRetrieveDataBlock(pReader);
}

H
Haojun Liao 已提交
3532
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
3533 3534 3535
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
3536

L
Liu Jicong 已提交
3537
  pReader->order = pCond->order;
3538
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
3539
  pReader->status.loadFromFile = true;
dengyihao's avatar
dengyihao 已提交
3540
  pReader->status.pTableIter = NULL;
H
Haojun Liao 已提交
3541
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
H
Hongze Cheng 已提交
3542

3543
  // allocate buffer in order to load data blocks from file
3544
  memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));
3545 3546
  memset(pReader->suppInfo.plist, 0, POINTER_BYTES);

3547
  pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
3548
  tsdbDataFReaderClose(&pReader->pFileReader);
3549

3550
  int32_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
L
Liu Jicong 已提交
3551

3552
  initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
3553
  resetDataBlockIterator(&pReader->status.blockIter, pReader->order);
H
Haojun Liao 已提交
3554

3555
  int64_t ts = ASCENDING_TRAVERSE(pReader->order)?pReader->window.skey-1:pReader->window.ekey+1;
H
Haojun Liao 已提交
3556
  resetDataBlockScanInfo(pReader->status.pTableMap, ts);
3557

3558
  int32_t         code = 0;
3559 3560
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

3561 3562 3563 3564 3565 3566
  // no data in files, let's try buffer in memory
  if (pReader->status.fileIter.numOfFiles == 0) {
    pReader->status.loadFromFile = false;
  } else {
    code = initForFirstBlockInFile(pReader, pBlockIter);
    if (code != TSDB_CODE_SUCCESS) {
3567 3568
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);
3569 3570 3571
      return code;
    }
  }
H
Hongze Cheng 已提交
3572

dengyihao's avatar
dengyihao 已提交
3573 3574
  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s",
            pReader, pReader->suid, numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);
3575

3576
  return code;
H
Hongze Cheng 已提交
3577
}
H
Hongze Cheng 已提交
3578

3579 3580 3581
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows) {
  return (numOfRows - startRow) / bucketRange;
}
H
Hongze Cheng 已提交
3582

3583 3584 3585 3586
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  int32_t code = TSDB_CODE_SUCCESS;
  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;
H
Hongze Cheng 已提交
3587

3588 3589
  // find the start data block in file
  SReaderStatus* pStatus = &pReader->status;
H
Hongze Cheng 已提交
3590

3591 3592 3593
  STsdbCfg* pc = &pReader->pTsdb->pVnode->config.tsdbCfg;
  pTableBlockInfo->defMinRows = pc->minRows;
  pTableBlockInfo->defMaxRows = pc->maxRows;
H
Hongze Cheng 已提交
3594

3595
  int32_t bucketRange = ceil((pc->maxRows - pc->minRows) / 20.0);
H
Hongze Cheng 已提交
3596

3597
  pTableBlockInfo->numOfFiles += 1;
H
Hongze Cheng 已提交
3598

3599 3600
  int32_t numOfTables = (int32_t)taosHashGetSize(pStatus->pTableMap);
  int     defaultRows = 4096;
H
Hongze Cheng 已提交
3601

3602 3603
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;
H
Haojun Liao 已提交
3604

3605 3606
  if (pBlockIter->numOfBlocks > 0) {
    pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
H
Haojun Liao 已提交
3607
  }
H
Hongze Cheng 已提交
3608

3609
  pTableBlockInfo->numOfTables = numOfTables;
3610
  bool hasNext = (pBlockIter->numOfBlocks > 0);
H
Hongze Cheng 已提交
3611

3612 3613
  while (true) {
    if (hasNext) {
H
Hongze Cheng 已提交
3614
      SDataBlk* pBlock = getCurrentBlock(pBlockIter);
H
Hongze Cheng 已提交
3615

3616 3617
      int32_t numOfRows = pBlock->nRow;
      pTableBlockInfo->totalRows += numOfRows;
H
Hongze Cheng 已提交
3618

3619 3620 3621
      if (numOfRows > pTableBlockInfo->maxRows) {
        pTableBlockInfo->maxRows = numOfRows;
      }
H
refact  
Hongze Cheng 已提交
3622

3623 3624 3625
      if (numOfRows < pTableBlockInfo->minRows) {
        pTableBlockInfo->minRows = numOfRows;
      }
H
refact  
Hongze Cheng 已提交
3626

3627 3628 3629
      if (numOfRows < defaultRows) {
        pTableBlockInfo->numOfSmallBlocks += 1;
      }
H
refact  
Hongze Cheng 已提交
3630

3631 3632
      int32_t bucketIndex = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, numOfRows);
      pTableBlockInfo->blockRowsHisto[bucketIndex]++;
3633 3634

      hasNext = blockIteratorNext(&pStatus->blockIter);
3635 3636 3637 3638 3639
    } else {
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
        break;
      }
H
refact  
Hongze Cheng 已提交
3640

3641 3642
      pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
      hasNext = (pBlockIter->numOfBlocks > 0);
3643
    }
H
refact  
Hongze Cheng 已提交
3644

H
Hongze Cheng 已提交
3645 3646
    //    tsdbDebug("%p %d blocks found in file for %d table(s), fid:%d, %s", pReader, numOfBlocks, numOfTables,
    //              pReader->pFileGroup->fid, pReader->idStr);
3647
  }
H
Hongze Cheng 已提交
3648

H
refact  
Hongze Cheng 已提交
3649 3650
  return code;
}
H
Hongze Cheng 已提交
3651

H
refact  
Hongze Cheng 已提交
3652
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
3653
  int64_t rows = 0;
H
Hongze Cheng 已提交
3654

3655 3656
  SReaderStatus* pStatus = &pReader->status;
  pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
H
Hongze Cheng 已提交
3657

3658 3659 3660 3661 3662
  while (pStatus->pTableIter != NULL) {
    STableBlockScanInfo* pBlockScanInfo = pStatus->pTableIter;

    STbData* d = NULL;
    if (pReader->pTsdb->mem != NULL) {
H
Hongze Cheng 已提交
3663
      d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
3664 3665 3666 3667 3668 3669 3670
      if (d != NULL) {
        rows += tsdbGetNRowsInTbData(d);
      }
    }

    STbData* di = NULL;
    if (pReader->pTsdb->imem != NULL) {
H
Hongze Cheng 已提交
3671
      di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
3672 3673 3674 3675 3676 3677 3678 3679
      if (di != NULL) {
        rows += tsdbGetNRowsInTbData(di);
      }
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
  }
H
Hongze Cheng 已提交
3680

H
refact  
Hongze Cheng 已提交
3681
  return rows;
H
Hongze Cheng 已提交
3682
}
D
dapan1121 已提交
3683

L
Liu Jicong 已提交
3684
int32_t tsdbGetTableSchema(SVnode* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
D
dapan1121 已提交
3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696
  int32_t sversion = 1;

  SMetaReader mr = {0};
  metaReaderInit(&mr, pVnode->pMeta, 0);
  int32_t code = metaGetTableEntryByUid(&mr, uid);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
    metaReaderClear(&mr);
    return terrno;
  }

  *suid = 0;
L
Liu Jicong 已提交
3697

D
dapan1121 已提交
3698
  if (mr.me.type == TSDB_CHILD_TABLE) {
D
dapan1121 已提交
3699
    tDecoderClear(&mr.coder);
D
dapan1121 已提交
3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714
    *suid = mr.me.ctbEntry.suid;
    code = metaGetTableEntryByUid(&mr, *suid);
    if (code != TSDB_CODE_SUCCESS) {
      terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
      metaReaderClear(&mr);
      return terrno;
    }
    sversion = mr.me.stbEntry.schemaRow.version;
  } else {
    ASSERT(mr.me.type == TSDB_NORMAL_TABLE);
    sversion = mr.me.ntbEntry.schemaRow.version;
  }

  metaReaderClear(&mr);
  *pSchema = metaGetTbTSchema(pVnode->pMeta, uid, sversion);
L
Liu Jicong 已提交
3715

D
dapan1121 已提交
3716 3717
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747

int32_t tsdbTakeReadSnap(STsdb* pTsdb, STsdbReadSnap** ppSnap) {
  int32_t code = 0;

  // alloc
  *ppSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(STsdbReadSnap));
  if (*ppSnap == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _exit;
  }

  // lock
  code = taosThreadRwlockRdlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _exit;
  }

  // take snapshot
  (*ppSnap)->pMem = pTsdb->mem;
  (*ppSnap)->pIMem = pTsdb->imem;

  if ((*ppSnap)->pMem) {
    tsdbRefMemTable((*ppSnap)->pMem);
  }

  if ((*ppSnap)->pIMem) {
    tsdbRefMemTable((*ppSnap)->pIMem);
  }

H
Hongze Cheng 已提交
3748
  // fs
H
Hongze Cheng 已提交
3749 3750 3751 3752 3753
  code = tsdbFSRef(pTsdb, &(*ppSnap)->fs);
  if (code) {
    taosThreadRwlockUnlock(&pTsdb->rwLock);
    goto _exit;
  }
H
Hongze Cheng 已提交
3754 3755 3756 3757 3758 3759 3760 3761

  // unlock
  code = taosThreadRwlockUnlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _exit;
  }

S
Shengliang Guan 已提交
3762
  tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode));
H
Hongze Cheng 已提交
3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776
_exit:
  return code;
}

void tsdbUntakeReadSnap(STsdb* pTsdb, STsdbReadSnap* pSnap) {
  if (pSnap) {
    if (pSnap->pMem) {
      tsdbUnrefMemTable(pSnap->pMem);
    }

    if (pSnap->pIMem) {
      tsdbUnrefMemTable(pSnap->pIMem);
    }

H
Hongze Cheng 已提交
3777
    tsdbFSUnref(pTsdb, &pSnap->fs);
H
Hongze Cheng 已提交
3778
    taosMemoryFree(pSnap);
H
Hongze Cheng 已提交
3779
  }
H
Hongze Cheng 已提交
3780

S
Shengliang Guan 已提交
3781
  tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
3782
}