tsdbRead.c 125.5 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Haojun Liao 已提交
16
#include "osDef.h"
H
Hongze Cheng 已提交
17
#include "tsdb.h"
18

H
Hongze Cheng 已提交
19
// True when the given scan order is ascending. Both the argument and the whole expansion are
// parenthesized so that compound expressions (e.g. `a | b`, `c ? x : y`) can be passed safely.
#define ASCENDING_TRAVERSE(o)  ((o) == TSDB_ORDER_ASC)
// Row-index sentinels.
// NOTE(review): meaning inferred from the names only (all rows checked / not started yet) -- confirm at the users.
#define ALL_ROWS_CHECKED_INDEX (INT16_MIN)
#define INITIAL_ROW_INDEX_VAL  (-1)
H
Hongze Cheng 已提交
22

23 24 25 26 27 28
// Tags for the kind of rows being handled: previous external rows, the main data, or next external rows.
// NOTE(review): the values are 1, 2, 3 (not independent bit flags, since 0x3 == 0x1 | 0x2) -- confirm this
// is intended before combining them with bitwise operators.
typedef enum {
  EXTERNAL_ROWS_PREV = 0x1,
  EXTERNAL_ROWS_MAIN = 0x2,
  EXTERNAL_ROWS_NEXT = 0x3,
} EContentData;

29
typedef struct {
dengyihao's avatar
dengyihao 已提交
30
  STbDataIter* iter;
31 32 33 34
  int32_t      index;
  bool         hasVal;
} SIterInfo;

35 36
// Counters filled in while loading the block information of one file set.
typedef struct {
  int32_t numOfBlocks;     // number of data blocks that passed the time/version range checks
  int32_t numOfLastFiles;  // number of last (stt) files in the file set
} SBlockNumber;

H
Haojun Liao 已提交
40
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
41 42
  uint64_t  uid;
  TSKEY     lastKey;
H
Hongze Cheng 已提交
43 44 45 46 47 48 49 50
  SMapData  mapData;            // block info (compressed)
  SArray*   pBlockList;         // block data index list
  SIterInfo iter;               // mem buffer skip list iterator
  SIterInfo iiter;              // imem buffer skip list iterator
  SArray*   delSkyline;         // delete info for this table
  int32_t   fileDelIndex;       // file block delete index
  int32_t   lastBlockDelIndex;  // delete index for last block
  bool      iterInit;           // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
51 52 53
} STableBlockScanInfo;

// One data block entry used when ordering blocks across tables.
typedef struct SBlockOrderWrapper {
  int64_t uid;     // uid of the table the block belongs to
  int64_t offset;  // NOTE(review): presumably the block's offset in the data file -- confirm
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
57 58

typedef struct SBlockOrderSupporter {
59 60 61 62
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
63 64 65
} SBlockOrderSupporter;

// Accumulated I/O statistics of one reader. The *Time fields updated in this file are in milliseconds.
typedef struct SIOCostSummary {
  int64_t numOfBlocks;       // data blocks plus last files seen while loading block info
  double  blockLoadTime;     // time spent loading/copying file blocks
  double  buildmemBlock;
  int64_t headFileLoad;      // number of file sets opened
  double  headFileLoadTime;  // time spent loading block index / block info
  int64_t smaDataLoad;
  double  smaLoadTime;
  int64_t lastBlockLoad;
  double  lastBlockLoadTime;
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
78
  SArray*          pColAgg;
79
  SColumnDataAgg   tsColAgg;
C
Cary Xu 已提交
80
  SColumnDataAgg** plist;
81 82
  int16_t*         colIds;    // column ids for loading file block data
  char**           buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
H
Hongze Cheng 已提交
83 84
} SBlockLoadSuppInfo;

85 86 87
typedef struct SLastBlockReader {
  STimeWindow   window;
  SVersionRange verRange;
88
  int32_t       order;
89
  uint64_t      uid;
90
  SMergeTree    mergeTree;
91 92
} SLastBlockReader;

93
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
94 95 96
  int32_t           numOfFiles;  // number of total files
  int32_t           index;       // current accessed index in the list
  SArray*           pFileList;   // data file list
97
  int32_t           order;
H
Hongze Cheng 已提交
98
  SLastBlockReader* pLastBlockReader;  // last file block reader
99
} SFilesetIter;
H
Haojun Liao 已提交
100 101

// Identifies one file data block: the owning table plus the block's index within that table.
typedef struct SFileDataBlockInfo {
  // index position in STableBlockScanInfo in order to check whether neighbor block overlaps with it
  uint64_t uid;
  int32_t  tbBlockIdx;
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
108
  int32_t   numOfBlocks;
109
  int32_t   index;
H
Hongze Cheng 已提交
110
  SArray*   blockList;  // SArray<SFileDataBlockInfo>
111
  int32_t   order;
H
Hongze Cheng 已提交
112
  SDataBlk  block;  // current SDataBlk data
113
  SHashObj* pTableMap;
H
Haojun Liao 已提交
114 115 116
} SDataBlockIter;

// Progress of copying the currently loaded file block into the result block.
typedef struct SFileBlockDumpInfo {
  int32_t totalRows;
  int32_t rowIndex;   // next row of the file block to copy
  int64_t lastKey;    // set to (block key + step) once the block is marked as dumped
  bool    allDumped;  // cleared when a block is loaded, set once it is marked as dumped
} SFileBlockDumpInfo;

123
// Cursor for visiting the queried tables in ascending uid order.
typedef struct SUidOrderCheckInfo {
  uint64_t* tableUidList;  // access table uid list in uid ascending order list
  int32_t   currentIndex;  // index in table uid list
} SUidOrderCheckInfo;

H
Haojun Liao 已提交
128
typedef struct SReaderStatus {
129
  bool                 loadFromFile;       // check file stage
130
  bool                 composedDataBlock;  // the returned data block is a composed block or not
131 132
  SHashObj*            pTableMap;          // SHash<STableBlockScanInfo>
  STableBlockScanInfo* pTableIter;         // table iterator used in building in-memory buffer data blocks.
133
  SUidOrderCheckInfo   uidCheckInfo;       // check all table in uid order
134
  SFileBlockDumpInfo   fBlockDumpInfo;
135 136 137 138
  SDFileSet*           pCurrentFileset;  // current opened file set
  SBlockData           fileBlockData;
  SFilesetIter         fileIter;
  SDataBlockIter       blockIter;
H
Haojun Liao 已提交
139 140
} SReaderStatus;

H
Hongze Cheng 已提交
141
struct STsdbReader {
H
Haojun Liao 已提交
142 143 144 145 146 147 148
  STsdb*             pTsdb;
  uint64_t           suid;
  int16_t            order;
  STimeWindow        window;  // the primary query time window that applies to all queries
  SSDataBlock*       pResBlock;
  int32_t            capacity;
  SReaderStatus      status;
149 150
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
151
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
152
  STsdbReadSnap*     pReadSnap;
153
  SIOCostSummary     cost;
154 155
  STSchema*          pSchema;     // the newest version schema
  STSchema*          pMemSchema;  // the previous schema for in-memory data, to avoid load schema too many times
156 157
  SDataFReader*      pFileReader;
  SVersionRange      verRange;
158

159 160
  int32_t      step;
  STsdbReader* innerReader[2];
H
Hongze Cheng 已提交
161
};
H
Hongze Cheng 已提交
162

H
Haojun Liao 已提交
163
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
164 165
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
166
static TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
167 168
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                        SRowMerger* pMerger);
H
Hongze Cheng 已提交
169 170
static int32_t  doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
                                       SRowMerger* pMerger);
171
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
dengyihao's avatar
dengyihao 已提交
172
                                 STsdbReader* pReader);
H
Haojun Liao 已提交
173
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow, uint64_t uid);
174
static int32_t  doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
H
Hongze Cheng 已提交
175
                                         int32_t rowIndex);
176
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
177
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order);
178

H
Hongze Cheng 已提交
179 180 181 182
static int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList,
                                        STSRow** pTSRow, STsdbReader* pReader, bool* freeTSRow);
static int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo,
                                  STsdbReader* pReader, STSRow** pTSRow);
183 184
static int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                                     STsdbReader* pReader);
185

dengyihao's avatar
dengyihao 已提交
186 187 188 189
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
190
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Hongze Cheng 已提交
191 192 193
static int64_t       getCurrentKeyInLastBlock(SLastBlockReader* pLastBlockReader);
static bool          hasDataInLastBlock(SLastBlockReader* pLastBlockReader);
static int32_t       doBuildDataBlock(STsdbReader* pReader);
H
Haojun Liao 已提交
194

195 196 197
/**
 * Fill pReader->suppInfo.colIds / buildBuf from the column layout of pBlock.
 *
 *   colIds[i]   - column id of the i-th column of pBlock
 *   buildBuf[i] - scratch buffer of `info.bytes` bytes for variable-length columns, NULL otherwise
 *
 * @param pReader reader whose suppInfo is filled
 * @param pBlock  block describing the output columns
 * @return TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when an allocation fails.
 *         If one of the two arrays cannot be allocated, both pointers are released and reset to NULL.
 *         If a per-column buffer cannot be allocated, the arrays stay attached to the reader (remaining
 *         buildBuf slots are NULL) so that the reader's owner releases them.
 */
static int32_t setColumnIdSlotList(STsdbReader* pReader, SSDataBlock* pBlock) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  size_t numOfCols = blockDataGetNumOfCols(pBlock);

  pSupInfo->colIds = taosMemoryMalloc(numOfCols * sizeof(int16_t));
  pSupInfo->buildBuf = taosMemoryCalloc(numOfCols, POINTER_BYTES);
  if (pSupInfo->buildBuf == NULL || pSupInfo->colIds == NULL) {
    // reset the pointers after freeing: the reader must never keep (or free again) a dangling pointer
    taosMemoryFree(pSupInfo->colIds);
    pSupInfo->colIds = NULL;
    taosMemoryFree(pSupInfo->buildBuf);
    pSupInfo->buildBuf = NULL;
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  for (size_t i = 0; i < numOfCols; ++i) {
    SColumnInfoData* pCol = taosArrayGet(pBlock->pDataBlock, i);
    pSupInfo->colIds[i] = pCol->info.colId;

    if (IS_VAR_DATA_TYPE(pCol->info.type)) {
      pSupInfo->buildBuf[i] = taosMemoryMalloc(pCol->info.bytes);
      if (pSupInfo->buildBuf[i] == NULL) {
        // the scratch buffer is written without a NULL check later on, so report the failure here
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
219

220
/**
 * Create the uid -> STableBlockScanInfo map for the tables of a query.
 *
 * Every entry starts with lastKey at the beginning of the scan: the window start key for an ascending
 * scan, the window end key for a descending one.
 *
 * @param pTsdbReader reader providing the scan order, the query window and the id string
 * @param idList      array of numOfTables table keys
 * @param numOfTables number of entries in idList
 * @return the new map, or NULL if the map cannot be created or an entry cannot be inserted
 */
static SHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, const STableKeyInfo* idList, int32_t numOfTables) {
  // allocate buffer in order to load data blocks from file
  // todo use simple hash instead, optimize the memory consumption
  SHashObj* pTableMap =
      taosHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK);
  if (pTableMap == NULL) {
    return NULL;
  }

  for (int32_t j = 0; j < numOfTables; ++j) {
    STableBlockScanInfo info = {.lastKey = 0, .uid = idList[j].uid};
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
      info.lastKey = pTsdbReader->window.skey;
      ASSERT(info.lastKey >= pTsdbReader->window.skey && info.lastKey <= pTsdbReader->window.ekey);
    } else {
      info.lastKey = pTsdbReader->window.ekey;
    }

    // a table that is missing from the map would silently be skipped by the scan, so fail instead.
    // The entries own no resources yet, hence releasing the map itself is sufficient.
    if (taosHashPut(pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info)) != 0) {
      taosHashCleanup(pTableMap);
      return NULL;
    }

    tsdbDebug("%p check table uid:%" PRIu64 " from lastKey:%" PRId64 " %s", pTsdbReader, info.uid, info.lastKey,
              pTsdbReader->idStr);
  }

  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->idStr);

  return pTableMap;
}
H
Hongze Cheng 已提交
251

H
Haojun Liao 已提交
252
/**
 * Reset the scan state of every table in the map so that a scan can start again from `ts`.
 *
 * Both in-memory iterators (mem and imem) are destroyed and flagged as having no value: iterInit is
 * cleared here, so keeping either iterator would leave a stale object behind when the iterators are
 * set up again. This mirrors what destroyBlockScanInfo releases for the two iterators.
 *
 * @param pTableMap map of STableBlockScanInfo entries
 * @param ts        value assigned to lastKey of every entry
 */
static void resetDataBlockScanInfo(SHashObj* pTableMap, int64_t ts) {
  STableBlockScanInfo* p = NULL;

  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
    p->iterInit = false;
    p->iter.hasVal = false;
    p->iiter.hasVal = false;

    if (p->iter.iter != NULL) {
      p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
    }

    if (p->iiter.iter != NULL) {
      p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
    }

    p->delSkyline = taosArrayDestroy(p->delSkyline);
    p->lastKey = ts;
  }
}

267 268 269 270 271 272 273 274
/**
 * Release everything owned by the entries of the table map (iterators, delete skyline, block list,
 * block map data) and then the map itself.
 */
static void destroyBlockScanInfo(SHashObj* pTableMap) {
  for (STableBlockScanInfo* pInfo = taosHashIterate(pTableMap, NULL); pInfo != NULL;
       pInfo = taosHashIterate(pTableMap, pInfo)) {
    pInfo->iterInit = false;
    pInfo->iiter.hasVal = false;

    // mem buffer iterator
    if (pInfo->iter.iter != NULL) {
      pInfo->iter.iter = tsdbTbDataIterDestroy(pInfo->iter.iter);
    }

    // imem buffer iterator
    if (pInfo->iiter.iter != NULL) {
      pInfo->iiter.iter = tsdbTbDataIterDestroy(pInfo->iiter.iter);
    }

    pInfo->delSkyline = taosArrayDestroy(pInfo->delSkyline);
    pInfo->pBlockList = taosArrayDestroy(pInfo->pBlockList);
    tMapDataClear(&pInfo->mapData);
  }

  taosHashCleanup(pTableMap);
}

290
// A query window is empty when its end key lies before its start key.
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) {
  ASSERT(pWindow != NULL);

  const bool endBeforeStart = (pWindow->ekey < pWindow->skey);
  return endBeforeStart;
}
H
Hongze Cheng 已提交
294

295 296 297
// Clip the start of the query window according to the keep (time-to-live) configuration, so that
// data older than the keep period is not returned to the client.
// Returns the adjusted copy; the window passed in is left untouched.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
  STsdbKeepCfg* pKeepCfg = &pTsdb->keepCfg;

  int64_t current = taosGetTimestamp(pKeepCfg->precision);
  // oldest timestamp still kept; needs to add one tick
  int64_t earliestTs = current - (tsTickPerMin[pKeepCfg->precision] * pKeepCfg->keep2) + 1;

  STimeWindow clipped = *pWindow;
  clipped.skey = (clipped.skey < earliestTs) ? earliestTs : clipped.skey;
  return clipped;
}
H
Hongze Cheng 已提交
310

H
Haojun Liao 已提交
311
/**
 * Shrink *capacity (rows per output block) so that one output SSDataBlock stays below 2MB.
 *
 * The row length is the sum of the byte widths of all queried columns. The size check is done in
 * 64-bit arithmetic (capacity * rowLen can exceed INT32_MAX for wide rows), and the capacity never
 * drops below one row: a zero-row output block could not make any progress.
 *
 * @param pCond    query condition providing the column list
 * @param capacity in/out: number of rows of the output block
 */
static void limitOutputBufferSize(const SQueryTableDataCond* pCond, int32_t* capacity) {
  int64_t rowLen = 0;
  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    rowLen += pCond->colList[i].bytes;
  }

  // make sure the output SSDataBlock size be less than 2MB.
  const int64_t TWOMB = 2 * 1024 * 1024;
  if (rowLen > 0 && (int64_t)(*capacity) * rowLen > TWOMB) {
    int64_t rows = TWOMB / rowLen;
    (*capacity) = (rows > 0) ? (int32_t)rows : 1;
  }
}

// init file iterator
H
Hongze Cheng 已提交
325 326
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet,
                                   STsdbReader* pReader /*int32_t order, const char* idstr*/) {
H
Hongze Cheng 已提交
327
  size_t numOfFileset = taosArrayGetSize(aDFileSet);
328

329 330
  pIter->index = ASCENDING_TRAVERSE(pReader->order) ? -1 : numOfFileset;
  pIter->order = pReader->order;
H
Hongze Cheng 已提交
331
  pIter->pFileList = aDFileSet;
332
  pIter->numOfFiles = numOfFileset;
H
Haojun Liao 已提交
333

334 335 336 337
  if (pIter->pLastBlockReader == NULL) {
    pIter->pLastBlockReader = taosMemoryCalloc(1, sizeof(struct SLastBlockReader));
    if (pIter->pLastBlockReader == NULL) {
      int32_t code = TSDB_CODE_OUT_OF_MEMORY;
338
      tsdbError("failed to prepare the last block iterator, code:%d %s", tstrerror(code), pReader->idStr);
339 340
      return code;
    }
341 342
  }

343 344 345 346 347 348 349 350
  SLastBlockReader* pLReader = pIter->pLastBlockReader;
  pLReader->order = pReader->order;
  pLReader->window = pReader->window;
  pLReader->verRange = pReader->verRange;

  pLReader->uid = 0;
  tMergeTreeClose(&pLReader->mergeTree);

351
  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, pReader->idStr);
H
Haojun Liao 已提交
352 353 354
  return TSDB_CODE_SUCCESS;
}

355
/**
 * Advance the iterator to the next file set that overlaps the query time window and open it.
 *
 * File sets lying entirely before the window (in scan direction) are skipped. Iteration stops at the
 * first file set lying entirely beyond the window, or when the list is exhausted.
 *
 * @return true when a qualifying file set is opened (pReader->pFileReader and
 *         pReader->status.pCurrentFileset refer to it); false when there is none or opening fails.
 */
static bool filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader) {
  const bool    ascending = ASCENDING_TRAVERSE(pIter->order);
  const int32_t delta = ascending ? 1 : -1;

  pIter->index += delta;
  if (ascending ? (pIter->index >= pIter->numOfFiles) : (pIter->index < 0)) {
    return false;
  }

  // the last-block reader state belongs to the previous file set
  pIter->pLastBlockReader->uid = 0;
  tMergeTreeClose(&pIter->pLastBlockReader->mergeTree);

  // check file the time range of coverage
  STimeWindow fileWin = {0};

  for (;;) {
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }

    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);

    int32_t code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      return false;
    }

    pReader->cost.headFileLoad += 1;

    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &fileWin.skey,
                    &fileWin.ekey);

    // current file are no longer overlapped with query time window, ignore remain files
    bool beyondWindow =
        ascending ? (fileWin.skey > pReader->window.ekey) : (fileWin.ekey < pReader->window.skey);
    if (beyondWindow) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return false;
    }

    // the window has not been reached yet: try the next file set
    bool beforeWindow =
        ascending ? (fileWin.ekey < pReader->window.skey) : (fileWin.skey > pReader->window.ekey);
    if (beforeWindow) {
      pIter->index += delta;
      if (ascending ? (pIter->index >= pIter->numOfFiles) : (pIter->index < 0)) {
        return false;
      }
      continue;
    }

    tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
              pReader->window.ekey, pReader->idStr);
    return true;
  }
}

411
// Put the block iterator back to its "before first block" state for the given scan order.
// The block list is emptied when it already exists, otherwise it is created.
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order) {
  pIter->numOfBlocks = 0;
  pIter->index = -1;
  pIter->order = order;

  if (pIter->blockList != NULL) {
    taosArrayClear(pIter->blockList);
    return;
  }

  pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
}

L
Liu Jicong 已提交
422
// Release the block list owned by the iterator.
static void cleanupDataBlockIterator(SDataBlockIter* pIter) {
  SArray* pList = pIter->blockList;
  taosArrayDestroy(pList);
}
H
Haojun Liao 已提交
423

H
Haojun Liao 已提交
424
// Initial reader state: start with the file stage and no table iterator.
static void initReaderStatus(SReaderStatus* pStatus) {
  pStatus->loadFromFile = true;
  pStatus->pTableIter = NULL;
}

429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451
/**
 * Create the output data block with one column per entry of pCond->colList and room for
 * `capacity` rows.
 *
 * @param pCond    query condition providing the column list
 * @param capacity number of rows to reserve
 * @return the new block, or NULL with terrno set when the block cannot be created or its buffers
 *         cannot be reserved
 */
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pResBlock = createDataBlock();
  if (pResBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    SColumnInfoData colInfo = {{0}, 0};
    colInfo.info = pCond->colList[i];
    blockDataAppendColInfo(pResBlock, &colInfo);
  }

  int32_t code = blockDataEnsureCapacity(pResBlock, capacity);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    // columns have already been appended: destroy the whole block, freeing only the shell would
    // leak the column array and any column buffers reserved so far
    blockDataDestroy(pResBlock);
    return NULL;
  }

  return pResBlock;
}

452 453
/**
 * Allocate and set up a reader for the given query condition.
 *
 * Sets up the tsdb instance, scan order, version range, the (keep-clipped) time window, the output
 * block and the helper buffers used for loading file blocks.
 *
 * NOTE(review): the `capacity` argument is not used; the row capacity starts at 4096 and is then
 * limited by limitOutputBufferSize() -- confirm whether the argument should be honoured.
 *
 * @param pVnode   vnode being queried
 * @param pCond    query condition (columns, order, window, suid, type)
 * @param ppReader out: the new reader, NULL on failure
 * @param capacity currently unused, see note above
 * @param idstr    optional id string for logging, copied
 * @return TSDB_CODE_SUCCESS or an error code. On failure the partially built reader is handed to
 *         tsdbReaderClose().
 */
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, STsdbReader** ppReader, int32_t capacity,
                                const char* idstr) {
  int32_t      code = 0;
  int8_t       level = 0;
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
  if (pReader == NULL) {
    // nothing has been built yet, so there is nothing to close
    *ppReader = NULL;
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  if (VND_IS_TSMA(pVnode)) {
    tsdbDebug("vgId:%d, tsma is selected to query", TD_VID(pVnode));
  }

  initReaderStatus(&pReader->status);

  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
  pReader->capacity = 4096;
  pReader->idStr = (idstr != NULL) ? strdup(idstr) : NULL;
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
  pReader->type = pCond->type;
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);

  ASSERT(pCond->numOfCols > 0);

  limitOutputBufferSize(pCond, &pReader->capacity);

  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
  pSup->pColAgg = taosArrayInit(4, sizeof(SColumnDataAgg));
  pSup->plist = taosMemoryCalloc(pCond->numOfCols, POINTER_BYTES);
  if (pSup->pColAgg == NULL || pSup->plist == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }

  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;

  code = tBlockDataCreate(&pReader->status.fileBlockData);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

  pReader->pResBlock = createResBlock(pCond, pReader->capacity);
  if (pReader->pResBlock == NULL) {
    code = terrno;
    goto _end;
  }

  // the column id list and the string build buffers are required by every block load,
  // so a failure here must not be ignored
  code = setColumnIdSlotList(pReader, pReader->pResBlock);
  if (code != TSDB_CODE_SUCCESS) {
    goto _end;
  }

  *ppReader = pReader;
  return code;

_end:
  tsdbReaderClose(pReader);
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
514

H
Haojun Liao 已提交
515
/**
 * Read the block index of the current file set and collect the entries of the queried tables.
 *
 * An entry is appended to pIndexList when its suid matches the reader's suid and its uid is present
 * in the reader's table map. The block list of such a table is created on first use.
 *
 * @param pReader     reader providing suid, table map and statistics
 * @param pFileReader reader of the opened file set
 * @param pIndexList  out: SArray<SBlockIdx> receiving the matching entries
 * @return TSDB_CODE_SUCCESS (also when the file set has no block index entries), the error returned
 *         by tsdbReadBlockIdx(), or TSDB_CODE_OUT_OF_MEMORY
 */
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
  SArray* aBlockIdx = taosArrayInit(8, sizeof(SBlockIdx));
  if (aBlockIdx == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  int64_t st = taosGetTimestampUs();
  int32_t code = tsdbReadBlockIdx(pFileReader, aBlockIdx);
  if (code != TSDB_CODE_SUCCESS) {
    goto _end;
  }

  size_t num = taosArrayGetSize(aBlockIdx);
  if (num == 0) {
    goto _end;  // nothing to collect, code is TSDB_CODE_SUCCESS
  }

  int64_t et1 = taosGetTimestampUs();

  for (size_t i = 0; i < num; ++i) {
    SBlockIdx* pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);

    // uid check
    if (pBlockIdx->suid != pReader->suid) {
      continue;
    }

    // this block belongs to a table that is not queried.
    STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(uint64_t));
    if (pScanInfo == NULL) {
      continue;
    }

    if (pScanInfo->pBlockList == NULL) {
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(int32_t));
      if (pScanInfo->pBlockList == NULL) {
        code = TSDB_CODE_OUT_OF_MEMORY;
        goto _end;
      }
    }

    // a dropped entry would silently hide the file data of this table
    if (taosArrayPush(pIndexList, pBlockIdx) == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _end;
    }
  }

  int64_t et2 = taosGetTimestampUs();
  tsdbDebug("load block index for %d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
            (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0, pReader->idStr);

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

_end:
  taosArrayDestroy(aBlockIdx);
  return code;
}
H
Hongze Cheng 已提交
565

566
// Drop the per-file block information (block map data and block index list) of every table,
// done before the block information of a new file is loaded.
static void cleanupTableScanInfo(SHashObj* pTableMap) {
  STableBlockScanInfo* pScanInfo = NULL;

  while ((pScanInfo = taosHashIterate(pTableMap, pScanInfo)) != NULL) {
    // reset the index in last block when handing a new file
    tMapDataClear(&pScanInfo->mapData);
    taosArrayClear(pScanInfo->pBlockList);
  }
}

580
/**
 * Load the block information of every table in pIndexList from the current file set and record,
 * per table, the indexes of the blocks that overlap the query time window and version range.
 *
 * @param pReader    reader providing the file reader, table map, window and version range
 * @param pIndexList SArray<SBlockIdx> of the tables to load (all present in the table map)
 * @param pBlockNum  in/out: numOfBlocks is increased by the number of qualifying blocks,
 *                   numOfLastFiles is set to the number of last (stt) files of the file set
 * @return TSDB_CODE_SUCCESS, the error returned by tsdbReadBlock(), or TSDB_CODE_OUT_OF_MEMORY
 */
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, SBlockNumber* pBlockNum) {
  int32_t numOfQTable = 0;
  size_t  sizeInDisk = 0;
  size_t  numOfTables = taosArrayGetSize(pIndexList);

  int64_t st = taosGetTimestampUs();
  cleanupTableScanInfo(pReader->status.pTableMap);

  for (size_t i = 0; i < numOfTables; ++i) {
    SBlockIdx* pBlockIdx = taosArrayGet(pIndexList, i);

    STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(int64_t));

    tMapDataReset(&pScanInfo->mapData);
    // without the block map the loop below would work on stale or empty data
    int32_t code = tsdbReadBlock(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    sizeInDisk += pScanInfo->mapData.nData;
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      SDataBlk block = {0};
      tMapDataGetItemByIdx(&pScanInfo->mapData, j, &block, tGetDataBlk);

      // 1. time range check
      if (block.minKey.ts > pReader->window.ekey || block.maxKey.ts < pReader->window.skey) {
        continue;
      }

      // 2. version range check
      if (block.minVer > pReader->verRange.maxVer || block.maxVer < pReader->verRange.minVer) {
        continue;
      }

      void* p = taosArrayPush(pScanInfo->pBlockList, &j);
      if (p == NULL) {
        tMapDataClear(&pScanInfo->mapData);
        return TSDB_CODE_OUT_OF_MEMORY;
      }

      pBlockNum->numOfBlocks += 1;
    }

    if (pScanInfo->pBlockList != NULL && taosArrayGetSize(pScanInfo->pBlockList) > 0) {
      numOfQTable += 1;
    }
  }

  pBlockNum->numOfLastFiles = pReader->pFileReader->pSet->nSttF;
  int32_t total = pBlockNum->numOfLastFiles + pBlockNum->numOfBlocks;

  double el = (taosGetTimestampUs() - st) / 1000.0;
  // numOfTables is a size_t and must be narrowed explicitly to match the %d conversion
  tsdbDebug(
      "load block of %d tables completed, blocks:%d in %d tables, last-files:%d, block-info-size:%.2f Kb, elapsed "
      "time:%.2f ms %s",
      (int32_t)numOfTables, pBlockNum->numOfBlocks, numOfQTable, pBlockNum->numOfLastFiles, sizeInDisk / 1000.0, el,
      pReader->idStr);

  pReader->cost.numOfBlocks += total;
  pReader->cost.headFileLoadTime += el;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
640

641
// Mark the current file block as completely dumped and move lastKey one tick past maxKey in scan
// direction (maxKey + 1 for ascending, maxKey - 1 for descending).
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, int64_t maxKey, int32_t order) {
  pDumpInfo->allDumped = true;

  if (ASCENDING_TRAVERSE(order)) {
    pDumpInfo->lastKey = maxKey + 1;
  } else {
    pDumpInfo->lastKey = maxKey - 1;
  }
}

647 648
/**
 * Write one column value into row `rowIndex` of the output column.
 *
 * Fixed-length values are appended directly (NULL/NONE values are appended as null). Variable-length
 * values are first assembled as length header + payload in pSup->buildBuf[colIndex] and appended
 * from there.
 */
static void doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
                         SBlockLoadSuppInfo* pSup) {
  // fixed-length types
  if (!IS_VAR_DATA_TYPE(pColVal->type)) {
    colDataAppend(pColInfoData, rowIndex, (const char*)&pColVal->value, pColVal->isNull || pColVal->isNone);
    return;
  }

  // variable-length types without a value
  if (pColVal->isNull || pColVal->isNone) {
    colDataAppendNULL(pColInfoData, rowIndex);
    return;
  }

  // variable-length types: build header + payload in the scratch buffer of this column
  char* pBuf = pSup->buildBuf[colIndex];
  varDataSetLen(pBuf, pColVal->value.nData);
  ASSERT(pColVal->value.nData <= pColInfoData->info.bytes);
  memcpy(varDataVal(pBuf), pColVal->value.pData, pColVal->value.nData);
  colDataAppend(pColInfoData, rowIndex, pBuf, false);
}

663
// Return the block entry at the iterator's current index, or NULL when the block list is empty.
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
  if (taosArrayGetSize(pBlockIter->blockList) != 0) {
    return taosArrayGet(pBlockIter->blockList, pBlockIter->index);
  }

  // an empty list must agree with the cached block count
  ASSERT(pBlockIter->numOfBlocks == taosArrayGetSize(pBlockIter->blockList));
  return NULL;
}

H
Hongze Cheng 已提交
673
// Return the SDataBlk stored in the iterator for the current block.
static SDataBlk* getCurrentBlock(SDataBlockIter* pBlockIter) {
  SDataBlk* pCurrent = &pBlockIter->block;
  return pCurrent;
}
674

675
/**
 * Copy rows of the loaded file block into the result block, column by column, without merging.
 *
 * Starting at fBlockDumpInfo.rowIndex and walking in scan direction, at most pReader->capacity rows
 * are copied. Result columns that do not exist in the file block are filled with NULLs. Both the
 * result columns and the file block columns are walked in ascending column-id order.
 *
 * The number of rows to copy is counted explicitly. The previous loop bound (`j < endIndex`) was
 * false from the start for a descending scan limited by the capacity, so no row was copied in that
 * case while the block was still reported as holding `capacity` rows.
 *
 * NOTE(review): the block is always flagged as completely dumped afterwards, also when only
 * `capacity` rows were copied -- confirm that callers expect this.
 *
 * @param pReader        reader holding the loaded file block and the result block
 * @param pBlockScanInfo scan info of the table the block belongs to (not used here)
 * @return TSDB_CODE_SUCCESS
 */
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pBlockIter = &pStatus->blockIter;

  SBlockData*         pBlockData = &pStatus->fileBlockData;
  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
  SDataBlk*           pBlock = getCurrentBlock(pBlockIter);
  SSDataBlock*        pResBlock = pReader->pResBlock;
  int32_t             numOfOutputCols = blockDataGetNumOfCols(pResBlock);

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  SColVal cv = {0};
  int64_t st = taosGetTimestampUs();
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;

  // rows left in the block in scan direction, limited by the capacity of the result block
  int32_t remain = asc ? (pBlockData->nRow - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex + 1);
  if (remain > pReader->capacity) {
    remain = pReader->capacity;
  }

  int32_t rowIndex = 0;

  // the primary timestamp column, if requested, is always the first output column
  int32_t          i = 0;
  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    for (int32_t j = pDumpInfo->rowIndex; rowIndex < remain && j >= 0 && j < pBlockData->nRow; j += step) {
      colDataAppend(pColData, rowIndex++, (const char*)&pBlockData->aTSKEY[j], false);
    }
    i += 1;
  }

  int32_t colIndex = 0;
  int32_t num = taosArrayGetSize(pBlockData->aIdx);
  while (i < numOfOutputCols && colIndex < num) {
    rowIndex = 0;
    pColData = taosArrayGet(pResBlock->pDataBlock, i);

    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);
    if (pData->cid < pColData->info.colId) {
      // file column that is not requested
      colIndex += 1;
    } else if (pData->cid == pColData->info.colId) {
      for (int32_t j = pDumpInfo->rowIndex; rowIndex < remain && j >= 0 && j < pBlockData->nRow; j += step) {
        tColDataGetValue(pData, j, &cv);
        doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
      }
      colIndex += 1;
      i += 1;
      ASSERT(rowIndex == remain);
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNNULL(pColData, 0, remain);
      i += 1;
    }
  }

  // remaining output columns have no counterpart in the file block
  while (i < numOfOutputCols) {
    pColData = taosArrayGet(pResBlock->pDataBlock, i);
    colDataAppendNNULL(pColData, 0, remain);
    i += 1;
  }

  pResBlock->info.rows = remain;
  pDumpInfo->rowIndex += step * remain;

  setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.blockLoadTime += elapsedTime;

  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
  tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, remain, unDumpedRows,
            pBlock->minVer, pBlock->maxVer, elapsedTime, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}

759
/**
 * Read the file block the iterator currently points at into pBlockData.
 *
 * On success the elapsed time is added to pReader->cost.blockLoadTime and the dump state is reset
 * (allDumped = false). On failure the error is logged and the code from tsdbReadDataBlock() is
 * returned unchanged.
 */
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter, SBlockData* pBlockData) {
  const int64_t startUs = taosGetTimestampUs();

  SFileDataBlockInfo* pInfo = getCurrentBlockInfo(pBlockIter);
  ASSERT(pInfo != NULL);

  SDataBlk*     pBlk = getCurrentBlock(pBlockIter);
  const int32_t rc = tsdbReadDataBlock(pReader->pFileReader, pBlk, pBlockData);
  if (rc == TSDB_CODE_SUCCESS) {
    const double costMs = (taosGetTimestampUs() - startUs) / 1000.0;

    tsdbDebug("%p load file block into buffer, global index:%d, index in table block list:%d, brange:%" PRId64
              "-%" PRId64 ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
              pReader, pBlockIter->index, pInfo->tbBlockIdx, pBlk->minKey.ts, pBlk->maxKey.ts, pBlk->nRow,
              pBlk->minVer, pBlk->maxVer, costMs, pReader->idStr);

    pReader->cost.blockLoadTime += costMs;
    pReader->status.fBlockDumpInfo.allDumped = false;
    return TSDB_CODE_SUCCESS;
  }

  // the read failed: report it and hand the original error code back to the caller
  tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, code:%s %s",
            pReader, pBlockIter->index, pInfo->tbBlockIdx, pBlk->minKey.ts, pBlk->maxKey.ts, pBlk->nRow,
            tstrerror(rc), pReader->idStr);
  return rc;
}
H
Hongze Cheng 已提交
788

H
Haojun Liao 已提交
789 790 791
/**
 * Release every buffer owned by the block-order supporter.
 *
 * Safe to call on a partially initialised supporter and safe to call more than once: the
 * per-table buffers are only walked while the pointer array itself is still allocated, and each
 * freed slot is reset to NULL.
 */
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);

  // pDataBlockInfo is NULL when its allocation failed or when cleanup already ran
  if (pSup->pDataBlockInfo != NULL) {
    for (int32_t i = 0; i < pSup->numOfTables; ++i) {
      taosMemoryFreeClear(pSup->pDataBlockInfo[i]);
    }
  }

  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
800

H
Haojun Liao 已提交
801 802
/**
 * Allocate the zero-initialised per-table arrays of the block-order supporter.
 *
 * @param pSup         supporter to initialise
 * @param numOfTables  number of slots to reserve in each array, must be >= 1
 * @return TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY after releasing whatever was allocated
 */
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  ASSERT(numOfTables >= 1);

  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);

  if (pSup->numOfBlocksPerTable != NULL && pSup->indexPerTable != NULL && pSup->pDataBlockInfo != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  // at least one allocation failed: drop the ones that succeeded
  cleanupBlockOrderSupporter(pSup);
  return TSDB_CODE_OUT_OF_MEMORY;
}
H
Hongze Cheng 已提交
815

H
Haojun Liao 已提交
816
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
817
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
818
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
819

H
Haojun Liao 已提交
820
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
821

H
Haojun Liao 已提交
822 823
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
824

H
Haojun Liao 已提交
825 826 827 828 829 830 831
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
832

833
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
834
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
835

836 837 838 839
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

/**
 * Decode the SDataBlk that the iterator's current position refers to into pBlockIter->block.
 * Does nothing when the iterator has no current block info. Always returns TSDB_CODE_SUCCESS.
 */
static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter) {
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  if (pBlockInfo == NULL) {
    return TSDB_CODE_SUCCESS;
  }

  // table scan info -> index into the table's block map -> decoded block
  STableBlockScanInfo* pScanInfo = taosHashGet(pBlockIter->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
  int32_t*             mapDataIndex = taosArrayGet(pScanInfo->pBlockList, pBlockInfo->tbBlockIdx);
  tMapDataGetItemByIdx(&pScanInfo->mapData, *mapDataIndex, &pBlockIter->block, tGetDataBlk);

#if 0
  qDebug("check file block, table uid:%"PRIu64" index:%d offset:%"PRId64", ", pScanInfo->uid, *mapDataIndex, pBlockIter->block.aSubBlock[0].offset);
#endif

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
853

854
/**
 * Build the access order of all file blocks of the current file and position the iterator.
 *
 * For every table in pReader->status.pTableMap that has a non-empty block list, the file offset
 * of each of its blocks is collected. With a single qualifying table the blocks are appended in
 * list order; otherwise a multiway merge tree driven by fileDataBlockOrderCompar() emits the
 * blocks of all tables ordered by file offset. Finally the iterator is placed on the first block
 * (ascending scan) or the last block (descending scan).
 *
 * @param pReader      reader owning the table map and the scan order
 * @param pBlockIter   iterator to (re)fill; its blockList is cleared first
 * @param numOfBlocks  total number of blocks expected over all tables (asserted)
 * @return TSDB_CODE_SUCCESS, the code of initBlockOrderSupporter(), or
 *         TSDB_CODE_TDB_OUT_OF_MEMORY when an allocation fails
 */
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks) {
  bool asc = ASCENDING_TRAVERSE(pReader->order);

  pBlockIter->numOfBlocks = numOfBlocks;
  taosArrayClear(pBlockIter->blockList);
  pBlockIter->pTableMap = pReader->status.pTableMap;

  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = (int32_t)taosHashGetSize(pReader->status.pTableMap);

  int64_t st = taosGetTimestampUs();

  SBlockOrderSupporter sup = {0};
  int32_t              code = initBlockOrderSupporter(&sup, numOfTables);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  int32_t cnt = 0;
  void*   ptr = NULL;
  while (1) {
    ptr = taosHashIterate(pReader->status.pTableMap, ptr);
    if (ptr == NULL) {
      break;
    }

    STableBlockScanInfo* pTableScanInfo = (STableBlockScanInfo*)ptr;
    if (pTableScanInfo->pBlockList == NULL || taosArrayGetSize(pTableScanInfo->pBlockList) == 0) {
      continue;
    }

    size_t num = taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = (int32_t)num;

    SBlockOrderWrapper* pWrappers = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * num);
    if (pWrappers == NULL) {
      // tables [0, sup.numOfTables) already own a buffer; cleanup releases exactly those
      cleanupBlockOrderSupporter(&sup);
      return TSDB_CODE_TDB_OUT_OF_MEMORY;
    }

    sup.pDataBlockInfo[sup.numOfTables] = pWrappers;

    SDataBlk block = {0};
    for (size_t k = 0; k < num; ++k) {
      SBlockOrderWrapper wrapper = {0};

      int32_t* mapDataIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
      tMapDataGetItemByIdx(&pTableScanInfo->mapData, *mapDataIndex, &block, tGetDataBlk);

      wrapper.uid = pTableScanInfo->uid;
      wrapper.offset = block.aSubBlock[0].offset;

      pWrappers[k] = wrapper;
      cnt++;
    }

    sup.numOfTables += 1;
  }

  ASSERT(numOfBlocks == cnt);

  // since there is only one table qualified, blocks are not sorted
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      if (taosArrayPush(pBlockIter->blockList, &blockInfo) == NULL) {
        cleanupBlockOrderSupporter(&sup);
        return TSDB_CODE_TDB_OUT_OF_MEMORY;
      }
    }

    int64_t et = taosGetTimestampUs();
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
              pReader, numOfBlocks, (et - st) / 1000.0, pReader->idStr);

    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
    cleanupBlockOrderSupporter(&sup);
    doSetCurrentBlock(pBlockIter);
    return TSDB_CODE_SUCCESS;
  }

  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);

  ASSERT(cnt <= numOfBlocks && sup.numOfTables <= numOfTables);

  SMultiwayMergeTreeInfo* pTree = NULL;
  // keep the full-width status: storing it in a narrower unsigned type could truncate an error code
  int32_t ret = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
  if (ret != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_TDB_OUT_OF_MEMORY;
  }

  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;

    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    if (taosArrayPush(pBlockIter->blockList, &blockInfo) == NULL) {
      cleanupBlockOrderSupporter(&sup);
      taosMemoryFree(pTree);
      return TSDB_CODE_TDB_OUT_OF_MEMORY;
    }

    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }

    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
  }

  int64_t et = taosGetTimestampUs();
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, numOfBlocks,
            (et - st) / 1000.0, pReader->idStr);
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);

  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
  doSetCurrentBlock(pBlockIter);

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
971

H
Haojun Liao 已提交
972
/**
 * Advance the iterator by one block in scan direction and decode that block.
 *
 * @return false when the iterator already sits on the last block of its direction (nothing is
 *         changed in that case), true otherwise
 */
static bool blockIteratorNext(SDataBlockIter* pBlockIter) {
  if (ASCENDING_TRAVERSE(pBlockIter->order)) {
    if (pBlockIter->index >= pBlockIter->numOfBlocks - 1) {
      return false;
    }

    pBlockIter->index += 1;
  } else {
    if (pBlockIter->index <= 0) {
      return false;
    }

    pBlockIter->index -= 1;
  }

  doSetCurrentBlock(pBlockIter);
  return true;
}

986 987 988
/**
 * This is an two rectangles overlap cases.
 */
H
Hongze Cheng 已提交
989
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SDataBlk* pBlock) {
990 991
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
H
Hongze Cheng 已提交
992 993
         (pVerRange->minVer > pBlock->minVer && pVerRange->minVer <= pBlock->maxVer) ||
         (pVerRange->maxVer < pBlock->maxVer && pVerRange->maxVer >= pBlock->minVer);
H
Haojun Liao 已提交
994
}
H
Hongze Cheng 已提交
995

H
Hongze Cheng 已提交
996 997
/**
 * Fetch the block that follows pFBlockInfo in scan direction within the same table.
 *
 * @param pFBlockInfo          current block (its tbBlockIdx indexes the table's block list)
 * @param pTableBlockScanInfo  scan info of the table owning the block
 * @param nextIndex            out: index of the neighbor in the table's block list; written only
 *                             when a block is returned
 * @param order                scan order
 * @return a newly allocated SDataBlk that the caller must release with taosMemoryFree(), or NULL
 *         when there is no neighbor in that direction. NULL is also returned when the allocation
 *         fails, so that case is indistinguishable from "no neighbor" for the caller.
 */
static SDataBlk* getNeighborBlockOfSameTable(SFileDataBlockInfo* pFBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                             int32_t* nextIndex, int32_t order) {
  int32_t numOfBlocks = (int32_t)taosArrayGetSize(pTableBlockScanInfo->pBlockList);
  int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;

  // range-check the neighbor index itself; avoids the unsigned "size - 1" wrap-around on an empty list
  int32_t neighborIndex = pFBlockInfo->tbBlockIdx + step;
  if (neighborIndex < 0 || neighborIndex >= numOfBlocks) {
    return NULL;
  }

  SDataBlk* pBlock = taosMemoryCalloc(1, sizeof(SDataBlk));
  if (pBlock == NULL) {
    return NULL;
  }

  *nextIndex = neighborIndex;
  int32_t* indexInMapdata = taosArrayGet(pTableBlockScanInfo->pBlockList, neighborIndex);

  tMapDataGetItemByIdx(&pTableBlockScanInfo->mapData, *indexInMapdata, pBlock, tGetDataBlk);
  return pBlock;
}

/**
 * Starting at the iterator's current position and walking in scan direction, locate the entry
 * of blockList with the same uid and tbBlockIdx as pFBlockInfo.
 *
 * @return the position in blockList; asserts (and returns -1) when no entry matches
 */
static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
  ASSERT(pBlockIter != NULL && pFBlockInfo != NULL);

  const int32_t stride = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;

  for (int32_t pos = pBlockIter->index; pos < pBlockIter->numOfBlocks && pos >= 0; pos += stride) {
    SFileDataBlockInfo* pCandidate = taosArrayGet(pBlockIter->blockList, pos);
    if (pCandidate->uid != pFBlockInfo->uid || pCandidate->tbBlockIdx != pFBlockInfo->tbBlockIdx) {
      continue;
    }

    return pos;
  }

  ASSERT(0);
  return -1;
}

1036
/**
 * Make the block stored at position `index` of blockList the iterator's next current block.
 *
 * The iterator position is advanced by `step`; if the requested block is not already at that
 * position it is removed from its old slot and re-inserted there. The current block is then
 * decoded again.
 *
 * @return -1 when `index` is outside [0, numOfBlocks), TSDB_CODE_SUCCESS otherwise
 */
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
  const bool outOfRange = (index < 0) || (index >= pBlockIter->numOfBlocks);
  if (outOfRange) {
    return -1;
  }

  // take a copy first: the element is about to be removed from the array
  SFileDataBlockInfo moved = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);

  pBlockIter->index += step;
  const int32_t target = pBlockIter->index;

  if (target != index) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, target, &moved);

    SFileDataBlockInfo* pPlaced = taosArrayGet(pBlockIter->blockList, target);
    ASSERT(pPlaced->uid == moved.uid && pPlaced->tbBlockIdx == moved.tbBlockIdx);
  }

  doSetCurrentBlock(pBlockIter);
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
1056
/**
 * Check whether a block shares its boundary timestamp with the neighbor that follows it in scan
 * direction: block.maxKey == neighbor.minKey for ascending scans, block.minKey == neighbor.maxKey
 * for descending scans.
 */
static bool overlapWithNeighborBlock(SDataBlk* pBlock, SDataBlk* pNeighbor, int32_t order) {
  const bool asc = ASCENDING_TRAVERSE(order);

  return asc ? (pBlock->maxKey.ts == pNeighbor->minKey.ts) : (pBlock->minKey.ts == pNeighbor->maxKey.ts);
}
H
Hongze Cheng 已提交
1064

H
Hongze Cheng 已提交
1065
/**
 * Check whether a valid buffered key lies at or before the start of the file block in scan
 * direction: key <= block.minKey for ascending scans, key >= block.maxKey for descending scans.
 * A key equal to TSKEY_INITIAL_VAL never qualifies.
 */
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SDataBlk* pBlock) {
  if (key.ts == TSKEY_INITIAL_VAL) {
    return false;
  }

  if (ASCENDING_TRAVERSE(order)) {
    return key.ts <= pBlock->minKey.ts;
  }

  return key.ts >= pBlock->maxKey.ts;
}
H
Hongze Cheng 已提交
1071

H
Hongze Cheng 已提交
1072
/**
 * Check whether the key's timestamp lies inside the block's [minKey, maxKey] range while the
 * block's version range intersects the queried version range.
 */
static bool keyOverlapFileBlock(TSDBKEY key, SDataBlk* pBlock, SVersionRange* pVerRange) {
  const bool tsInside = (key.ts >= pBlock->minKey.ts) && (key.ts <= pBlock->maxKey.ts);
  const bool verIntersects = (pBlock->maxVer >= pVerRange->minVer) && (pBlock->minVer <= pVerRange->maxVer);

  return tsInside && verIntersects;
}

H
Hongze Cheng 已提交
1077
/**
 * Scan the delete skyline forward from `startIndex` and report whether a delete marker affects
 * the given block.
 *
 * Each skyline entry is a TSDBKEY {ts, version}. The scan stops with "no overlap" at the first
 * entry whose ts is beyond the block's maxKey, so the entries are assumed to be ordered by
 * ascending ts (NOTE(review): confirm against the skyline builder). A negative start index
 * results in no scan at all.
 */
static bool doCheckforDatablockOverlapFrom(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock,
                                           int32_t startIndex) {
  int32_t num = (int32_t)taosArrayGetSize(pBlockScanInfo->delSkyline);

  for (int32_t i = startIndex; i >= 0 && i < num; i += 1) {
    TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, i);
    if (p->ts >= pBlock->minKey.ts && p->ts <= pBlock->maxKey.ts) {
      // marker inside the block's time range
      if (p->version >= pBlock->minVer) {
        return true;
      }
    } else if (p->ts < pBlock->minKey.ts) {
      // marker before the block: it matters only if the segment it opens reaches into the block
      if (p->version >= pBlock->minVer) {
        if (i < num - 1) {
          TSDBKEY* pnext = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
          if (i + 1 == num - 1) {  // pnext is the last point
            if (pnext->ts >= pBlock->minKey.ts) {
              return true;
            }
          } else {
            if (pnext->ts >= pBlock->minKey.ts && pnext->version >= pBlock->minVer) {
              return true;
            }
          }
        } else {  // it must be the last point
          ASSERT(p->version == 0);
        }
      }
    } else {  // p->ts > pBlock->maxKey.ts
      return false;
    }
  }

  return false;
}

/**
 * Same check as doCheckforDatablockOverlapFrom(), starting at the table's current file delete
 * index (pBlockScanInfo->fileDelIndex).
 */
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock) {
  return doCheckforDatablockOverlapFrom(pBlockScanInfo, pBlock, pBlockScanInfo->fileDelIndex);
}

/**
 * Decide whether any delete marker of the table overlaps the given file block.
 *
 * Returns false when the table has no (or an empty) delete skyline or when the block's time
 * range lies completely outside the skyline's time range. For descending scans the skyline is
 * first walked backwards from fileDelIndex to the last entry that is not after the block's
 * minKey, and the forward check starts from that entry.
 */
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SDataBlk* pBlock, int32_t order) {
  if (pBlockScanInfo->delSkyline == NULL || taosArrayGetSize(pBlockScanInfo->delSkyline) == 0) {
    return false;
  }

  // ts is not overlap
  TSDBKEY* pFirst = taosArrayGet(pBlockScanInfo->delSkyline, 0);
  TSDBKEY* pLast = taosArrayGetLast(pBlockScanInfo->delSkyline);
  if (pBlock->minKey.ts > pLast->ts || pBlock->maxKey.ts < pFirst->ts) {
    return false;
  }

  // version is not overlap
  if (ASCENDING_TRAVERSE(order)) {
    return doCheckforDatablockOverlapFrom(pBlockScanInfo, pBlock, pBlockScanInfo->fileDelIndex);
  }

  int32_t index = pBlockScanInfo->fileDelIndex;
  while (1) {
    TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, index);
    if (p->ts > pBlock->minKey.ts && index > 0) {
      index -= 1;
    } else {  // find the first point that is smaller than the minKey.ts of dataBlock.
      break;
    }
  }

  // start the forward scan at the rewound position; starting at fileDelIndex would skip the
  // markers located above
  return doCheckforDatablockOverlapFrom(pBlockScanInfo, pBlock, index);
}

1141 1142 1143 1144
// 1. the version of all rows should be less than the endVersion
// 2. current block should not overlap with next neighbor block
// 3. current timestamp should not be overlap with each other
// 4. output buffer should be large enough to hold all rows in current block
1145
// 5. delete info should not overlap with current block data
H
Hongze Cheng 已提交
1146
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pFBlock, SDataBlk* pBlock,
1147
                                STableBlockScanInfo* pScanInfo, TSDBKEY key, SLastBlockReader* pLastBlockReader) {
H
Hongze Cheng 已提交
1148 1149
  int32_t   neighborIndex = 0;
  SDataBlk* pNeighbor = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &neighborIndex, pReader->order);
1150

1151
  // overlap with neighbor
1152 1153 1154
  bool overlapWithNeighbor = false;
  if (pNeighbor) {
    overlapWithNeighbor = overlapWithNeighborBlock(pBlock, pNeighbor, pReader->order);
1155
    taosMemoryFree(pNeighbor);
1156 1157
  }

1158
  // has duplicated ts of different version in this block
L
Liu Jicong 已提交
1159 1160
  bool hasDup = (pBlock->nSubBlock == 1) ? pBlock->hasDup : true;
  bool overlapWithDel = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);
1161

1162
  // todo here we need to each key in the last files to identify if it is really overlapped with last block
1163
  // todo
1164
  bool overlapWithlastBlock = false;
1165
#if 0
H
Hongze Cheng 已提交
1166
  if (taosArrayGetSize(pLastBlockReader->pSstBlk) > 0 && (pLastBlockReader->currentBlockIndex != -1)) {
H
Hongze Cheng 已提交
1167
    SSttBlk* pSstBlk = taosArrayGet(pLastBlockReader->pSstBlk, pLastBlockReader->currentBlockIndex);
H
Hongze Cheng 已提交
1168
    overlapWithlastBlock = !(pBlock->maxKey.ts < pSstBlk->minKey || pBlock->minKey.ts > pSstBlk->maxKey);
1169
  }
1170
#endif
1171

1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
  bool moreThanOutputCapacity = pBlock->nRow > pReader->capacity;
  bool partiallyRequired = dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock);
  bool overlapWithKey = keyOverlapFileBlock(key, pBlock, &pReader->verRange);

  bool loadDataBlock = (overlapWithNeighbor || hasDup || partiallyRequired || overlapWithKey ||
                        moreThanOutputCapacity || overlapWithDel || overlapWithlastBlock);

  // log the reason why load the datablock for profile
  if (loadDataBlock) {
    tsdbDebug("%p uid:%" PRIu64
1182
              " need to load the datablock, overlapwithneighborblock:%d, hasDup:%d, partiallyRequired:%d, "
1183 1184 1185 1186 1187 1188
              "overlapWithKey:%d, greaterThanBuf:%d, overlapWithDel:%d, overlapWithlastBlock:%d, %s",
              pReader, pFBlock->uid, overlapWithNeighbor, hasDup, partiallyRequired, overlapWithKey,
              moreThanOutputCapacity, overlapWithDel, overlapWithlastBlock, pReader->idStr);
  }

  return loadDataBlock;
H
Haojun Liao 已提交
1189 1190
}

1191
/**
 * Fill pReader->pResBlock from the in-memory iterators of one table.
 *
 * Returns TSDB_CODE_SUCCESS immediately when neither iterator has a value. Otherwise the block
 * is built by buildDataBlockFromBufImpl(), its timestamp window and uid are updated, the block is
 * flagged as composed, the elapsed time is added to pReader->cost.buildmemBlock, and the code of
 * buildDataBlockFromBufImpl() is returned.
 */
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
  const bool memHasVal = pBlockScanInfo->iter.hasVal;
  const bool imemHasVal = pBlockScanInfo->iiter.hasVal;
  if (!imemHasVal && !memHasVal) {
    return TSDB_CODE_SUCCESS;
  }

  SSDataBlock*  pRes = pReader->pResBlock;
  const int64_t beginUs = taosGetTimestampUs();

  const int32_t rc = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->capacity, pReader);

  blockDataUpdateTsWindow(pRes, 0);
  pRes->info.uid = pBlockScanInfo->uid;
  setComposedBlockFlag(pReader, true);

  const double costMs = (taosGetTimestampUs() - beginUs) / 1000.0;
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%d, brange:%" PRId64
            " - %" PRId64 " %s",
            pReader, costMs, pRes->info.rows, pRes->info.window.skey, pRes->info.window.ekey, pReader->idStr);

  pReader->cost.buildmemBlock += costMs;
  return rc;
}

1216 1217
/**
 * Fast path: copy the current file-block row straight into the result block when no merge is
 * needed.
 *
 * The copy happens only if the current row is not the last row in scan direction and the row
 * that follows it carries a timestamp different from `key`. On success the row index is advanced
 * by one step and true is returned; otherwise nothing is changed and false is returned.
 */
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, int64_t key,
                                            SFileBlockDumpInfo* pDumpInfo) {
  const bool asc = (pReader->order == TSDB_ORDER_ASC);
  const bool desc = (pReader->order == TSDB_ORDER_DESC);

  // a border row has no successor to compare with
  const bool hasSuccessor = (asc && pDumpInfo->rowIndex < pDumpInfo->totalRows - 1) || (desc && pDumpInfo->rowIndex > 0);
  if (!hasSuccessor) {
    return false;
  }

  const int32_t step = asc ? 1 : -1;
  const int64_t succKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
  if (succKey == key) {
    return false;  // duplicated timestamp, the rows have to be merged
  }

  doAppendRowFromFileBlock(pReader->pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
  pDumpInfo->rowIndex += step;
  return true;
}

H
Haojun Liao 已提交
1236 1237 1238 1239 1240 1241
/**
 * Return the table schema matching the schema version of an in-memory row.
 *
 * pReader->pSchema caches the schema fetched with version -1 (loaded on first use) and
 * pReader->pMemSchema caches one additional version. When neither matches `sversion`, the cached
 * pMemSchema is released and the requested version is fetched.
 *
 * @return the matching schema, or NULL when it could not be fetched
 */
static FORCE_INLINE STSchema* doGetSchemaForTSRow(int32_t sversion, STsdbReader* pReader, uint64_t uid) {
  // always set the newest schema version in pReader->pSchema
  if (pReader->pSchema == NULL) {
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, uid, -1);
  }

  if (pReader->pSchema && sversion == pReader->pSchema->version) {
    return pReader->pSchema;
  }

  if (pReader->pMemSchema != NULL) {
    if (pReader->pMemSchema->version == sversion) {
      return pReader->pMemSchema;
    }

    // reset the pointer right away so that a failed fetch below cannot leave it dangling
    taosMemoryFree(pReader->pMemSchema);
    pReader->pMemSchema = NULL;
  }

  int32_t code = metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pMemSchema);
  if (code != TSDB_CODE_SUCCESS) {
    return NULL;
  }

  return pReader->pMemSchema;
}

1261
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
1262 1263 1264 1265 1266 1267
                                     SIterInfo* pIter, int64_t key, SLastBlockReader* pLastBlockReader) {
  SRowMerger          merge = {0};
  STSRow*             pTSRow = NULL;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

1268
  int64_t tsLast = INT64_MIN;
1269
  if (hasDataInLastBlock(pLastBlockReader)) {
1270 1271
    tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
  }
1272

H
Hongze Cheng 已提交
1273 1274
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
1275

1276 1277
  int64_t minKey = 0;
  if (pReader->order == TSDB_ORDER_ASC) {
H
Hongze Cheng 已提交
1278
    minKey = INT64_MAX;  // chosen the minimum value
1279
    if (minKey > tsLast && hasDataInLastBlock(pLastBlockReader)) {
1280 1281
      minKey = tsLast;
    }
1282

1283 1284 1285
    if (minKey > k.ts) {
      minKey = k.ts;
    }
1286

1287 1288 1289 1290 1291
    if (minKey > key && pBlockData->nRow > 0) {
      minKey = key;
    }
  } else {
    minKey = INT64_MIN;
1292
    if (minKey < tsLast && hasDataInLastBlock(pLastBlockReader)) {
1293 1294 1295 1296 1297 1298 1299 1300 1301 1302
      minKey = tsLast;
    }

    if (minKey < k.ts) {
      minKey = k.ts;
    }

    if (minKey < key && pBlockData->nRow > 0) {
      minKey = key;
    }
1303 1304 1305 1306
  }

  bool init = false;

1307
  // ASC: file block ---> last block -----> imem -----> mem
H
Hongze Cheng 已提交
1308
  // DESC: mem -----> imem -----> last block -----> file block
1309 1310
  if (pReader->order == TSDB_ORDER_ASC) {
    if (minKey == key) {
1311
      init = true;
1312 1313
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1314 1315
    }

1316
    if (minKey == tsLast) {
1317
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1318 1319 1320
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1321 1322 1323
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
H
Haojun Liao 已提交
1324
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
1325
    }
1326

1327
    if (minKey == k.ts) {
H
Haojun Liao 已提交
1328 1329 1330
      if (init) {
        tRowMerge(&merge, pRow);
      } else {
1331 1332 1333 1334 1335 1336 1337 1338 1339
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, pRow, pSchema);
      }
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }
  } else {
    if (minKey == k.ts) {
      init = true;
1340 1341
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      tRowMergerInit(&merge, pRow, pSchema);
1342
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1343 1344
    }

1345
    if (minKey == tsLast) {
1346
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
H
Haojun Liao 已提交
1347 1348 1349
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
1350 1351 1352
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
H
Haojun Liao 已提交
1353
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
1354 1355 1356
    }

    if (minKey == key) {
H
Haojun Liao 已提交
1357 1358 1359
      if (init) {
        tRowMerge(&merge, &fRow);
      } else {
1360 1361 1362 1363 1364
        init = true;
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
      }
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
    }
1365 1366
  }

1367 1368 1369 1370 1371
  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

1372 1373 1374 1375 1376 1377 1378
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return TSDB_CODE_SUCCESS;
}

1379 1380 1381
/**
 * Build one output row from the current row of the last block, merging in all last-block rows
 * with the same timestamp and, optionally, the file-block rows with that timestamp.
 *
 * @param pBlockData      loaded file block; only dereferenced when mergeBlockData is true
 * @param mergeBlockData  whether the file block takes part in the merge; its rows are merged
 *                        only when its current timestamp equals the last-block timestamp
 * @return TSDB_CODE_SUCCESS, or the error of tRowMergerGetRow()
 */
static int32_t doMergeFileBlockAndLastBlock(SLastBlockReader* pLastBlockReader, STsdbReader* pReader,
                                            STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                            bool mergeBlockData) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  int64_t             tsLastBlock = getCurrentKeyInLastBlock(pLastBlockReader);

  STSRow*    pTSRow = NULL;
  SRowMerger merge = {0};

  TSDBROW fRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
  tRowMergerInit(&merge, &fRow, pReader->pSchema);
  doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLastBlock, &merge);

  // merge with block data if ts == key
  if (mergeBlockData && (tsLastBlock == pBlockData->aTSKEY[pDumpInfo->rowIndex])) {
    doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
  }

  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    // release the merger state on the error path as well
    tRowMergerClear(&merge);
    return code;
  }

  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return TSDB_CODE_SUCCESS;
}

1410 1411
/**
 * Produce one output row from the file block and/or the last block when only file data takes
 * part in the merge.
 *
 * - file block has rows, last block has none: rows come from the file block only
 * - both have rows, ascending scan: the file block wins while key < ts; for key == ts both
 *   sources are merged, file block first
 * - both have rows, descending scan: delegated to doMergeFileBlockAndLastBlock() with the file
 *   block included
 * - file block is empty: rows come from the last block only
 *
 * @param key  timestamp of the current file-block row; asserted to be <= the current last-block
 *             timestamp when both sources have data
 * @return TSDB_CODE_SUCCESS or the error of the merge step that failed
 */
static int32_t mergeFileBlockAndLastBlock(STsdbReader* pReader, SLastBlockReader* pLastBlockReader, int64_t key,
                                          STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  if (pBlockData->nRow > 0) {
    // no last block available, only data block exists
    if (!hasDataInLastBlock(pLastBlockReader)) {
      return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
    }

    int64_t ts = getCurrentKeyInLastBlock(pLastBlockReader);
    ASSERT(ts >= key);

    if (ASCENDING_TRAVERSE(pReader->order)) {
      if (key < ts) {  // imem, mem are all empty, file blocks (data blocks and last block) exist
        return mergeRowsInFileBlocks(pBlockData, pBlockScanInfo, key, pReader);
      } else if (key == ts) {
        STSRow*    pTSRow = NULL;
        SRowMerger merge = {0};

        // current row of the file block seeds the merger, the last block is merged on top of it
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);

        TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
        tRowMerge(&merge, &fRow1);

        doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, ts, &merge);

        int32_t code = tRowMergerGetRow(&merge, &pTSRow);
        if (code != TSDB_CODE_SUCCESS) {
          // release the merger state on the error path as well
          tRowMergerClear(&merge);
          return code;
        }

        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

        taosMemoryFree(pTSRow);
        tRowMergerClear(&merge);
        return code;
      } else {
        ASSERT(0);
        return TSDB_CODE_SUCCESS;
      }
    } else {  // desc order
      return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, pBlockData, true);
    }
  } else {  // only last block exists
    return doMergeFileBlockAndLastBlock(pLastBlockReader, pReader, pBlockScanInfo, NULL, false);
  }
}

1462 1463
/*
 * Build one result row for the next timestamp when both in-memory iterators (mem and imem) yield a row.
 *
 * The next timestamp is the smallest (ascending scan) or largest (descending scan) key among the mem row, the imem
 * row, the current file-block row (only if the block holds rows) and the current last-block row (only if the last
 * block reader holds data). Every source positioned on that timestamp is folded into one SRowMerger and the merged
 * row is appended to pReader->pResBlock.
 *
 * Merge order:
 *   ASC : file block -> last block -> imem -> mem
 *   DESC: mem -> imem -> last block -> file block
 *
 * Returns TSDB_CODE_SUCCESS, or the code reported by tRowMergerGetRow().
 */
static int32_t doMergeMultiLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData,
                                     SLastBlockReader* pLastBlockReader) {
  SRowMerger merge = {0};
  STSRow*    pTSRow = NULL;

  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SArray*             pDelList = pBlockScanInfo->delSkyline;

  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);
  ASSERT(pRow != NULL && piRow != NULL);

  bool asc = ASCENDING_TRAVERSE(pReader->order);
  bool hasFileRow = (pBlockData->nRow > 0);
  bool hasLastRow = hasDataInLastBlock(pLastBlockReader);

  // Only touch a source that actually has a current row; the sentinel is never used when the flag is false.
  int64_t tsLast = hasLastRow ? getCurrentKeyInLastBlock(pLastBlockReader) : INT64_MIN;
  int64_t key = hasFileRow ? pBlockData->aTSKEY[pDumpInfo->rowIndex] : INT64_MIN;

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

  int64_t minKey = 0;
  if (asc) {
    minKey = INT64_MAX;  // find the minimum timestamp
    if (minKey > k.ts) {
      minKey = k.ts;
    }

    if (minKey > ik.ts) {
      minKey = ik.ts;
    }

    if (hasFileRow && minKey > key) {
      minKey = key;
    }

    if (hasLastRow && minKey > tsLast) {
      minKey = tsLast;
    }
  } else {
    minKey = INT64_MIN;  // find the maximum timestamp
    if (minKey < k.ts) {
      minKey = k.ts;
    }

    if (minKey < ik.ts) {
      minKey = ik.ts;
    }

    if (hasFileRow && minKey < key) {
      minKey = key;
    }

    if (hasLastRow && minKey < tsLast) {
      minKey = tsLast;
    }
  }

  // set once the merger has been initialized with the first contributing row
  bool init = false;

  if (asc) {
    // ASC: file block -----> last block -----> imem -----> mem
    if (hasFileRow && minKey == key) {
      init = true;
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
    }

    if (hasLastRow && minKey == tsLast) {
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
    }

    if (minKey == ik.ts) {
      if (init) {
        tRowMerge(&merge, piRow);
      } else {
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, piRow, pSchema);
      }
      doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }

    if (minKey == k.ts) {
      if (init) {
        tRowMerge(&merge, pRow);
      } else {
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, pRow, pSchema);
      }
      doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }
  } else {
    // DESC: mem -----> imem -----> last block -----> file block
    if (minKey == k.ts) {
      init = true;
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
      tRowMergerInit(&merge, pRow, pSchema);
      doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }

    if (minKey == ik.ts) {
      if (init) {
        tRowMerge(&merge, piRow);
      } else {
        init = true;
        STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(piRow), pReader, pBlockScanInfo->uid);
        tRowMergerInit(&merge, piRow, pSchema);
      }
      doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
    }

    if (hasLastRow && minKey == tsLast) {
      TSDBROW fRow1 = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
      if (init) {
        tRowMerge(&merge, &fRow1);
      } else {
        init = true;
        tRowMergerInit(&merge, &fRow1, pReader->pSchema);
      }
      doMergeRowsInLastBlock(pLastBlockReader, pBlockScanInfo, tsLast, &merge);
    }

    if (hasFileRow && minKey == key) {
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      if (!init) {
        init = true;
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
      } else {
        tRowMerge(&merge, &fRow);
      }
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
    }
  }

  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    // release the merger's resources on the failure path as well
    tRowMergerClear(&merge);
    return code;
  }

  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return code;
}

1617
#if 0
1618
static int32_t doMergeThreeLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
1619 1620 1621
  SRowMerger merge = {0};
  STSRow*    pTSRow = NULL;

1622
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
dengyihao's avatar
dengyihao 已提交
1623
  SArray*             pDelList = pBlockScanInfo->delSkyline;
1624

1625 1626
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pDelList, pReader);
1627
  ASSERT(pRow != NULL && piRow != NULL);
H
Haojun Liao 已提交
1628

1629
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
1630
  bool    freeTSRow = false;
H
Haojun Liao 已提交
1631

1632
  uint64_t uid = pBlockScanInfo->uid;
H
Haojun Liao 已提交
1633

1634 1635 1636
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);
  if (ASCENDING_TRAVERSE(pReader->order)) {
1637 1638
    // [1&2] key <= [k.ts && ik.ts]
    if (key <= k.ts && key <= ik.ts) {
1639 1640 1641
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMergerInit(&merge, &fRow, pReader->pSchema);

1642
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
H
Haojun Liao 已提交
1643

1644 1645
      if (ik.ts == key) {
        tRowMerge(&merge, piRow);
1646
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1647 1648
      }

1649 1650
      if (k.ts == key) {
        tRowMerge(&merge, pRow);
1651
        doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1652 1653 1654
      }

      tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1655
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1656
      return TSDB_CODE_SUCCESS;
1657
    } else {  // key > ik.ts || key > k.ts
1658 1659
      ASSERT(key != ik.ts);

1660
      // [3] ik.ts < key <= k.ts
1661
      // [4] ik.ts < k.ts <= key
1662
      if (ik.ts < k.ts) {
1663
        doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader, &freeTSRow);
H
Haojun Liao 已提交
1664
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1665 1666 1667
        if (freeTSRow) {
          taosMemoryFree(pTSRow);
        }
1668 1669 1670
        return TSDB_CODE_SUCCESS;
      }

1671 1672
      // [5] k.ts < key   <= ik.ts
      // [6] k.ts < ik.ts <= key
1673
      if (k.ts < ik.ts) {
1674
        doMergeMemTableMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, &pTSRow, pReader, &freeTSRow);
H
Haojun Liao 已提交
1675
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1676 1677 1678
        if (freeTSRow) {
          taosMemoryFree(pTSRow);
        }
1679 1680 1681
        return TSDB_CODE_SUCCESS;
      }

1682
      // [7] k.ts == ik.ts < key
1683
      if (k.ts == ik.ts) {
1684 1685
        ASSERT(key > ik.ts && key > k.ts);

1686
        doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, &pTSRow);
H
Haojun Liao 已提交
1687
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1688
        taosMemoryFree(pTSRow);
1689 1690 1691
        return TSDB_CODE_SUCCESS;
      }
    }
1692 1693 1694
  } else {  // descending order scan
    // [1/2] k.ts >= ik.ts && k.ts >= key
    if (k.ts >= ik.ts && k.ts >= key) {
H
Haojun Liao 已提交
1695
      STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
1696

H
Haojun Liao 已提交
1697
      tRowMergerInit(&merge, pRow, pSchema);
1698
      doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1699 1700 1701

      if (ik.ts == k.ts) {
        tRowMerge(&merge, piRow);
1702
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1703 1704 1705 1706 1707 1708 1709 1710 1711
      }

      if (k.ts == key) {
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMerge(&merge, &fRow);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      }

      tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1712
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1713 1714
      return TSDB_CODE_SUCCESS;
    } else {
1715
      ASSERT(ik.ts != k.ts);  // this case has been included in the previous if branch
1716 1717 1718 1719

      // [3] ik.ts > k.ts >= Key
      // [4] ik.ts > key >= k.ts
      if (ik.ts > key) {
1720
        doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader, &freeTSRow);
H
Haojun Liao 已提交
1721
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1722 1723 1724
        if (freeTSRow) {
          taosMemoryFree(pTSRow);
        }
1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735
        return TSDB_CODE_SUCCESS;
      }

      // [5] key > ik.ts > k.ts
      // [6] key > k.ts > ik.ts
      if (key > ik.ts) {
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMergerInit(&merge, &fRow, pReader->pSchema);

        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1736
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1737
        taosMemoryFree(pTSRow);
1738 1739 1740 1741 1742
        return TSDB_CODE_SUCCESS;
      }

      //[7] key = ik.ts > k.ts
      if (key == ik.ts) {
1743
        doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader, &freeTSRow);
1744 1745 1746 1747 1748

        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMerge(&merge, &fRow);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
H
Haojun Liao 已提交
1749
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, uid);
1750 1751

        taosMemoryFree(pTSRow);
1752 1753 1754 1755 1756 1757
        return TSDB_CODE_SUCCESS;
      }
    }
  }

  ASSERT(0);
S
Shengliang Guan 已提交
1758
  return -1;
1759
}
1760
#endif
1761

1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786
/*
 * Create the mem and imem iterators for one table and build its delete skyline. Runs at most once per table scan
 * info (guarded by pBlockScanInfo->iterInit).
 *
 * The iterators start at the scan window's start key for an ascending scan and at its end key for a descending
 * scan; the descending case iterates backward.
 *
 * Returns TSDB_CODE_SUCCESS, the code from tsdbTbDataIterCreate() when an iterator cannot be created, or the code
 * from initDelSkylineIterator() when the delete skyline cannot be built.
 */
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  TSDBKEY startKey = {0};
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = (TSDBKEY){.ts = pReader->window.skey, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pReader->window.ekey, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!ASCENDING_TRAVERSE(pReader->order));

  // mem buffer
  STbData* d = NULL;
  if (pReader->pReadSnap->pMem != NULL) {
    d = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid);
    if (d != NULL) {
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);

        tsdbDebug("%p uid:%" PRId64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRId64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRId64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  // imem buffer
  STbData* di = NULL;
  if (pReader->pReadSnap->pIMem != NULL) {
    di = tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
    if (di != NULL) {
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);

        tsdbDebug("%p uid:%" PRId64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRId64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRId64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  code = initDelSkylineIterator(pBlockScanInfo, pReader, d, di);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbError("%p uid:%" PRId64 ", failed to build delete skyline, code:%s, %s", pReader, pBlockScanInfo->uid,
              tstrerror(code), pReader->idStr);
  }

  // Both iterators exist at this point; mark them initialized even if the skyline failed so that a later call does
  // not create (and overwrite) them a second time.
  pBlockScanInfo->iterInit = true;
  return code;
}

dengyihao's avatar
dengyihao 已提交
1826 1827
/*
 * Decide whether the file-block row at pDumpInfo->rowIndex should be returned for the table being scanned.
 * A row is rejected when it belongs to another table (multi-table block), when its version or timestamp is
 * outside the reader's ranges, or when hasBeenDropped() reports it against the table's delete skyline.
 */
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  // aUid is only present for a block that holds rows of several tables
  if (pBlockData->aUid != NULL) {
    uint64_t rowUid = pBlockData->aUid[pDumpInfo->rowIndex];
    if (rowUid != pBlockScanInfo->uid) {
      return false;
    }
  }

  // version range
  int64_t rowVer = pBlockData->aVersion[pDumpInfo->rowIndex];
  bool    verInRange = (rowVer >= pReader->verRange.minVer) && (rowVer <= pReader->verRange.maxVer);
  if (!verInRange) {
    return false;
  }

  // time window
  int64_t rowTs = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  bool    tsInRange = (rowTs >= pReader->window.skey) && (rowTs <= pReader->window.ekey);
  if (!tsInRange) {
    return false;
  }

  // delete skyline
  TSDBKEY rowKey = {.ts = rowTs, .version = rowVer};
  return !hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, &rowKey, pReader->order);
}

1855
// Returns true when ts lies outside the closed interval [pWindow->skey, pWindow->ekey].
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) {
  bool inside = (ts >= pWindow->skey) && (ts <= pWindow->ekey);
  return !inside;
}
1856

1857
/*
 * Advance the last-block merge tree until it is positioned on a row that hasBeenDropped() does not report as
 * deleted. Returns true when such a row is found, false once the merge tree has no more rows.
 */
static bool nextRowFromLastBlocks(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pBlockScanInfo) {
  while (tMergeTreeNext(&pLastBlockReader->mergeTree)) {
    TSDBROW curRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
    TSDBKEY curKey = TSDBROW_KEY(&curRow);

    bool dropped =
        hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->lastBlockDelIndex, &curKey, pLastBlockReader->order);
    if (!dropped) {
      return true;
    }
  }

  return false;
}

1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884
/*
 * Bind the last-block reader to the table described by pBlockScanInfo and position it on the first valid row.
 *
 * When the reader is already bound to this table nothing is reopened. Otherwise any merge tree opened for a
 * previous table is closed, the table's in-memory iterators are initialized, and a new merge tree is opened on a
 * time window that starts one step past pBlockScanInfo->lastKey in the scan direction.
 *
 * Returns true when the reader currently has a row to offer, false when it has none or the merge tree could not
 * be opened.
 */
static bool initLastBlockReader(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pBlockScanInfo,
                                STsdbReader* pReader) {
  // Already bound to this table: report whether a row is actually available instead of an unconditional true, so
  // the result means the same thing as on the first call.
  if (pLastBlockReader->uid == pBlockScanInfo->uid) {
    return hasDataInLastBlock(pLastBlockReader);
  }

  if (pLastBlockReader->uid != 0) {
    tMergeTreeClose(&pLastBlockReader->mergeTree);
  }

  initMemDataIterator(pBlockScanInfo, pReader);
  pLastBlockReader->uid = pBlockScanInfo->uid;

  // skip everything up to and including the last key already returned for this table
  int32_t     step = ASCENDING_TRAVERSE(pLastBlockReader->order) ? 1 : -1;
  STimeWindow w = pLastBlockReader->window;
  if (ASCENDING_TRAVERSE(pLastBlockReader->order)) {
    w.skey = pBlockScanInfo->lastKey + step;
  } else {
    w.ekey = pBlockScanInfo->lastKey + step;
  }

  int32_t code =
      tMergeTreeOpen(&pLastBlockReader->mergeTree, (pLastBlockReader->order == TSDB_ORDER_DESC), pReader->pFileReader,
                     pReader->suid, pBlockScanInfo->uid, &w, &pLastBlockReader->verRange);
  if (code != TSDB_CODE_SUCCESS) {
    return false;
  }

  return nextRowFromLastBlocks(pLastBlockReader, pBlockScanInfo);
}

1904
// Returns the timestamp of the row the last-block merge tree is currently positioned on.
static int64_t getCurrentKeyInLastBlock(SLastBlockReader* pLastBlockReader) {
  TSDBROW curRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
  TSDBKEY curKey = TSDBROW_KEY(&curRow);

  int64_t ts = curKey.ts;
  return ts;
}

H
Hongze Cheng 已提交
1910
// The last-block reader has a current row exactly when its merge tree holds a non-NULL iterator.
static bool hasDataInLastBlock(SLastBlockReader* pLastBlockReader) {
  if (pLastBlockReader->mergeTree.pIter == NULL) {
    return false;
  }

  return true;
}
1911

1912 1913
/*
 * Emit the file-block row at the current dump position into the result block.
 *
 * tryCopyDistinctRowFromFileBlock() is tried first; when it reports that it handled the row nothing else is done.
 * Otherwise the rows of the file block are merged through an SRowMerger and the merged row is appended to
 * pReader->pResBlock.
 *
 * Returns TSDB_CODE_SUCCESS, or the code reported by tRowMergerGetRow().
 */
int32_t mergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pBlockScanInfo, int64_t key,
                              STsdbReader* pReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  // taken before the fast-path attempt, matching the original evaluation order
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);

  if (tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo)) {
    return TSDB_CODE_SUCCESS;
  }

  STSRow*    pTSRow = NULL;
  SRowMerger merge = {0};

  tRowMergerInit(&merge, &fRow, pReader->pSchema);
  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);

  int32_t code = tRowMergerGetRow(&merge, &pTSRow);
  if (code != TSDB_CODE_SUCCESS) {
    // release the merger's resources on the failure path as well
    tRowMergerClear(&merge);
    return code;
  }

  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow, pBlockScanInfo->uid);

  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
1941 1942
/*
 * Produce one result row by dispatching to the merge routine that matches the sources currently holding data:
 *   mem + imem (+ files)  -> doMergeMultiLevelRows
 *   imem only (+ files)   -> doMergeBufAndFileRows on the imem iterator
 *   mem only (+ files)    -> doMergeBufAndFileRows on the mem iterator
 *   files only            -> mergeFileBlockAndLastBlock
 * Returns the code of the routine that was invoked.
 */
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo,
                                          SBlockData* pBlockData, SLastBlockReader* pLastBlockReader) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  // INT64_MIN stands in for "no file-block row"
  int64_t key = INT64_MIN;
  if (pBlockData->nRow > 0) {
    key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  }

  // fetch both buffer rows first; the hasVal flags are inspected only afterwards
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);

  bool memHasVal = pBlockScanInfo->iter.hasVal;
  bool imemHasVal = pBlockScanInfo->iiter.hasVal;

  if (memHasVal && imemHasVal) {
    return doMergeMultiLevelRows(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
  }

  // imem + file + last block
  if (imemHasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, key, pLastBlockReader);
  }

  // mem + file + last block
  if (memHasVal) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, key, pLastBlockReader);
  }

  // files data blocks + last block
  return mergeFileBlockAndLastBlock(pReader, pLastBlockReader, key, pBlockScanInfo, pBlockData);
}

1967
/*
 * Fill pReader->pResBlock with rows composed from the current file data block, the last block and the in-memory
 * buffers of one table.
 *
 * The table is the one owning the current file block or, when there is no current block, the table referenced by
 * pReader->status.pTableIter. Rows are produced one at a time until the file block is consumed, neither the file
 * block nor the last block has data left, or the result block reaches pReader->capacity.
 *
 * Returns TSDB_CODE_SUCCESS, or the first error reported while composing a row.
 */
static int32_t buildComposedDataBlock(STsdbReader* pReader) {
  int32_t      code = TSDB_CODE_SUCCESS;
  SSDataBlock* pResBlock = pReader->pResBlock;

  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);

  STableBlockScanInfo* pBlockScanInfo = NULL;
  if (pBlockInfo != NULL) {
    pBlockScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
  } else {
    pBlockScanInfo = pReader->status.pTableIter;
  }

  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  int32_t             step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;

  int64_t st = taosGetTimestampUs();

  while (1) {
    // todo check the validate of row in file block
    bool hasBlockData = false;

    // find the first qualified row in the data block
    while (pBlockData->nRow > 0) {
      if (isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
        hasBlockData = true;
        break;
      }

      pDumpInfo->rowIndex += step;

      SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
      if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
        setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
        break;
      }
    }

    bool hasBlockLData = hasDataInLastBlock(pLastBlockReader);

    // no data in last block and block, no need to proceed.
    if ((hasBlockData == false) && (hasBlockLData == false)) {
      break;
    }

    // do not keep looping (or report success) after a row failed to compose
    code = buildComposedDataBlockImpl(pReader, pBlockScanInfo, pBlockData, pLastBlockReader);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // currently loaded file data block is consumed
    if ((pBlockData->nRow > 0) && (pDumpInfo->rowIndex >= pBlockData->nRow || pDumpInfo->rowIndex < 0)) {
      SDataBlk* pBlock = getCurrentBlock(&pReader->status.blockIter);
      setBlockAllDumped(pDumpInfo, pBlock->maxKey.ts, pReader->order);
      break;
    }

    if (pResBlock->info.rows >= pReader->capacity) {
      break;
    }
  }

  pResBlock->info.uid = pBlockScanInfo->uid;
  blockDataUpdateTsWindow(pResBlock, 0);

  setComposedBlockFlag(pReader, true);
  int64_t et = taosGetTimestampUs();

  if (pResBlock->info.rows > 0) {
    tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64
              " rows:%d, elapsed time:%.2f ms %s",
              pReader, pBlockScanInfo->uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
              pResBlock->info.rows, (et - st) / 1000.0, pReader->idStr);
  }

  return TSDB_CODE_SUCCESS;
}

// Record in the reader status whether the current result block was composed from several sources.
void setComposedBlockFlag(STsdbReader* pReader, bool composed) {
  SReaderStatus* pStatus = &pReader->status;
  pStatus->composedDataBlock = composed;
}

dengyihao's avatar
dengyihao 已提交
2045 2046
/*
 * Build the delete skyline of one table and reset every delete-index cursor of its scan info.
 *
 * Delete records are gathered from the delete file of the read snapshot (if any) and from the pHead lists of the
 * mem and imem table data (either may be NULL). When at least one record exists, the skyline is built with
 * tsdbBuildDeleteSkyline() and stored in pBlockScanInfo->delSkyline. Nothing is done when a skyline already exists.
 *
 * The cursors (iter.index, iiter.index, fileDelIndex, lastBlockDelIndex) start at 0 for an ascending scan and at
 * the last skyline entry for a descending scan (-1 when there is no skyline).
 *
 * Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY when an array cannot be allocated, or the code of the failing
 * delete-file / skyline routine.
 */
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = 0;
  STsdb*  pTsdb = pReader->pTsdb;

  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));
  if (pDelData == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  // 1. delete records persisted in the delete file
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
  if (pDelFile) {
    SDelFReader* pDelFReader = NULL;
    code = tsdbDelFReaderOpen(&pDelFReader, pDelFile, pTsdb);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    SArray* aDelIdx = taosArrayInit(4, sizeof(SDelIdx));
    if (aDelIdx == NULL) {
      // report the failure; code would otherwise still be 0 here
      code = TSDB_CODE_OUT_OF_MEMORY;
      tsdbDelFReaderClose(&pDelFReader);
      goto _err;
    }

    code = tsdbReadDelIdx(pDelFReader, aDelIdx);
    if (code != TSDB_CODE_SUCCESS) {
      taosArrayDestroy(aDelIdx);
      tsdbDelFReaderClose(&pDelFReader);
      goto _err;
    }

    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
    SDelIdx* pIdx = taosArraySearch(aDelIdx, &idx, tCmprDelIdx, TD_EQ);

    if (pIdx != NULL) {
      code = tsdbReadDelData(pDelFReader, pIdx, pDelData);
    }

    taosArrayDestroy(aDelIdx);
    tsdbDelFReaderClose(&pDelFReader);

    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  // 2. delete records still held by the mem and imem table data
  SDelData* p = NULL;
  if (pMemTbData != NULL) {
    p = pMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
  }

  if (piMemTbData != NULL) {
    p = piMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
  }

  // 3. build the skyline only when there is something to delete
  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    if (pBlockScanInfo->delSkyline == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _err;
    }

    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);

  // convert to a signed value before subtracting so that an empty skyline yields -1 without unsigned wrap-around
  int32_t numOfKeys = (int32_t)taosArrayGetSize(pBlockScanInfo->delSkyline);
  pBlockScanInfo->iter.index = ASCENDING_TRAVERSE(pReader->order) ? 0 : numOfKeys - 1;
  pBlockScanInfo->iiter.index = pBlockScanInfo->iter.index;
  pBlockScanInfo->fileDelIndex = pBlockScanInfo->iter.index;
  pBlockScanInfo->lastBlockDelIndex = pBlockScanInfo->iter.index;
  return code;

_err:
  taosArrayDestroy(pDelData);
  return code;
}

2127
/*
 * Return the key of the current valid row in the in-memory buffers of one table.
 * The mem row's key is taken when present; the imem row's key replaces it only when its timestamp is smaller.
 * With no valid row at all the result keeps ts == TSKEY_INITIAL_VAL.
 *
 * NOTE(review): the comparison does not depend on the scan order, and when mem has no row the imem key is compared
 * against TSKEY_INITIAL_VAL -- confirm this is what the callers expect.
 */
static TSDBKEY getCurrentKeyInBuf(STableBlockScanInfo* pScanInfo, STsdbReader* pReader) {
  TSDBKEY result = {.ts = TSKEY_INITIAL_VAL};

  TSDBROW* pMemRow = getValidMemRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
  if (pMemRow != NULL) {
    result = TSDBROW_KEY(pMemRow);
  }

  TSDBROW* pIMemRow = getValidMemRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
  if (pIMemRow == NULL) {
    return result;
  }

  TSDBKEY imemKey = TSDBROW_KEY(pIMemRow);
  if (imemKey.ts < result.ts) {
    result = imemKey;
  }

  return result;
}

2145
/*
 * Advance the file-set iterator to the next file set that contains something to read for the scanned tables.
 *
 * For every candidate file set the block index is loaded and, when it is non-empty or the set has stt files
 * (nSttF > 0), the file blocks are loaded. The search stops at the first set for which
 * numOfBlocks + numOfLastFiles > 0, or when no file set is left; in the latter case both counters are 0.
 *
 * Returns TSDB_CODE_SUCCESS, TSDB_CODE_OUT_OF_MEMORY when the index list cannot be allocated, or the code of the
 * failing load routine.
 */
static int32_t moveToNextFile(STsdbReader* pReader, SBlockNumber* pBlockNum) {
  SReaderStatus* pStatus = &pReader->status;
  int32_t        code = TSDB_CODE_SUCCESS;

  pBlockNum->numOfBlocks = 0;
  pBlockNum->numOfLastFiles = 0;

  size_t  numOfTables = taosHashGetSize(pReader->status.pTableMap);
  SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBlockIdx));
  if (pIndexList == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  // the loop ends when there are no more data files on disk
  while (filesetIteratorNext(&pStatus->fileIter, pReader)) {
    taosArrayClear(pIndexList);

    code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
    if (code != TSDB_CODE_SUCCESS) {
      break;
    }

    if (taosArrayGetSize(pIndexList) > 0 || pReader->pFileReader->pSet->nSttF > 0) {
      code = doLoadFileBlock(pReader, pIndexList, pBlockNum);
      if (code != TSDB_CODE_SUCCESS) {
        break;
      }

      if (pBlockNum->numOfBlocks + pBlockNum->numOfLastFiles > 0) {
        break;
      }
    }

    // no blocks in current file, try next files
  }

  // single exit: the index list is released on success and on every failure path
  taosArrayDestroy(pIndexList);
  return code;
}

2185
// Three-way comparison of two uint64_t values referenced by p1 and p2: -1, 0 or 1 (ascending order).
static int32_t uidComparFunc(const void* p1, const void* p2) {
  uint64_t lhs = *(uint64_t*)p1;
  uint64_t rhs = *(uint64_t*)p2;

  if (lhs < rhs) {
    return -1;
  }

  return (lhs > rhs) ? 1 : 0;
}
2194

2195
/*
 * Copy the uid of every entry of pStatus->pTableMap into pOrderCheckInfo->tableUidList and sort the list in
 * ascending order. The list is written in place; its allocation is not changed here.
 */
static void extractOrderedTableUidList(SUidOrderCheckInfo* pOrderCheckInfo, SReaderStatus* pStatus) {
  int32_t numOfTables = taosHashGetSize(pStatus->pTableMap);
  int32_t pos = 0;

  for (void* pIter = taosHashIterate(pStatus->pTableMap, NULL); pIter != NULL;
       pIter = taosHashIterate(pStatus->pTableMap, pIter)) {
    STableBlockScanInfo* pInfo = pIter;
    pOrderCheckInfo->tableUidList[pos] = pInfo->uid;
    pos += 1;
  }

  taosSort(pOrderCheckInfo->tableUidList, numOfTables, sizeof(uint64_t), uidComparFunc);
}

2209
/*
 * Make sure the ordered uid list exists and that pStatus->pTableIter points at a table to start from.
 *
 *  - empty table map: nothing to do;
 *  - no list yet: allocate it, fill and sort it, start from its first uid;
 *  - list exists and pTableIter is NULL: restart from the first uid; if that uid is no longer found in the table
 *    map, resize and rebuild the list from the map and start from its first uid;
 *  - list exists and pTableIter is set: leave everything untouched.
 *
 * Returns TSDB_CODE_SUCCESS or TSDB_CODE_OUT_OF_MEMORY.
 */
static int32_t initOrderCheckInfo(SUidOrderCheckInfo* pOrderCheckInfo, SReaderStatus* pStatus) {
  int32_t total = taosHashGetSize(pStatus->pTableMap);
  if (total == 0) {
    return TSDB_CODE_SUCCESS;
  }

  // first use: build the list from scratch
  if (pOrderCheckInfo->tableUidList == NULL) {
    pOrderCheckInfo->currentIndex = 0;
    pOrderCheckInfo->tableUidList = taosMemoryMalloc(total * sizeof(uint64_t));
    if (pOrderCheckInfo->tableUidList == NULL) {
      return TSDB_CODE_OUT_OF_MEMORY;
    }

    extractOrderedTableUidList(pOrderCheckInfo, pStatus);

    uint64_t firstUid = pOrderCheckInfo->tableUidList[0];
    pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &firstUid, sizeof(firstUid));
    return TSDB_CODE_SUCCESS;
  }

  // a table is already selected, keep going from there
  if (pStatus->pTableIter != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  // it is the last block of a new file: restart from the beginning of the list
  pOrderCheckInfo->currentIndex = 0;
  uint64_t uid = pOrderCheckInfo->tableUidList[pOrderCheckInfo->currentIndex];
  pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));
  if (pStatus->pTableIter != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  // the tableMap has already updated: rebuild the list from it
  void* pNewList = taosMemoryRealloc(pOrderCheckInfo->tableUidList, total * sizeof(uint64_t));
  if (pNewList == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  pOrderCheckInfo->tableUidList = pNewList;
  extractOrderedTableUidList(pOrderCheckInfo, pStatus);

  uid = pOrderCheckInfo->tableUidList[0];
  pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &uid, sizeof(uid));

  return TSDB_CODE_SUCCESS;
}

2250
// Advance the cursor of the ordered uid list by one and select that table in pStatus->pTableIter.
// Returns false (and clears pTableIter) once the cursor has reached the size of the table map.
static bool moveToNextTable(SUidOrderCheckInfo* pOrderedCheckInfo, SReaderStatus* pStatus) {
  pOrderedCheckInfo->currentIndex++;

  bool exhausted = (pOrderedCheckInfo->currentIndex >= taosHashGetSize(pStatus->pTableMap));
  if (exhausted) {
    pStatus->pTableIter = NULL;
    return false;
  }

  uint64_t nextUid = pOrderedCheckInfo->tableUidList[pOrderedCheckInfo->currentIndex];
  pStatus->pTableIter = taosHashGet(pStatus->pTableMap, &nextUid, sizeof(nextUid));

  // every uid kept in the ordered list is expected to be present in the table map
  ASSERT(pStatus->pTableIter != NULL);
  return true;
}

2263
// Walk the tables in uid order and build a result block from the last-block data of the first
// table that yields rows. Returns as soon as pReader->pResBlock holds rows, an error occurs, or
// every table has been tried (pStatus->pTableIter is NULL in that case, see moveToNextTable()).
static int32_t doLoadLastBlockSequentially(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SLastBlockReader*   pLastBlockReader = pStatus->fileIter.pLastBlockReader;
  SUidOrderCheckInfo* pOrderedCheckInfo = &pStatus->uidCheckInfo;

  int32_t code = initOrderCheckInfo(pOrderedCheckInfo, pStatus);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // nothing to scan when no table is registered
  if (taosHashGetSize(pStatus->pTableMap) == 0) {
    return code;
  }

  for (;;) {
    // load the last data block of current table
    STableBlockScanInfo* pScanInfo = pStatus->pTableIter;

    if (initLastBlockReader(pLastBlockReader, pScanInfo, pReader)) {
      code = doBuildDataBlock(pReader);

      // stop on failure, or as soon as some rows have been produced
      if (code != TSDB_CODE_SUCCESS || pReader->pResBlock->info.rows > 0) {
        return code;
      }
    }

    // current table has no (more) data in the last block, let's try the next table
    if (!moveToNextTable(pOrderedCheckInfo, pStatus)) {
      return TSDB_CODE_SUCCESS;
    }
  }
}

2302
// Build the next result block for the current position of the reader. Depending on the current
// file block (if any), the in-memory buffer key and the last-block reader, the rows come from a
// composed merge, from the buffer only, or the whole file block is handed out as it is.
static int32_t doBuildDataBlock(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pBlockIter);
  SLastBlockReader*   pLastBlockReader = pReader->status.fileIter.pLastBlockReader;
  bool                asc = ASCENDING_TRAVERSE(pReader->order);

  SDataBlk*            pBlock = NULL;
  STableBlockScanInfo* pScanInfo = NULL;

  if (pBlockInfo != NULL) {
    // a file block is available: its owner table drives the scan
    pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
    pBlock = getCurrentBlock(pBlockIter);
  } else {
    // no file block, rely on the table selected by the table iterator
    pScanInfo = pReader->status.pTableIter;
  }

  initLastBlockReader(pLastBlockReader, pScanInfo, pReader);
  TSDBKEY key = getCurrentKeyInBuf(pScanInfo, pReader);

  // build data block from last data file
  if (pBlockInfo == NULL) {
    ASSERT(pBlockIter->numOfBlocks == 0);
    return buildComposedDataBlock(pReader);
  }

  // the file block has to be loaded and merged with the other data sources
  if (fileBlockShouldLoad(pReader, pBlockInfo, pBlock, pScanInfo, key, pLastBlockReader)) {
    tBlockDataReset(&pStatus->fileBlockData);

    int32_t code = tBlockDataInit(&pStatus->fileBlockData, pReader->suid, pScanInfo->uid, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // build composed data block
    return buildComposedDataBlock(pReader);
  }

  // data in memory that are earlier than current file block
  // todo rows in buffer should be less than the file block in asc, greater than file block in desc
  if (bufferDataInFileBlockGap(pReader->order, key, pBlock)) {
    int64_t endKey = asc ? pBlock->minKey.ts : pBlock->maxKey.ts;
    return buildDataBlockFromBuf(pReader, pScanInfo, endKey);
  }

  // only return the rows in last block
  if (hasDataInLastBlock(pLastBlockReader) && !asc) {
    int64_t tsLast = getCurrentKeyInLastBlock(pLastBlockReader);
    ASSERT(tsLast >= pBlock->maxKey.ts);

    tBlockDataReset(&pReader->status.fileBlockData);
    return buildComposedDataBlock(pReader);
  }

  // whole block is required, return it directly
  SDataBlockInfo* pInfo = &pReader->pResBlock->info;
  pInfo->rows = pBlock->nRow;
  pInfo->uid = pScanInfo->uid;
  pInfo->window = (STimeWindow){.skey = pBlock->minKey.ts, .ekey = pBlock->maxKey.ts};

  setComposedBlockFlag(pReader, false);
  setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock->maxKey.ts, pReader->order);
  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
2368
// Produce a result block from the in-memory buffers only, visiting the tables in hash-iteration
// order. Stops at the first table that yields rows, on error, or when every table has been visited.
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;

  // start from the first table when no table is selected yet
  if (pStatus->pTableIter == NULL) {
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
  }

  while (pStatus->pTableIter != NULL) {
    STableBlockScanInfo* pBlockScanInfo = pStatus->pTableIter;
    initMemDataIterator(pBlockScanInfo, pReader);

    // no upper/lower bound: take everything that is left in the buffer for this table
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? INT64_MAX : INT64_MIN;
    int32_t code = buildDataBlockFromBuf(pReader, pBlockScanInfo, endKey);

    if (code != TSDB_CODE_SUCCESS || pReader->pResBlock->info.rows > 0) {
      return code;
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
  }

  return TSDB_CODE_SUCCESS;
}

2400
// set the correct start position in case of the first/last file block, according to the query time window
2401
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
H
Hongze Cheng 已提交
2402
  SDataBlk* pBlock = getCurrentBlock(pBlockIter);
2403

2404 2405 2406
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
2407 2408 2409

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
2410
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
2411 2412
}

2413
// Move on to the next file set and prepare the block iterator and the dump cursor for it.
// When no file set with data is left, pReader->status.loadFromFile is switched off so that the
// caller continues with the in-memory buffers.
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
  SBlockNumber blockNum = {0};

  int32_t code = moveToNextFile(pReader, &blockNum);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // all data files are consumed, try data in buffer
  bool noMoreFileData = (blockNum.numOfBlocks + blockNum.numOfLastFiles == 0);
  if (noMoreFileData) {
    pReader->status.loadFromFile = false;
    return code;
  }

  if (blockNum.numOfBlocks <= 0) {
    // no block data, only last block exists
    tBlockDataReset(&pReader->status.fileBlockData);
    resetDataBlockIterator(pBlockIter, pReader->order);
  } else {
    // initialize the block iterator for a new fileset
    code = initBlockIterator(pReader, pBlockIter, blockNum.numOfBlocks);
  }

  // set the correct start position according to the query time window
  initBlockDumpInfo(pReader, pBlockIter);
  return code;
}

2440
// True when the current file block has been started but not finished: the cursor has left its
// initial position (row 0 for asc, last row for desc) and the block is not flagged as all dumped.
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
  if (pDumpInfo->allDumped) {
    return false;
  }

  if (asc) {
    return pDumpInfo->rowIndex > 0;
  }

  return pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1);
}

2445
// Produce the next result block from the data files. Two phases alternate:
//   - the "last block" phase (label _begin), entered whenever the block iterator holds no block;
//   - the regular phase, iterating over the data blocks of the current file set.
// Returns as soon as pReader->pResBlock holds rows, an error occurs, or all files are consumed
// (pReader->status.loadFromFile is false then).
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
  int32_t         code = TSDB_CODE_SUCCESS;
  bool            asc = ASCENDING_TRAVERSE(pReader->order);
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

  if (pBlockIter->numOfBlocks == 0) {
  _begin:
    code = doLoadLastBlockSequentially(pReader);
    if (code != TSDB_CODE_SUCCESS || pReader->pResBlock->info.rows > 0) {
      return code;
    }

    // all data blocks are checked in this last block file, now let's try the next file
    if (pReader->status.pTableIter == NULL) {
      code = initForFirstBlockInFile(pReader, pBlockIter);

      // error happens or all the data files are completely checked
      if (code != TSDB_CODE_SUCCESS || !pReader->status.loadFromFile) {
        return code;
      }

      // this file does not have data files, let's start check the last block file if exists
      if (pBlockIter->numOfBlocks == 0) {
        goto _begin;
      }
    }

    code = doBuildDataBlock(pReader);
    if (code != TSDB_CODE_SUCCESS || pReader->pResBlock->info.rows > 0) {
      return code;
    }
  }

  for (;;) {
    SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

    if (fileBlockPartiallyRead(pDumpInfo, asc)) {
      // file data block is partially loaded
      code = buildComposedDataBlock(pReader);
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        if (blockIteratorNext(&pReader->status.blockIter)) {
          // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
        } else if (pReader->status.pCurrentFileset->nSttF > 0) {
          // data blocks in current file are exhausted, let's try the next file now
          tBlockDataReset(&pReader->status.fileBlockData);
          resetDataBlockIterator(pBlockIter, pReader->order);
          goto _begin;
        } else {
          code = initForFirstBlockInFile(pReader, pBlockIter);

          // error happens or all the data files are completely checked
          if (code != TSDB_CODE_SUCCESS || !pReader->status.loadFromFile) {
            return code;
          }

          // this file does not have blocks, let's start check the last block file
          if (pBlockIter->numOfBlocks == 0) {
            goto _begin;
          }
        }
      }

      code = doBuildDataBlock(pReader);
    }

    if (code != TSDB_CODE_SUCCESS || pReader->pResBlock->info.rows > 0) {
      return code;
    }
  }
}
H
refact  
Hongze Cheng 已提交
2533

2534 2535
// Pick the tsdb instance a query has to read from.
// For a non-RSMA vnode this is always VND_TSDB(pVnode) and *pLevel is left untouched.
// For an RSMA vnode the retention levels are scanned in order: the scan stops at the first level
// whose keep period still covers winSKey, or at the first level without a keep period (falling
// back to the previous level); *pLevel receives the selected level.
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
  if (!VND_IS_RSMA(pVnode)) {
    return VND_TSDB(pVnode);
  }

  int8_t  level = 0;
  int64_t now = taosGetTimestamp(pVnode->config.tsdbCfg.precision);

  for (; level < TSDB_RETENTION_MAX; ++level) {
    SRetention* pRetention = retentions + level;

    // this level is not configured, use the previous one if there is any
    if (pRetention->keep <= 0) {
      if (level > 0) {
        --level;
      }
      break;
    }

    // the start of the query window is still kept at this level
    if ((now - pRetention->keep) <= winSKey) {
      break;
    }
  }

  const char* str = (idStr != NULL) ? idStr : "";

  if (level == TSDB_RETENTION_L0) {
    *pLevel = TSDB_RETENTION_L0;
    tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L0, str);
    return VND_RSMA0(pVnode);
  }

  if (level == TSDB_RETENTION_L1) {
    *pLevel = TSDB_RETENTION_L1;
    tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L1, str);
    return VND_RSMA1(pVnode);
  }

  // any other value, including a scan that ran through all levels, maps to level 2
  *pLevel = TSDB_RETENTION_L2;
  tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L2, str);
  return VND_RSMA2(pVnode);
}

H
Haojun Liao 已提交
2574
// Compute the version range visible to a query.
//   minVer: pCond->startVersion, or 0 when it is -1 (not specified)
//   maxVer: pCond->endVersion capped by the applied version of the vnode; the applied version
//           itself when pCond->endVersion is -1 (not specified)
// The "level" argument is not used by this function.
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
  int64_t startVer = pCond->startVersion;
  if (startVer == -1) {
    startVer = 0;
  }

  int64_t endVer = 0;
  if (pCond->endVersion == -1) {
    // user not specified end version, set current maximum version of vnode as the endVersion
    endVer = pVnode->state.applied;
  } else if (pCond->endVersion > pVnode->state.applied) {
    endVer = pVnode->state.applied;
  } else {
    endVer = pCond->endVersion;
  }

  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
}

2588
// Check whether the row identified by pKey (timestamp + version) is covered by a delete range.
//   pDelList: array of TSDBKEY points ordered by timestamp; a pair of neighbor points delimits a
//             range and the version of the earlier point is compared with the row version.
//             NULL or empty means nothing has been deleted.
//   index:    in/out cursor into pDelList; it is advanced (asc) or moved back (desc) while the
//             key passes delete points, so successive calls have to use keys in scan order.
//   pKey:     key of the row to check, must not be NULL.
//   order:    scan order, TSDB_ORDER_ASC or descending.
// Returns true when the row has been deleted, false otherwise.
bool hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order) {
  ASSERT(pKey != NULL);
  if (pDelList == NULL) {
    return false;
  }

  size_t num = taosArrayGetSize(pDelList);
  if (num == 0) {
    // nothing is deleted; also keeps the unsigned "num - 1" below from wrapping around and
    // prevents fetching elements from an empty array
    return false;
  }

  bool    asc = ASCENDING_TRAVERSE(order);
  int32_t step = asc ? 1 : -1;

  if (asc) {
    // NOTE: *index is converted to unsigned for this comparison, so a negative cursor also
    // takes this branch
    if (*index >= num - 1) {
      TSDBKEY* last = taosArrayGetLast(pDelList);
      ASSERT(pKey->ts >= last->ts);

      if (pKey->ts > last->ts) {
        return false;
      } else if (pKey->ts == last->ts) {
        if (num < 2) {
          // a single point has no predecessor and therefore delimits no range
          return false;
        }

        TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
        return (prev->version >= pKey->version);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);

      // the key is located before the range the cursor points at
      if (pKey->ts < pCurrent->ts) {
        return false;
      }

      if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version) {
        return true;
      }

      // the key is beyond the current range, move the cursor forward
      while (pNext->ts <= pKey->ts && (*index) < num - 1) {
        (*index) += 1;

        if ((*index) < num - 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pNext = taosArrayGet(pDelList, (*index) + 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version == 0 && pNext->version > 0) {
            continue;
          }

          if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version) {
            return true;
          }
        }
      }

      return false;
    }
  } else {
    if (*index <= 0) {
      TSDBKEY* pFirst = taosArrayGet(pDelList, 0);

      if (pKey->ts < pFirst->ts) {
        return false;
      } else if (pKey->ts == pFirst->ts) {
        return pFirst->version >= pKey->version;
      } else {
        ASSERT(0);
      }
    } else {
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);

      // the key is located after the range the cursor points at
      if (pKey->ts > pCurrent->ts) {
        return false;
      }

      if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
        return true;
      }

      // the key is before the current range, move the cursor backward
      while (pPrev->ts >= pKey->ts && (*index) > 1) {
        (*index) += step;

        if ((*index) >= 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pPrev = taosArrayGet(pDelList, (*index) - 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version > 0 && pPrev->version == 0) {
            continue;
          }

          if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
            return true;
          }
        }
      }

      return false;
    }
  }

  return false;
}

2688
// Return the first row, starting at the current position of the in-memory iterator, that is
// visible to the reader: inside the version range and not covered by a delete range.
// Returns NULL, with pIter->hasVal cleared, when the iterator is exhausted or the next row is
// outside of the query time window.
TSDBROW* getValidMemRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
  if (!pIter->hasVal) {
    return NULL;
  }

  // examine the row at the current position before moving the iterator
  TSDBROW* pCandidate = tsdbTbDataIterGet(pIter->iter);
  TSDBKEY  key = {.ts = pCandidate->pTSRow->ts, .version = pCandidate->version};

  if (outOfTimeWindow(key.ts, &pReader->window)) {
    pIter->hasVal = false;
    return NULL;
  }

  // it is a valid data version
  bool inVerRange = (key.version >= pReader->verRange.minVer) && (key.version <= pReader->verRange.maxVer);
  if (inVerRange && !hasBeenDropped(pDelList, &pIter->index, &key, pReader->order)) {
    return pCandidate;
  }

  // skip the invisible rows until a valid one shows up
  while ((pIter->hasVal = tsdbTbDataIterNext(pIter->iter))) {
    pCandidate = tsdbTbDataIterGet(pIter->iter);
    key = TSDBROW_KEY(pCandidate);

    if (outOfTimeWindow(key.ts, &pReader->window)) {
      pIter->hasVal = false;
      return NULL;
    }

    inVerRange = (key.version >= pReader->verRange.minVer) && (key.version <= pReader->verRange.maxVer);
    if (inVerRange && !hasBeenDropped(pDelList, &pIter->index, &key, pReader->order)) {
      return pCandidate;
    }
  }

  return NULL;
}
H
Hongze Cheng 已提交
2726

2727 2728
// Feed the merger with all the following valid rows of the in-memory iterator that carry the
// timestamp "ts". The iterator is moved before the first check, so the row it currently points
// at is expected to be handled by the caller. Always returns TSDB_CODE_SUCCESS.
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
                         STsdbReader* pReader) {
  while ((pIter->hasVal = tsdbTbDataIterNext(pIter->iter))) {
    // data exists but not valid
    TSDBROW* pNext = getValidMemRow(pIter, pDelList, pReader);
    if (pNext == NULL) {
      return TSDB_CODE_SUCCESS;
    }

    // ts is not identical, quit
    TSDBKEY nextKey = TSDBROW_KEY(pNext);
    if (nextKey.ts != ts) {
      return TSDB_CODE_SUCCESS;
    }

    STSchema* pRowSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pNext), pReader, uid);
    tRowMergerAdd(pMerger, pNext, pRowSchema);
  }

  return TSDB_CODE_SUCCESS;
}

2754
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
2755
                                          SVersionRange* pVerRange, int32_t step) {
2756 2757
  while (pBlockData->aTSKEY[rowIndex] == key && rowIndex < pBlockData->nRow && rowIndex >= 0) {
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
2758
      rowIndex += step;
2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
    tRowMerge(pMerger, &fRow);
    rowIndex += step;
  }

  return rowIndex;
}

// Outcome reported by checkForNeighborFileBlock() through its "state" out-parameter.
typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,  // the merge cursor ran past the rows of the neighbor block that was loaded
  CHECK_FILEBLOCK_QUIT = 0x2,  // default outcome; doMergeRowsInFileBlocks() leaves its loop on this value
} CHECK_FILEBLOCK_STATE;

H
Hongze Cheng 已提交
2775
static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SDataBlk* pBlock,
2776 2777
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
2778
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
2779
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
2780

2781
  *state = CHECK_FILEBLOCK_QUIT;
2782
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
2783

H
Hongze Cheng 已提交
2784 2785
  int32_t   nextIndex = -1;
  SDataBlk* pNeighborBlock = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &nextIndex, pReader->order);
2786
  if (pNeighborBlock == NULL) {  // do nothing
2787 2788 2789 2790
    return 0;
  }

  bool overlap = overlapWithNeighborBlock(pBlock, pNeighborBlock, pReader->order);
2791 2792
  taosMemoryFree(pNeighborBlock);

2793
  if (overlap) {  // load next block
2794
    SReaderStatus*  pStatus = &pReader->status;
2795 2796
    SDataBlockIter* pBlockIter = &pStatus->blockIter;

2797
    // 1. find the next neighbor block in the scan block list
2798
    SFileDataBlockInfo fb = {.uid = pFBlock->uid, .tbBlockIdx = nextIndex};
2799
    int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &fb);
2800

2801
    // 2. remove it from the scan block list
2802
    setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);
2803

2804
    // 3. load the neighbor block, and set it to be the currently accessed file data block
H
Haojun Liao 已提交
2805
    tBlockDataReset(&pStatus->fileBlockData);
2806 2807 2808 2809 2810 2811
    int32_t code = tBlockDataInit(&pStatus->fileBlockData, pReader->suid, pFBlock->uid, pReader->pSchema);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    code = doLoadFileBlockData(pReader, pBlockIter, &pStatus->fileBlockData);
2812 2813 2814 2815
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2816
    // 4. check the data values
2817 2818 2819 2820
    initBlockDumpInfo(pReader, pBlockIter);

    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
H
Haojun Liao 已提交
2821
    if (pDumpInfo->rowIndex >= pDumpInfo->totalRows) {
2822 2823 2824 2825 2826 2827 2828
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

  return TSDB_CODE_SUCCESS;
}

2829 2830
// Merge all the rows that share the timestamp of the current row of the file block into pMerger.
// The row at pDumpInfo->rowIndex itself is expected to be in the merger already; the cursor is
// moved past every row that has been consumed. When the current block is exhausted, the
// neighbor blocks of the same table are examined too, as long as they keep being consumed
// completely.
// Returns TSDB_CODE_SUCCESS, or the error reported while loading a neighbor block.
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                SRowMerger* pMerger) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  int32_t step = asc ? 1 : -1;

  // skip the current row, then consume the rows with an identical timestamp in this block
  pDumpInfo->rowIndex += step;
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }

  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      CHECK_FILEBLOCK_STATE st = CHECK_FILEBLOCK_QUIT;

      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
      SDataBlk*           pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);

      // a failure to load the neighbor block must reach the caller instead of being reported as
      // a successful merge
      int32_t code = checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      if (st == CHECK_FILEBLOCK_QUIT) {
        break;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
2860 2861
// Record "ts" as the last key of the table, then feed pMerger with the following rows of the
// last-block reader as long as their timestamp equals "ts". Always returns TSDB_CODE_SUCCESS.
int32_t doMergeRowsInLastBlock(SLastBlockReader* pLastBlockReader, STableBlockScanInfo* pScanInfo, int64_t ts,
                               SRowMerger* pMerger) {
  pScanInfo->lastKey = ts;

  while (nextRowFromLastBlocks(pLastBlockReader, pScanInfo)) {
    // stop at the first row that carries a different timestamp
    if (getCurrentKeyInLastBlock(pLastBlockReader) != ts) {
      break;
    }

    TSDBROW sameTsRow = tMergeTreeGetRow(&pLastBlockReader->mergeTree);
    tRowMerge(pMerger, &sameTsRow);
  }

  return TSDB_CODE_SUCCESS;
}

2876 2877
// Build the output row for the row the in-memory iterator currently points at (pRow), merging it
// with the following valid rows of the same iterator that share its timestamp.
//   pTSRow:    receives the resulting row
//   freeTSRow: set to false when *pTSRow refers to the row of the buffer itself (no merge was
//              required), set to true when *pTSRow is a newly built row the caller has to free
// The iterator is always moved past the rows that have been consumed.
// Returns TSDB_CODE_SUCCESS, or the error of tRowMergerGetRow().
int32_t doMergeMemTableMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, STSRow** pTSRow,
                                 STsdbReader* pReader, bool* freeTSRow) {
  TSDBROW* pNextRow = NULL;
  TSDBROW  current = *pRow;  // copy, the iterator is about to move

  // look ahead: has the next valid point in mem/imem the same timestamp?
  pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
  if (pIter->hasVal) {
    pNextRow = getValidMemRow(pIter, pDelList, pReader);
  }

  // no more valid row, or the next valid row has a different ts: return current row directly
  if (pNextRow == NULL || current.pTSRow->ts != pNextRow->pTSRow->ts) {
    *pTSRow = current.pTSRow;
    *freeTSRow = false;
    return TSDB_CODE_SUCCESS;
  }

  SRowMerger merge = {0};

  // get the correct schema for data in memory
  STSchema* pTSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(&current), pReader, uid);

  if (pReader->pSchema == NULL) {
    pReader->pSchema = pTSchema;
  }

  tRowMergerInit2(&merge, pReader->pSchema, &current, pTSchema);

  STSchema* pTSchema1 = doGetSchemaForTSRow(TSDBROW_SVERSION(pNextRow), pReader, uid);
  tRowMergerAdd(&merge, pNextRow, pTSchema1);

  doMergeRowsInBuf(pIter, uid, current.pTSRow->ts, pDelList, &merge, pReader);

  // release the merger on every path; it used to be leaked when building the row failed
  int32_t code = tRowMergerGetRow(&merge, pTSRow);
  tRowMergerClear(&merge);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  *freeTSRow = true;
  return TSDB_CODE_SUCCESS;
}

2929
int32_t doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
H
Hongze Cheng 已提交
2930
                           STSRow** pTSRow) {
H
Haojun Liao 已提交
2931 2932
  SRowMerger merge = {0};

2933 2934 2935
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

2936
  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
H
Haojun Liao 已提交
2937
    STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
2938

H
Haojun Liao 已提交
2939
    tRowMergerInit(&merge, piRow, pSchema);
2940
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2941

2942
    tRowMerge(&merge, pRow);
2943
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2944
  } else {
H
Haojun Liao 已提交
2945
    STSchema* pSchema = doGetSchemaForTSRow(TSDBROW_SVERSION(pRow), pReader, pBlockScanInfo->uid);
2946

H
Haojun Liao 已提交
2947
    tRowMergerInit(&merge, pRow, pSchema);
2948
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2949 2950

    tRowMerge(&merge, piRow);
2951
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2952
  }
2953

2954 2955
  int32_t code = tRowMergerGetRow(&merge, pTSRow);
  return code;
2956 2957
}

2958 2959
// Fetch the next row, in scan order, from the two in-memory buffers (mem and imem) of a table.
//   pTSRow:    receives the row; it is left untouched when neither buffer has a valid row
//              within the limit given by endKey
//   endKey:    exclusive limit: rows with ts >= endKey (asc) or ts <= endKey (desc) are ignored
//   freeTSRow: tells whether *pTSRow has to be freed by the caller
// When both buffers offer a row, the one that comes first in scan order is returned; rows with
// an identical timestamp are merged into a single row.
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STSRow** pTSRow, int64_t endKey,
                            bool* freeTSRow) {
  TSDBROW* pRow = getValidMemRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidMemRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
  SArray*  pDelList = pBlockScanInfo->delSkyline;
  uint64_t uid = pBlockScanInfo->uid;

  // todo refactor
  bool asc = ASCENDING_TRAVERSE(pReader->order);

  // drop the candidates that have reached the end key
  if (pBlockScanInfo->iter.hasVal) {
    TSDBKEY memKey = TSDBROW_KEY(pRow);
    bool    beyondEnd = asc ? (memKey.ts >= endKey) : (memKey.ts <= endKey);
    if (beyondEnd) {
      pRow = NULL;
    }
  }

  if (pBlockScanInfo->iiter.hasVal) {
    TSDBKEY imemKey = TSDBROW_KEY(piRow);
    bool    beyondEnd = asc ? (imemKey.ts >= endKey) : (imemKey.ts <= endKey);
    if (beyondEnd) {
      piRow = NULL;
    }
  }

  bool memAvailable = pBlockScanInfo->iter.hasVal && (pRow != NULL);
  bool imemAvailable = pBlockScanInfo->iiter.hasVal && (piRow != NULL);

  if (memAvailable && imemAvailable) {
    TSDBKEY k = TSDBROW_KEY(pRow);
    TSDBKEY ik = TSDBROW_KEY(piRow);

    // ik.ts == k.ts: both buffers contribute to the same output row
    if (ik.ts == k.ts) {
      *freeTSRow = true;
      return doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, pTSRow);
    }

    // otherwise the buffer whose row comes first in scan order wins
    bool imemFirst = asc ? (ik.ts < k.ts) : (ik.ts > k.ts);
    if (imemFirst) {
      return doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader, freeTSRow);
    }

    return doMergeMemTableMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader, freeTSRow);
  }

  if (memAvailable) {
    return doMergeMemTableMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader,
                                    freeTSRow);
  }

  if (imemAvailable) {
    return doMergeMemTableMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader, freeTSRow);
  }

  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
3015
// Append one row (pTSRow) at the end of the result block. The output columns and the columns of
// the row schema are walked side by side by column id; an output column that does not exist in
// the schema of the row is filled with NULL. Always returns TSDB_CODE_SUCCESS.
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow, uint64_t uid) {
  int32_t dstRow = pBlock->info.rows;
  int32_t numOfOutput = (int32_t)taosArrayGetSize(pBlock->pDataBlock);

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  STSchema*           pRowSchema = doGetSchemaForTSRow(pTSRow->sver, pReader, uid);

  SColVal value = {0};
  int32_t outIdx = 0;     // cursor in the output columns
  int32_t schemaIdx = 0;  // cursor in the columns of the row schema

  // the primary timestamp column, if requested, is taken from the row header
  SColumnInfoData* pOutCol = taosArrayGet(pBlock->pDataBlock, outIdx);
  if (pOutCol->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppend(pOutCol, dstRow, (const char*)&pTSRow->ts, false);
    outIdx += 1;
  }

  while (outIdx < numOfOutput && schemaIdx < pRowSchema->numOfCols) {
    pOutCol = taosArrayGet(pBlock->pDataBlock, outIdx);
    col_id_t colId = pOutCol->info.colId;

    if (colId == pRowSchema->columns[schemaIdx].colId) {
      tTSRowGetVal(pTSRow, pRowSchema, schemaIdx, &value);
      doCopyColVal(pOutCol, dstRow, outIdx, &value, pSupInfo);
      outIdx += 1;
      schemaIdx += 1;
    } else if (colId < pRowSchema->columns[schemaIdx].colId) {
      // the requested column is not part of the row schema
      colDataAppendNULL(pOutCol, dstRow);
      outIdx += 1;
    } else {
      // the schema column is not requested, skip it
      schemaIdx += 1;
    }
  }

  // set null value since current column does not exist in the "pSchema"
  for (; outIdx < numOfOutput; ++outIdx) {
    pOutCol = taosArrayGet(pBlock->pDataBlock, outIdx);
    colDataAppendNULL(pOutCol, dstRow);
  }

  pBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

H
Hongze Cheng 已提交
3059 3060
// Append the row "rowIndex" of a loaded file block at the end of the result block.
// The output columns and the columns of the file block are walked side by side by column id:
//   - identical id:               the value is copied
//   - file block column is lower: it has not been requested and is skipped
//   - file block column is higher (or none is left): the output column is filled with NULL
// NOTE(review): like doAppendRowFromTSRow(), this relies on both column lists being sorted by
// column id in ascending order -- confirm against the code that builds them.
// Always returns TSDB_CODE_SUCCESS.
int32_t doAppendRowFromFileBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
                                 int32_t rowIndex) {
  int32_t i = 0, j = 0;
  int32_t outputRowIndex = pResBlock->info.rows;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  // the primary timestamp column, if requested, comes from the key array of the block
  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppendInt64(pColData, outputRowIndex, &pBlockData->aTSKEY[rowIndex]);
    i += 1;
  }

  SColVal cv = {0};
  int32_t numOfInputCols = taosArrayGetSize(pBlockData->aIdx);
  int32_t numOfOutputCols = blockDataGetNumOfCols(pResBlock);

  while (i < numOfOutputCols && j < numOfInputCols) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
    SColData*        pData = tBlockDataGetColDataByIdx(pBlockData, j);

    // this column of the file block is not requested: skip it and keep the output column
    // waiting. It used to be treated as "missing", which filled the output column with NULL
    // and shifted all the following columns.
    if (pData->cid < pCol->info.colId) {
      j += 1;
      continue;
    }

    if (pData->cid == pCol->info.colId) {
      tColDataGetValue(pData, rowIndex, &cv);
      doCopyColVal(pCol, outputRowIndex, i, &cv, pSupInfo);
      j += 1;
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNULL(pCol, outputRowIndex);
    }

    i += 1;
  }

  // no column is left in the file block, the remaining output columns are NULL
  while (i < numOfOutputCols) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
    colDataAppendNULL(pCol, outputRowIndex);
    i += 1;
  }

  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

3101 3102
// Fill pReader->pResBlock with rows taken from the in-memory buffers of one table.
//   endKey:   exclusive limit handed to tsdbGetNextRowInMem()
//   capacity: maximum number of rows the result block may hold
// The loop ends when no row is available within the limit, when both buffers are exhausted, or
// when the block is full.
// Returns TSDB_CODE_SUCCESS, or the first error reported while fetching or appending a row.
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
  SSDataBlock* pBlock = pReader->pResBlock;

  do {
    STSRow* pTSRow = NULL;
    bool    freeTSRow = false;

    // a failed merge has to be reported; it used to be mistaken for "no more rows"
    int32_t code = tsdbGetNextRowInMem(pBlockScanInfo, pReader, &pTSRow, endKey, &freeTSRow);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pTSRow == NULL) {
      break;
    }

    code = doAppendRowFromTSRow(pBlock, pReader, pTSRow, pBlockScanInfo->uid);

    // a merged row is owned by this function, release it whether or not the append succeeded
    if (freeTSRow) {
      taosMemoryFree(pTSRow);
    }

    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // no data in buffer, return immediately
    if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
      break;
    }

    if (pBlock->info.rows >= capacity) {
      break;
    }
  } while (1);

  ASSERT(pBlock->info.rows <= capacity);
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
3131

3132
// todo refactor, use arraylist instead
H
Hongze Cheng 已提交
3133
/*
 * Replace the reader's table map with a single entry for table `uid`.
 *
 * The map is cleared first, then a fresh scan-info record (lastKey = 0) keyed by the uid is inserted.
 *
 * Returns TSDB_CODE_SUCCESS, or TSDB_CODE_TDB_OUT_OF_MEMORY when the record cannot be inserted.
 */
int32_t tsdbSetTableId(STsdbReader* pReader, int64_t uid) {
  ASSERT(pReader != NULL);
  taosHashClear(pReader->status.pTableMap);

  STableBlockScanInfo info = {.lastKey = 0, .uid = uid};

  // NOTE(review): taosHashPut is expected to return 0 on success; the map was just cleared, so a
  // failure here is presumably an allocation failure -- confirm against thash.h.
  if (taosHashPut(pReader->status.pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info)) != 0) {
    return TSDB_CODE_TDB_OUT_OF_MEMORY;
  }

  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
3142 3143 3144 3145 3146 3147
// Returns metaGetIdx(pMeta), or NULL when no meta handle is supplied.
void* tsdbGetIdx(SMeta* pMeta) {
  return (pMeta != NULL) ? metaGetIdx(pMeta) : NULL;
}
dengyihao's avatar
dengyihao 已提交
3148

dengyihao's avatar
dengyihao 已提交
3149 3150 3151 3152 3153 3154
// Returns metaGetIvtIdx(pMeta), or NULL when no meta handle is supplied.
void* tsdbGetIvtIdx(SMeta* pMeta) {
  return (pMeta != NULL) ? metaGetIvtIdx(pMeta) : NULL;
}
L
Liu Jicong 已提交
3155

H
Hongze Cheng 已提交
3156
// Returns the upper bound (maxVer) of the reader's version range.
uint64_t getReaderMaxVersion(STsdbReader* pReader) {
  uint64_t maxVersion = pReader->verRange.maxVer;
  return maxVersion;
}
3157

H
refact  
Hongze Cheng 已提交
3158
// ====================================== EXPOSED APIs ======================================
3159 3160
/*
 * Create a reader for the tables in pTableList and position it on the first file block (if any).
 *
 * pVnode      vnode the reader operates on
 * pCond       query condition; for TIMEWINDOW_RANGE_EXTERNAL its twindows/order fields are rewritten
 *             in place while the two inner readers are created and are NOT restored afterwards
 * pTableList  array of STableKeyInfo describing the tables to scan
 * ppReader    receives the new reader; reset to NULL when the table map cannot be built
 * idstr       identifier string used in log output
 *
 * Returns TSDB_CODE_SUCCESS or the error code of the step that failed. When the query window is
 * empty the reader is returned right after creation, without table map or snapshot.
 */
int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, SArray* pTableList, STsdbReader** ppReader,
                       const char* idstr) {
  // Declared (and nulled) up front: the error path below can be reached before a reader exists.
  STsdbReader* pReader = NULL;
  int32_t      numOfTables = 0;

  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, 4096, idstr);
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }

  // check for query time window
  pReader = *ppReader;
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }

  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
    STimeWindow w = pCond->twindows;
    int32_t     order = pCond->order;
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.ekey = pCond->twindows.skey;
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
      pCond->twindows.skey = pCond->twindows.ekey;
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

    // here we only need one more row, so the capacity is set to be ONE.
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[0], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    // NOTE(review): pCond->order still holds the flipped value from above when the second inner
    // reader is created -- confirm this is intended.
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.skey = w.ekey;
      pCond->twindows.ekey = INT64_MAX;
    } else {
      pCond->twindows.skey = INT64_MIN;
      pCond->twindows.ekey = w.ekey;
    }
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[1], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  // schema: taken from the super table when there is one, otherwise from the first listed table
  if (pCond->suid != 0) {
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, pCond->endVersion);
  } else if (taosArrayGetSize(pTableList) > 0) {
    STableKeyInfo* pKey = taosArrayGet(pTableList, 0);
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, pCond->endVersion);
  }

  numOfTables = taosArrayGetSize(pTableList);
  pReader->status.pTableMap = createDataBlockScanInfo(pReader, pTableList->pData, numOfTables);
  if (pReader->status.pTableMap == NULL) {
    tsdbReaderClose(pReader);
    *ppReader = NULL;
    pReader = NULL;  // freed above; must not be touched on the error path

    code = TSDB_CODE_TDB_OUT_OF_MEMORY;
    goto _err;
  }

  code = tsdbTakeReadSnap(pReader->pTsdb, &pReader->pReadSnap);
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }

  if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
    SDataBlockIter* pBlockIter = &pReader->status.blockIter;

    initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
    resetDataBlockIterator(&pReader->status.blockIter, pReader->order);

    // no data in files, let's try buffer in memory
    if (pReader->status.fileIter.numOfFiles == 0) {
      pReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  } else {
    // external window: start with the inner reader that yields the row preceding the window
    STsdbReader*    pPrevReader = pReader->innerReader[0];
    SDataBlockIter* pBlockIter = &pPrevReader->status.blockIter;

    code = tsdbTakeReadSnap(pPrevReader->pTsdb, &pPrevReader->pReadSnap);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    initFilesetIterator(&pPrevReader->status.fileIter, pPrevReader->pReadSnap->fs.aDFileSet, pPrevReader);
    resetDataBlockIterator(&pPrevReader->status.blockIter, pPrevReader->order);

    // no data in files, let's try buffer in memory
    if (pPrevReader->status.fileIter.numOfFiles == 0) {
      pPrevReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pPrevReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        goto _err;
      }
    }
  }

  tsdbDebug("%p total numOfTable:%d in this query %s", pReader, numOfTables, pReader->idStr);
  return code;

_err:
  // Log with the caller-supplied id string: the reader may not exist (creation failed) or may
  // already have been released (table map failure) when control arrives here.
  tsdbError("failed to create data reader, code:%s %s", tstrerror(code), (idstr != NULL) ? idstr : "");
  return code;
}

/*
 * Release a reader and everything it owns: inner readers, column load buffers, file block data,
 * block iterator, table map, result block, file reader, read snapshot, last-block reader, id string
 * and schemas. An io-cost summary is written to the debug log before the reader itself is freed.
 *
 * A NULL reader is ignored.
 */
void tsdbReaderClose(STsdbReader* pReader) {
  if (pReader == NULL) {
    return;
  }

  // Inner readers are normally closed by tsdbNextDataBlock() once drained; close whatever is left
  // so that a reader closed before being fully consumed does not leak them.
  tsdbReaderClose(pReader->innerReader[0]);
  pReader->innerReader[0] = NULL;
  tsdbReaderClose(pReader->innerReader[1]);
  pReader->innerReader[1] = NULL;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  taosMemoryFreeClear(pSupInfo->plist);
  taosMemoryFree(pSupInfo->colIds);

  taosArrayDestroy(pSupInfo->pColAgg);
  for (int32_t i = 0; i < blockDataGetNumOfCols(pReader->pResBlock); ++i) {
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }

  taosMemoryFree(pSupInfo->buildBuf);
  tBlockDataDestroy(&pReader->status.fileBlockData, true);

  cleanupDataBlockIterator(&pReader->status.blockIter);

  // remember the table count for the summary below before the map is destroyed
  size_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
  destroyBlockScanInfo(pReader->status.pTableMap);
  blockDataDestroy(pReader->pResBlock);

  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }

  tsdbUntakeReadSnap(pReader->pTsdb, pReader->pReadSnap);

  taosMemoryFree(pReader->status.uidCheckInfo.tableUidList);

  SFilesetIter* pFilesetIter = &pReader->status.fileIter;
  if (pFilesetIter->pLastBlockReader != NULL) {
    tMergeTreeClose(&pFilesetIter->pLastBlockReader->mergeTree);
    taosMemoryFree(pFilesetIter->pLastBlockReader);
  }

  SIOCostSummary* pCost = &pReader->cost;

  tsdbDebug("%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
            " SMA-time:%.2f ms, fileBlocks:%" PRId64
            ", fileBlocks-time:%.2f ms, "
            "build in-memory-block-time:%.2f ms, lastBlocks:%" PRId64
            ", lastBlocks-time:%.2f ms, STableBlockScanInfo size:%.2f Kb %s",
            pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaDataLoad, pCost->smaLoadTime,
            pCost->numOfBlocks, pCost->blockLoadTime, pCost->buildmemBlock, pCost->lastBlockLoad,
            pCost->lastBlockLoadTime, numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pReader->idStr);

  taosMemoryFree(pReader->idStr);

  // pMemSchema may alias pSchema; decide that while both pointers are still valid, then free.
  if (pReader->pMemSchema != pReader->pSchema) {
    taosMemoryFree(pReader->pMemSchema);
  }
  taosMemoryFree(pReader->pSchema);

  taosMemoryFreeClear(pReader);
}

3333
/*
 * Produce the next result block for a single reader.
 *
 * The result block is cleaned first. When the reader is still loading from files, a block is built
 * from the files; if that fails the function reports false, and if it yields rows it reports true.
 * Otherwise (no file data requested, or the file step produced no rows) the block is built from
 * the in-memory buffers.
 *
 * Returns true when the result block contains at least one row.
 */
static bool doTsdbNextDataBlock(STsdbReader* pReader) {
  SSDataBlock*   pResBlock = pReader->pResBlock;
  SReaderStatus* pStatus = &pReader->status;

  // cleanup the data that belongs to the previous data block
  blockDataCleanup(pResBlock);

  if (pStatus->loadFromFile) {
    if (buildBlockFromFiles(pReader) != TSDB_CODE_SUCCESS) {
      return false;
    }

    if (pResBlock->info.rows > 0) {
      return true;
    }
  }

  // nothing (more) from files, let's try the buffer
  buildBlockFromBufferSequentially(pReader);
  return pResBlock->info.rows > 0;
}

3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396
/*
 * Advance the reader to its next data block.
 *
 * Sources are tried in order: innerReader[0], the reader itself, then innerReader[1]. pReader->step
 * records which source produced the block (EXTERNAL_ROWS_PREV / _MAIN / _NEXT). An inner reader
 * that has nothing more to return is closed and its slot is set to NULL.
 *
 * Returns false when the query window is empty or every source is exhausted.
 */
bool tsdbNextDataBlock(STsdbReader* pReader) {
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return false;
  }

  STsdbReader* pPrev = pReader->innerReader[0];
  if (pPrev != NULL) {
    if (doTsdbNextDataBlock(pPrev)) {
      pReader->step = EXTERNAL_ROWS_PREV;
      return true;
    }

    tsdbReaderClose(pPrev);
    pReader->innerReader[0] = NULL;
  }

  pReader->step = EXTERNAL_ROWS_MAIN;
  if (doTsdbNextDataBlock(pReader)) {
    return true;
  }

  STsdbReader* pNext = pReader->innerReader[1];
  if (pNext != NULL) {
    if (doTsdbNextDataBlock(pNext)) {
      pReader->step = EXTERNAL_ROWS_NEXT;
      return true;
    }

    tsdbReaderClose(pNext);
    pReader->innerReader[1] = NULL;
  }

  return false;
}

// Copy row count, table uid and time window of the reader's result block into *pDataBlockInfo.
static void setBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
  ASSERT(pDataBlockInfo != NULL && pReader != NULL);

  SSDataBlock* pResBlock = pReader->pResBlock;

  pDataBlockInfo->rows = pResBlock->info.rows;
  pDataBlockInfo->uid = pResBlock->info.uid;
  pDataBlockInfo->window = pResBlock->info.window;
}

3403 3404
/*
 * Report rows/uid/window of the block produced by the last tsdbNextDataBlock() call.
 *
 * For TIMEWINDOW_RANGE_EXTERNAL readers the information comes from whichever reader pReader->step
 * designates: the reader itself for EXTERNAL_ROWS_MAIN, innerReader[0] for EXTERNAL_ROWS_PREV and
 * innerReader[1] for any other step. All other reader types report their own result block.
 */
void tsdbRetrieveDataBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
  STsdbReader* pSource = pReader;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL && pReader->step != EXTERNAL_ROWS_MAIN) {
    pSource = (pReader->step == EXTERNAL_ROWS_PREV) ? pReader->innerReader[0] : pReader->innerReader[1];
  }

  setBlockInfo(pSource, pDataBlockInfo);
}

3417
/*
 * Load the pre-computed column aggregates (SMA) of the current file block.
 *
 * pBlockStatis  receives pReader->suppInfo.plist on success; set to NULL when no statistics are
 *               available (external-window reader, composed block, or block without SMA)
 * allHave       set to true only when SMA was loaded and no matched column had its BSMA flag
 *               off; false otherwise
 *
 * Returns TSDB_CODE_SUCCESS, or the error code reported by tsdbReadBlockSma().
 */
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SColumnDataAgg*** pBlockStatis, bool* allHave) {
  int32_t code = 0;
  *allHave = false;

  // no statistics for external-window readers, and none for composed blocks
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL || pReader->status.composedDataBlock) {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }

  SDataBlockIter*     pIter = &pReader->status.blockIter;
  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(pIter);
  SDataBlk*           pDataBlk = getCurrentBlock(pIter);
  int64_t             startUs = taosGetTimestampUs();
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

  if (!tDataBlkHasSma(pDataBlk)) {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }

  code = tsdbReadBlockSma(pReader->pFileReader, pDataBlk, pSup->pColAgg);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pBlockInfo->uid, tstrerror(code),
              pReader->idStr);
    return code;
  }

  *allHave = true;

  // slot 0 always describes the primary timestamp column, derived from the result block window
  SColumnDataAgg* pTsAgg = &pSup->tsColAgg;
  pTsAgg->numOfNull = 0;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  pTsAgg->min = pReader->pResBlock->info.window.skey;
  pTsAgg->max = pReader->pResBlock->info.window.ekey;
  pSup->plist[0] = pTsAgg;

  // walk the loaded aggregates and the requested column ids in lock-step, matching on column id
  size_t  numOfCols = blockDataGetNumOfCols(pReader->pResBlock);
  size_t  numOfAggs = taosArrayGetSize(pSup->pColAgg);
  int32_t aggIdx = 0;
  int32_t colIdx = 0;

  while (colIdx < numOfCols && aggIdx < numOfAggs) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, aggIdx);

    if (pAgg->colId < pSup->colIds[colIdx]) {
      aggIdx += 1;
      continue;
    }

    if (pSup->colIds[colIdx] < pAgg->colId) {
      colIdx += 1;
      continue;
    }

    // same column id on both sides
    if (IS_BSMA_ON(&(pReader->pSchema->columns[aggIdx]))) {
      pSup->plist[colIdx] = pAgg;
    } else {
      *allHave = false;
    }

    aggIdx += 1;
    colIdx += 1;
  }

  double elapsed = (taosGetTimestampUs() - startUs) / 1000.0;
  pReader->cost.smaLoadTime += elapsed;
  pReader->cost.smaDataLoad += 1;

  *pBlockStatis = pSup->plist;

  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", elapsed time:%.2f ms, %s", 0, pBlockInfo->uid,
            elapsed, pReader->idStr);

  return code;
}

3495
/*
 * Return the column array of the reader's current result block.
 *
 * A composed block is already materialised in pResBlock and is returned directly. Otherwise the
 * current file block is loaded into status.fileBlockData and copied into pResBlock first.
 *
 * Returns pResBlock->pDataBlock, or NULL with terrno set when the block's table is not part of the
 * reader's table map or the file block cannot be initialised/loaded.
 */
static SArray* doRetrieveDataBlock(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;

  if (pStatus->composedDataBlock) {
    return pReader->pResBlock->pDataBlock;
  }

  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(&pStatus->blockIter);
  STableBlockScanInfo* pBlockScanInfo = taosHashGet(pStatus->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
  if (pBlockScanInfo == NULL) {
    // the block belongs to a table this reader does not track; fail instead of dereferencing NULL
    terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
    return NULL;
  }

  tBlockDataReset(&pStatus->fileBlockData);
  int32_t code = tBlockDataInit(&pStatus->fileBlockData, pReader->suid, pBlockScanInfo->uid, pReader->pSchema);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    return NULL;
  }

  code = doLoadFileBlockData(pReader, &pStatus->blockIter, &pStatus->fileBlockData);
  if (code != TSDB_CODE_SUCCESS) {
    tBlockDataDestroy(&pStatus->fileBlockData, 1);
    terrno = code;
    return NULL;
  }

  copyBlockDataToSDataBlock(pReader, pBlockScanInfo);
  return pReader->pResBlock->pDataBlock;
}

3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534
/*
 * Return the column data of the block produced by the last tsdbNextDataBlock() call.
 *
 * For TIMEWINDOW_RANGE_EXTERNAL readers positioned on EXTERNAL_ROWS_PREV / EXTERNAL_ROWS_NEXT the
 * data comes from innerReader[0] / innerReader[1]; in every other case from the reader itself.
 * pIdList is not used.
 */
SArray* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
  STsdbReader* pSource = pReader;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    if (pReader->step == EXTERNAL_ROWS_PREV) {
      pSource = pReader->innerReader[0];
    } else if (pReader->step == EXTERNAL_ROWS_NEXT) {
      pSource = pReader->innerReader[1];
    }
  }

  return doRetrieveDataBlock(pSource);
}

H
Haojun Liao 已提交
3535
/*
 * Rewind an existing reader so it can be scanned again with the order and time window of pCond.
 *
 * The reader type is forced to TIMEWINDOW_RANGE_CONTAINED, the timestamp aggregate and the first
 * plist slot are zeroed, the open file reader is closed, the file-set and block iterators are
 * re-initialised and every table's scan info is reset to the start key of the new window (skey
 * for ascending order, ekey otherwise). Nothing is done when the reader's current window is empty.
 *
 * Returns TSDB_CODE_SUCCESS, or the error code of initForFirstBlockInFile().
 */
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return TSDB_CODE_SUCCESS;
  }

  SReaderStatus*      pStatus = &pReader->status;
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

  pReader->order = pCond->order;
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
  pStatus->loadFromFile = true;
  pStatus->pTableIter = NULL;
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);

  // clear the aggregate bookkeeping used when loading data blocks from file
  memset(&pSup->tsColAgg, 0, sizeof(SColumnDataAgg));
  memset(pSup->plist, 0, POINTER_BYTES);

  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  tsdbDataFReaderClose(&pReader->pFileReader);

  int32_t numOfTables = taosHashGetSize(pStatus->pTableMap);

  initFilesetIterator(&pStatus->fileIter, pReader->pReadSnap->fs.aDFileSet, pReader);
  resetDataBlockIterator(&pStatus->blockIter, pReader->order);

  int64_t startKey = pReader->window.ekey;
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = pReader->window.skey;
  }
  resetDataBlockScanInfo(pStatus->pTableMap, startKey);

  int32_t code = 0;

  if (pStatus->fileIter.numOfFiles == 0) {
    // no data in files, let's try buffer in memory
    pStatus->loadFromFile = false;
  } else {
    code = initForFirstBlockInFile(pReader, &pStatus->blockIter);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return code;
    }
  }

  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s",
            pReader, pReader->suid, numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);

  return code;
}
H
Hongze Cheng 已提交
3581

3582 3583 3584
/*
 * Map a block's row count onto a histogram bucket.
 *
 * startRow     row count at which bucket 0 begins
 * bucketRange  number of row-count values covered by one bucket
 * numOfRows    row count of the block being classified
 *
 * Returns (numOfRows - startRow) / bucketRange. Row counts at or below startRow, and a
 * non-positive bucketRange, map to bucket 0 so the result is never negative and the division can
 * never be by zero. The caller is responsible for the upper bound.
 */
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows) {
  if (bucketRange <= 0 || numOfRows <= startRow) {
    return 0;
  }

  return (numOfRows - startRow) / bucketRange;
}
H
Hongze Cheng 已提交
3585

3586 3587 3588 3589
/*
 * Collect block distribution statistics for the tables of a reader.
 *
 * totalSize/totalRows are reset; numOfFiles, numOfBlocks, numOfSmallBlocks, min/maxRows and the
 * row-count histogram are accumulated on top of the values already stored in pTableBlockInfo.
 * Every file block reachable through the block iterator is visited, advancing file by file with
 * initForFirstBlockInFile() until that call fails or the reader stops loading from files.
 *
 * Returns the last code reported by initForFirstBlockInFile() (TSDB_CODE_SUCCESS when the scan
 * simply ran out of files).
 */
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  // The [minRows, maxRows] span is split into this many histogram buckets.
  // NOTE(review): assumes blockRowsHisto has at least this many entries -- confirm against the
  // STableBlockDistInfo definition.
  const int32_t numOfBuckets = 20;

  int32_t code = TSDB_CODE_SUCCESS;
  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;

  // find the start data block in file
  SReaderStatus* pStatus = &pReader->status;

  STsdbCfg* pc = &pReader->pTsdb->pVnode->config.tsdbCfg;
  pTableBlockInfo->defMinRows = pc->minRows;
  pTableBlockInfo->defMaxRows = pc->maxRows;

  int32_t bucketRange = ceil((pc->maxRows - pc->minRows) / (double)numOfBuckets);
  if (bucketRange < 1) {
    bucketRange = 1;  // maxRows <= minRows: avoid a zero/negative divisor below
  }

  pTableBlockInfo->numOfFiles += 1;

  int32_t numOfTables = (int32_t)taosHashGetSize(pStatus->pTableMap);
  int     defaultRows = 4096;

  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;

  if (pBlockIter->numOfBlocks > 0) {
    pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
  }

  pTableBlockInfo->numOfTables = numOfTables;
  bool hasNext = (pBlockIter->numOfBlocks > 0);

  while (true) {
    if (hasNext) {
      SDataBlk* pBlock = getCurrentBlock(pBlockIter);

      int32_t numOfRows = pBlock->nRow;
      pTableBlockInfo->totalRows += numOfRows;

      if (numOfRows > pTableBlockInfo->maxRows) {
        pTableBlockInfo->maxRows = numOfRows;
      }

      if (numOfRows < pTableBlockInfo->minRows) {
        pTableBlockInfo->minRows = numOfRows;
      }

      if (numOfRows < defaultRows) {
        pTableBlockInfo->numOfSmallBlocks += 1;
      }

      // Blocks smaller than minRows or at/above maxRows would index outside the histogram;
      // fold them into the first/last bucket instead.
      int32_t bucketIndex = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, numOfRows);
      if (bucketIndex < 0) {
        bucketIndex = 0;
      } else if (bucketIndex >= numOfBuckets) {
        bucketIndex = numOfBuckets - 1;
      }
      pTableBlockInfo->blockRowsHisto[bucketIndex]++;

      hasNext = blockIteratorNext(&pStatus->blockIter);
    } else {
      // current file exhausted, move on to the next one
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
        break;
      }

      pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
      hasNext = (pBlockIter->numOfBlocks > 0);
    }
  }

  return code;
}
H
Hongze Cheng 已提交
3654

H
refact  
Hongze Cheng 已提交
3655
/*
 * Count the rows held in the memory tables of the reader's snapshot for every table in its map.
 *
 * Both the mutable (pMem) and immutable (pIMem) memory tables captured in pReader->pReadSnap are
 * consulted. The reader's table iterator (status.pTableIter) is used for the walk and is NULL on
 * return.
 */
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
  int64_t rows = 0;

  SReaderStatus* pStatus = &pReader->status;
  STsdbReadSnap* pSnap = pReader->pReadSnap;

  pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);

  while (pStatus->pTableIter != NULL) {
    STableBlockScanInfo* pBlockScanInfo = pStatus->pTableIter;

    // Test the snapshot's own pointers: the live pTsdb->mem/imem may have changed since the
    // snapshot was taken and need not agree with what the snapshot references.
    if (pSnap->pMem != NULL) {
      STbData* d = tsdbGetTbDataFromMemTable(pSnap->pMem, pReader->suid, pBlockScanInfo->uid);
      if (d != NULL) {
        rows += tsdbGetNRowsInTbData(d);
      }
    }

    if (pSnap->pIMem != NULL) {
      STbData* di = tsdbGetTbDataFromMemTable(pSnap->pIMem, pReader->suid, pBlockScanInfo->uid);
      if (di != NULL) {
        rows += tsdbGetNRowsInTbData(di);
      }
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
  }

  return rows;
}
D
dapan1121 已提交
3686

L
Liu Jicong 已提交
3687
/*
 * Look up the schema of table `uid`.
 *
 * pSchema  receives the result of metaGetTbTSchema() for `uid` at the resolved schema version
 * suid     receives the super table uid for a child table, 0 otherwise
 *
 * For a child table the schema version is read from its super table entry; for any other table
 * (asserted to be a normal table) from the table's own entry.
 *
 * Returns TSDB_CODE_SUCCESS, or TSDB_CODE_TDB_INVALID_TABLE_ID (also stored in terrno) when a meta
 * entry cannot be found.
 */
int32_t tsdbGetTableSchema(SVnode* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
  int32_t     sversion = 1;
  SMetaReader mr = {0};

  metaReaderInit(&mr, pVnode->pMeta, 0);

  if (metaGetTableEntryByUid(&mr, uid) != TSDB_CODE_SUCCESS) {
    goto _invalid_table;
  }

  *suid = 0;

  if (mr.me.type != TSDB_CHILD_TABLE) {
    ASSERT(mr.me.type == TSDB_NORMAL_TABLE);
    sversion = mr.me.ntbEntry.schemaRow.version;
  } else {
    // the decoder is cleared before the same meta reader fetches the super table entry
    tDecoderClear(&mr.coder);
    *suid = mr.me.ctbEntry.suid;

    if (metaGetTableEntryByUid(&mr, *suid) != TSDB_CODE_SUCCESS) {
      goto _invalid_table;
    }
    sversion = mr.me.stbEntry.schemaRow.version;
  }

  metaReaderClear(&mr);
  *pSchema = metaGetTbTSchema(pVnode->pMeta, uid, sversion);

  return TSDB_CODE_SUCCESS;

_invalid_table:
  terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
  metaReaderClear(&mr);
  return terrno;
}
H
Hongze Cheng 已提交
3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750

/*
 * Capture a read snapshot of a tsdb instance.
 *
 * Under the tsdb read lock the current mem/imem tables are recorded and referenced, and a
 * reference to the file system state is taken via tsdbFSRef().
 *
 * ppSnap  receives the newly allocated snapshot on success and NULL on failure
 *
 * Returns 0 on success. On failure every reference already taken is dropped and the snapshot is
 * freed, so the caller has nothing to release.
 */
int32_t tsdbTakeReadSnap(STsdb* pTsdb, STsdbReadSnap** ppSnap) {
  int32_t code = 0;
  bool    fsRefed = false;

  // alloc
  STsdbReadSnap* pSnap = taosMemoryCalloc(1, sizeof(*pSnap));
  *ppSnap = pSnap;
  if (pSnap == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  // lock
  code = taosThreadRwlockRdlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _fail;
  }

  // take snapshot
  pSnap->pMem = pTsdb->mem;
  pSnap->pIMem = pTsdb->imem;

  if (pSnap->pMem) {
    tsdbRefMemTable(pSnap->pMem);
  }

  if (pSnap->pIMem) {
    tsdbRefMemTable(pSnap->pIMem);
  }

  // fs
  code = tsdbFSRef(pTsdb, &pSnap->fs);
  if (code) {
    taosThreadRwlockUnlock(&pTsdb->rwLock);
    goto _fail;
  }
  fsRefed = true;

  // unlock
  code = taosThreadRwlockUnlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _fail;
  }

  tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode));
  return code;

_fail:
  // Undo whatever was acquired. pMem/pIMem are still NULL (calloc) if the lock was never taken.
  if (pSnap->pMem) {
    tsdbUnrefMemTable(pSnap->pMem);
  }

  if (pSnap->pIMem) {
    tsdbUnrefMemTable(pSnap->pIMem);
  }

  if (fsRefed) {
    tsdbFSUnref(pTsdb, &pSnap->fs);
  }

  taosMemoryFree(pSnap);
  *ppSnap = NULL;
  return code;
}

/*
 * Release a snapshot obtained from tsdbTakeReadSnap().
 *
 * Drops the references on the recorded mem/imem tables (when present) and on the file system
 * state, then frees the snapshot. A NULL snapshot is accepted; the trace line is written either way.
 */
void tsdbUntakeReadSnap(STsdb* pTsdb, STsdbReadSnap* pSnap) {
  if (pSnap != NULL) {
    if (pSnap->pMem != NULL) {
      tsdbUnrefMemTable(pSnap->pMem);
    }

    if (pSnap->pIMem != NULL) {
      tsdbUnrefMemTable(pSnap->pIMem);
    }

    tsdbFSUnref(pTsdb, &pSnap->fs);
    taosMemoryFree(pSnap);
  }

  tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
}