tsdbRead.c 104.3 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Hongze Cheng 已提交
16
#include "tsdb.h"
17
// Evaluates to true when the scan order `o` is ascending. The argument is parenthesized so that
// compound expressions (e.g. a ternary) are compared as a whole.
#define ASCENDING_TRAVERSE(o) ((o) == TSDB_ORDER_ASC)
H
Hongze Cheng 已提交
18

19 20 21 22 23 24
// Tags for the kind of rows a reader produces.
// NOTE(review): presumably PREV/NEXT denote the rows just outside the main query window - confirm against users.
typedef enum {
  EXTERNAL_ROWS_PREV = 1,
  EXTERNAL_ROWS_MAIN = 2,
  EXTERNAL_ROWS_NEXT = 3,
} EContentData;

25
typedef struct {
dengyihao's avatar
dengyihao 已提交
26
  STbDataIter* iter;
27 28 29 30
  int32_t      index;
  bool         hasVal;
} SIterInfo;

H
Haojun Liao 已提交
31
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
32 33
  uint64_t  uid;
  TSKEY     lastKey;
34
  SMapData  mapData;     // block info (compressed)
dengyihao's avatar
dengyihao 已提交
35 36 37 38 39 40
  SArray*   pBlockList;  // block data index list
  SIterInfo iter;        // mem buffer skip list iterator
  SIterInfo iiter;       // imem buffer skip list iterator
  SArray*   delSkyline;  // delete info for this table
  int32_t   fileDelIndex;
  bool      iterInit;  // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
41 42 43
} STableBlockScanInfo;

typedef struct SBlockOrderWrapper {
dengyihao's avatar
dengyihao 已提交
44
  int64_t uid;
45
  int64_t offset;
H
Haojun Liao 已提交
46
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
47 48

typedef struct SBlockOrderSupporter {
49 50 51 52
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
53 54 55
} SBlockOrderSupporter;

typedef struct SIOCostSummary {
56 57 58
  int64_t numOfBlocks;
  double  blockLoadTime;
  double  buildmemBlock;
59
  int64_t headFileLoad;
60 61 62
  double  headFileLoadTime;
  int64_t smaData;
  double  smaLoadTime;
H
Hongze Cheng 已提交
63 64 65
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
66
  SArray*          pColAgg;
67
  SColumnDataAgg   tsColAgg;
C
Cary Xu 已提交
68
  SColumnDataAgg** plist;
69 70
  int16_t*         colIds;    // column ids for loading file block data
  char**           buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
H
Hongze Cheng 已提交
71 72
} SBlockLoadSuppInfo;

73
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
74 75 76 77
  int32_t numOfFiles;  // number of total files
  int32_t index;       // current accessed index in the list
  SArray* pFileList;   // data file list
  int32_t order;
78
} SFilesetIter;
H
Haojun Liao 已提交
79 80

typedef struct SFileDataBlockInfo {
81
  // index position in STableBlockScanInfo in order to check whether neighbor block overlaps with it
dengyihao's avatar
dengyihao 已提交
82
  uint64_t uid;
83
  int32_t  tbBlockIdx;
H
Haojun Liao 已提交
84 85 86
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
87 88 89 90 91
  int32_t   numOfBlocks;
  int32_t   index;
  SArray*   blockList;  // SArray<SFileDataBlockInfo>
  int32_t   order;
  SBlock    block;  // current SBlock data
92
  SHashObj* pTableMap;
H
Haojun Liao 已提交
93 94 95
} SDataBlockIter;

typedef struct SFileBlockDumpInfo {
dengyihao's avatar
dengyihao 已提交
96 97 98 99
  int32_t totalRows;
  int32_t rowIndex;
  int64_t lastKey;
  bool    allDumped;
H
Haojun Liao 已提交
100 101
} SFileBlockDumpInfo;

H
Haojun Liao 已提交
102
typedef struct SVersionRange {
dengyihao's avatar
dengyihao 已提交
103 104
  uint64_t minVer;
  uint64_t maxVer;
H
Haojun Liao 已提交
105 106
} SVersionRange;

H
Haojun Liao 已提交
107
typedef struct SReaderStatus {
dengyihao's avatar
dengyihao 已提交
108 109
  bool                 loadFromFile;  // check file stage
  SHashObj*            pTableMap;     // SHash<STableBlockScanInfo>
110
  STableBlockScanInfo* pTableIter;    // table iterator used in building in-memory buffer data blocks.
111
  SFileBlockDumpInfo   fBlockDumpInfo;
112 113 114 115 116
  SDFileSet*           pCurrentFileset;  // current opened file set
  SBlockData           fileBlockData;
  SFilesetIter         fileIter;
  SDataBlockIter       blockIter;
  bool                 composedDataBlock;  // the returned data block is a composed block or not
H
Haojun Liao 已提交
117 118
} SReaderStatus;

H
Hongze Cheng 已提交
119
struct STsdbReader {
H
Haojun Liao 已提交
120 121 122 123 124 125 126
  STsdb*             pTsdb;
  uint64_t           suid;
  int16_t            order;
  STimeWindow        window;  // the primary query time window that applies to all queries
  SSDataBlock*       pResBlock;
  int32_t            capacity;
  SReaderStatus      status;
127 128
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
129
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
130
  STsdbReadSnap*     pReadSnap;
131 132 133 134
  SIOCostSummary     cost;
  STSchema*          pSchema;
  SDataFReader*      pFileReader;
  SVersionRange      verRange;
135

136 137
  int32_t      step;
  STsdbReader* innerReader[2];
H
Hongze Cheng 已提交
138
};
H
Hongze Cheng 已提交
139

H
Haojun Liao 已提交
140
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
141 142
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
143
static TSDBROW* getValidRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
144 145
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                        SRowMerger* pMerger);
146
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
dengyihao's avatar
dengyihao 已提交
147
                                 STsdbReader* pReader);
148
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow);
149 150
static int32_t  doAppendRowFromBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData,
                                     int32_t rowIndex);
151 152
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
static void     updateSchema(TSDBROW* pRow, uint64_t uid, STsdbReader* pReader);
153
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order);
154

dengyihao's avatar
dengyihao 已提交
155 156
static void doMergeMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, STSRow** pTSRow,
                             STsdbReader* pReader);
157 158
static void doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
                               STSRow** pTSRow);
dengyihao's avatar
dengyihao 已提交
159 160 161 162
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
163
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Haojun Liao 已提交
164

165 166 167
// Build the column-id list and the per-column temporary string buffers used when loading file blocks.
//
// pReader: reader whose suppInfo.colIds / suppInfo.buildBuf are allocated here.
// pBlock:  result block whose columns define the ids, in slot order.
//
// Returns TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when any allocation fails.
static int32_t setColumnIdSlotList(STsdbReader* pReader, SSDataBlock* pBlock) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  size_t numOfCols = blockDataGetNumOfCols(pBlock);

  pSupInfo->colIds = taosMemoryMalloc(numOfCols * sizeof(int16_t));
  pSupInfo->buildBuf = taosMemoryCalloc(numOfCols, POINTER_BYTES);
  if (pSupInfo->buildBuf == NULL || pSupInfo->colIds == NULL) {
    taosMemoryFree(pSupInfo->colIds);
    taosMemoryFree(pSupInfo->buildBuf);
    // do not leave dangling pointers behind: the reader may be torn down later and free them again
    // NOTE(review): confirm the reader teardown tolerates a NULL buildBuf
    pSupInfo->colIds = NULL;
    pSupInfo->buildBuf = NULL;
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  for (size_t i = 0; i < numOfCols; ++i) {
    SColumnInfoData* pCol = taosArrayGet(pBlock->pDataBlock, i);
    pSupInfo->colIds[i] = pCol->info.colId;

    // only variable-length columns need a scratch buffer
    if (IS_VAR_DATA_TYPE(pCol->info.type)) {
      pSupInfo->buildBuf[i] = taosMemoryMalloc(pCol->info.bytes);
      if (pSupInfo->buildBuf[i] == NULL) {
        // buffers allocated so far remain owned by suppInfo; the remaining slots are NULL (calloc)
        return TSDB_CODE_OUT_OF_MEMORY;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
189

190
// Create the table map (uid -> STableBlockScanInfo) for the queried tables.
//
// pTsdbReader: reader providing the scan order and query window used to seed lastKey.
// idList:      the tables to scan; only the uid of each entry is read.
// numOfTables: number of entries in idList.
//
// Returns the new hash map, or NULL (with terrno set for insertion failures) when it cannot be built.
static SHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, const STableKeyInfo* idList, int32_t numOfTables) {
  // allocate buffer in order to load data blocks from file
  // todo use simple hash instead, optimize the memory consumption
  SHashObj* pTableMap =
      taosHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK);
  if (pTableMap == NULL) {
    return NULL;
  }

  for (int32_t j = 0; j < numOfTables; ++j) {
    STableBlockScanInfo info = {.lastKey = 0, .uid = idList[j].uid};
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
      if (info.lastKey == INT64_MIN || info.lastKey < pTsdbReader->window.skey) {
        info.lastKey = pTsdbReader->window.skey;
      }

      ASSERT(info.lastKey >= pTsdbReader->window.skey && info.lastKey <= pTsdbReader->window.ekey);
    } else {
      info.lastKey = pTsdbReader->window.skey;
    }

    // a failed insertion would silently drop the table from the scan, so treat it as fatal;
    // the entries stored so far own no resources yet, a plain cleanup is sufficient
    if (taosHashPut(pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info)) != 0) {
      taosHashCleanup(pTableMap);
      terrno = TSDB_CODE_OUT_OF_MEMORY;
      return NULL;
    }

    tsdbDebug("%p check table uid:%" PRIu64 " from lastKey:%" PRId64 " %s", pTsdbReader, info.uid, info.lastKey,
              pTsdbReader->idStr);
  }

  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->idStr);

  return pTableMap;
}
H
Hongze Cheng 已提交
221

222 223 224
// Reset the in-memory scan state of every table in the map so that the iterators are rebuilt on next use.
// Both the mem and the imem iterator are released (as in destroyBlockScanInfo); file-block state
// (mapData, pBlockList) is left untouched.
static void resetDataBlockScanInfo(SHashObj* pTableMap) {
  STableBlockScanInfo* p = NULL;

  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
    p->iterInit = false;
    p->iter.hasVal = false;
    p->iiter.hasVal = false;

    if (p->iter.iter != NULL) {
      p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
    }

    // release the imem iterator as well; with iterInit cleared it would otherwise be overwritten and leaked
    if (p->iiter.iter != NULL) {
      p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
    }

    p->delSkyline = taosArrayDestroy(p->delSkyline);
  }
}

236 237 238 239 240 241 242 243
// Release every resource owned by the per-table scan entries, then free the table map itself.
static void destroyBlockScanInfo(SHashObj* pTableMap) {
  STableBlockScanInfo* p = NULL;

  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
    p->iterInit = false;
    p->iiter.hasVal = false;

    if (p->iter.iter != NULL) {
      p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
    }

    if (p->iiter.iter != NULL) {
      p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
    }

    p->delSkyline = taosArrayDestroy(p->delSkyline);
    p->pBlockList = taosArrayDestroy(p->pBlockList);
    tMapDataClear(&p->mapData);
  }

  taosHashCleanup(pTableMap);
}

259
// Returns true when the window contains no timestamp at all, i.e. its start key is after its end key.
// pWindow must not be NULL.
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) {
  ASSERT(pWindow != NULL);
  return pWindow->skey > pWindow->ekey;
}
H
Hongze Cheng 已提交
263

264 265 266
// Clamp the query time window according to the data retention (keep) configuration, so that expired
// data is never returned to the client even if the query asks for it.
//
// pTsdb:   provides keepCfg (precision and keep2).
// pWindow: the requested window; it is not modified.
//
// Returns a copy of *pWindow whose skey is raised to the earliest non-expired timestamp when needed.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
  STsdbKeepCfg* pCfg = &pTsdb->keepCfg;

  int64_t now = taosGetTimestamp(pCfg->precision);
  int64_t earliestTs = now - (tsTickPerMin[pCfg->precision] * pCfg->keep2) + 1;  // needs to add one tick

  STimeWindow win = *pWindow;
  if (win.skey < earliestTs) {
    win.skey = earliestTs;
  }

  return win;
}
H
Hongze Cheng 已提交
279

H
Haojun Liao 已提交
280
// Shrink *capacity (rows per output block) so that one output SSDataBlock stays below 2MB.
//
// pCond:    query condition; the byte widths of its colList entries are summed to get the row length.
// capacity: in/out row capacity; only ever reduced.
static void limitOutputBufferSize(const SQueryTableDataCond* pCond, int32_t* capacity) {
  // 64-bit arithmetic: capacity * rowLen may exceed the int32_t range for wide rows
  int64_t rowLen = 0;
  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    rowLen += pCond->colList[i].bytes;
  }

  // make sure the output SSDataBlock size be less than 2MB.
  const int64_t TWOMB = 2 * 1024 * 1024;
  if (rowLen > 0 && (int64_t)(*capacity) * rowLen > TWOMB) {
    (*capacity) = (int32_t)(TWOMB / rowLen);
  }
}

// init file iterator
H
Hongze Cheng 已提交
294 295
// Initialise the file-set iterator over aDFileSet for the given traversal order.
// The cursor starts one step before the first element to visit (-1 ascending, size descending), so the
// first call to filesetIteratorNext() lands on it. Always returns TSDB_CODE_SUCCESS.
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet, int32_t order, const char* idstr) {
  int32_t numOfFileset = (int32_t)taosArrayGetSize(aDFileSet);

  pIter->index = ASCENDING_TRAVERSE(order) ? -1 : numOfFileset;
  pIter->order = order;
  pIter->pFileList = aDFileSet;
  pIter->numOfFiles = numOfFileset;

  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, idstr);
  return TSDB_CODE_SUCCESS;
}

306
// Advance to the next file set that overlaps the reader's query window and open it.
//
// On success pReader->status.pCurrentFileset and pReader->pFileReader refer to the opened file set.
// Returns true when such a file set was opened; false when the list is exhausted, when the remaining
// file sets are all outside the query window, or when opening a file set failed. In the failure case
// terrno carries the error code so callers can tell it apart from normal exhaustion.
static bool filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader) {
  bool    asc = ASCENDING_TRAVERSE(pIter->order);
  int32_t step = asc ? 1 : -1;
  pIter->index += step;

  if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
    return false;
  }

  // check file the time range of coverage
  STimeWindow win = {0};

  while (1) {
    // only one file set is kept open at a time
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }

    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);

    int32_t code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      terrno = code;
      return false;
    }

    pReader->cost.headFileLoad += 1;

    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);

    // current file are no longer overlapped with query time window, ignore remain files
    if ((asc && win.skey > pReader->window.ekey) || (!asc && win.ekey < pReader->window.skey)) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return false;
    }

    // this file set lies entirely before the window (in traversal direction): skip it and try the next one
    if ((asc && (win.ekey < pReader->window.skey)) || ((!asc) && (win.skey > pReader->window.ekey))) {
      pIter->index += step;
      if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
        return false;
      }
      continue;
    }

    tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
              pReader->window.ekey, pReader->idStr);
    return true;
  }
}

359
// Put the data-block iterator back into its "before first block" state for a new file set.
// The block list is created on first use and merely emptied on later calls.
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order, SHashObj* pTableMap) {
  pIter->order = order;
  pIter->index = -1;
  pIter->numOfBlocks = -1;
  if (pIter->blockList == NULL) {
    pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
  } else {
    taosArrayClear(pIter->blockList);
  }
  pIter->pTableMap = pTableMap;
}

L
Liu Jicong 已提交
371
// Free the iterator's block list. The pointer is reset to the value returned by taosArrayDestroy(), as
// done elsewhere in this file, so that resetDataBlockIterator() (which tests blockList against NULL)
// never sees a dangling pointer.
static void cleanupDataBlockIterator(SDataBlockIter* pIter) {
  pIter->blockList = taosArrayDestroy(pIter->blockList);
}
H
Haojun Liao 已提交
372

H
Haojun Liao 已提交
373
// Set the initial reader progress: no table iterator yet, and start with the file stage.
static void initReaderStatus(SReaderStatus* pStatus) {
  pStatus->pTableIter = NULL;
  pStatus->loadFromFile = true;
}

378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
// Create the result block with one column per queried column and room for `capacity` rows.
//
// Returns the new block, or NULL with terrno set when allocation fails.
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pResBlock = createDataBlock();
  if (pResBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    SColumnInfoData colInfo = {{0}, 0};
    colInfo.info = pCond->colList[i];
    blockDataAppendColInfo(pResBlock, &colInfo);
  }

  int32_t code = blockDataEnsureCapacity(pResBlock, capacity);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    // the block already owns its column array (and possibly partially allocated column buffers);
    // freeing only the header would leak them
    blockDataDestroy(pResBlock);
    return NULL;
  }

  return pResBlock;
}

401 402
// Allocate and initialise a reader for the given query condition.
//
// pVnode:   vnode to read from; also selects the tsdb instance through its retention configuration.
// pCond:    query condition (time window, order, columns, ...); pCond->numOfCols must be > 0.
// ppReader: receives the new reader on success, NULL on failure.
// capacity: requested rows per output block; may be reduced to keep a block below 2MB.
// idstr:    optional id string for logging; copied.
//
// Returns 0 on success, otherwise an error code; on failure the partially built reader is closed.
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, STsdbReader** ppReader, int32_t capacity,
                                const char* idstr) {
  int32_t      code = 0;
  int8_t       level = 0;
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
  if (pReader == NULL) {
    // nothing has been built yet, so there is nothing to close
    *ppReader = NULL;
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  if (VND_IS_TSMA(pVnode)) {
    tsdbDebug("vgId:%d, tsma is selected to query", TD_VID(pVnode));
  }

  initReaderStatus(&pReader->status);

  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
  pReader->capacity = capacity;
  pReader->idStr = (idstr != NULL) ? strdup(idstr) : NULL;
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
  pReader->type = pCond->type;
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);

  ASSERT(pCond->numOfCols > 0);

  limitOutputBufferSize(pCond, &pReader->capacity);

  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
  pSup->pColAgg = taosArrayInit(4, sizeof(SColumnDataAgg));
  pSup->plist = taosMemoryCalloc(pCond->numOfCols, POINTER_BYTES);
  if (pSup->pColAgg == NULL || pSup->plist == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }

  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;

  code = tBlockDataInit(&pReader->status.fileBlockData);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

  pReader->pResBlock = createResBlock(pCond, pReader->capacity);
  if (pReader->pResBlock == NULL) {
    code = terrno;
    goto _end;
  }

  // the column-id list and string buffers are required by every later block load, so a failure is fatal
  code = setColumnIdSlotList(pReader, pReader->pResBlock);
  if (code != TSDB_CODE_SUCCESS) {
    goto _end;
  }

  *ppReader = pReader;
  return code;

_end:
  tsdbReaderClose(pReader);
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495

// void tsdbResetQueryHandleForNewTable(STsdbReader* queryHandle, SQueryTableDataCond* pCond, STableListInfo* tableList,
//                                      int32_t tWinIdx) {
//   STsdbReader* pTsdbReadHandle = queryHandle;

//   pTsdbReadHandle->order = pCond->order;
//   pTsdbReadHandle->window = pCond->twindows[tWinIdx];
//   pTsdbReadHandle->type = TSDB_QUERY_TYPE_ALL;
//   pTsdbReadHandle->cur.fid = -1;
//   pTsdbReadHandle->cur.win = TSWINDOW_INITIALIZER;
//   pTsdbReadHandle->checkFiles = true;
//   pTsdbReadHandle->activeIndex = 0;  // current active table index
//   pTsdbReadHandle->locateStart = false;
//   pTsdbReadHandle->loadExternalRow = pCond->loadExternalRows;

//   if (ASCENDING_TRAVERSE(pCond->order)) {
//     assert(pTsdbReadHandle->window.skey <= pTsdbReadHandle->window.ekey);
//   } else {
//     assert(pTsdbReadHandle->window.skey >= pTsdbReadHandle->window.ekey);
//   }

//   // allocate buffer in order to load data blocks from file
//   memset(pTsdbReadHandle->suppInfo.pstatis, 0, sizeof(SColumnDataAgg));
//   memset(pTsdbReadHandle->suppInfo.plist, 0, POINTER_BYTES);

//   tsdbInitDataBlockLoadInfo(&pTsdbReadHandle->dataBlockLoadInfo);
//   tsdbInitCompBlockLoadInfo(&pTsdbReadHandle->compBlockLoadInfo);

//   SArray* pTable = NULL;
//   //  STsdbMeta* pMeta = tsdbGetMeta(pTsdbReadHandle->pTsdb);

//   //  pTsdbReadHandle->pTableCheckInfo = destroyTableCheckInfo(pTsdbReadHandle->pTableCheckInfo);

H
Haojun Liao 已提交
496
//   pTsdbReadHandle->pTableCheckInfo = NULL;  // createDataBlockScanInfo(pTsdbReadHandle, groupList, pMeta,
H
Hongze Cheng 已提交
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519
//                                             // &pTable);
//   if (pTsdbReadHandle->pTableCheckInfo == NULL) {
//     //    tsdbReaderClose(pTsdbReadHandle);
//     terrno = TSDB_CODE_TDB_OUT_OF_MEMORY;
//   }

//   //  pTsdbReadHandle->prev = doFreeColumnInfoData(pTsdbReadHandle->prev);
//   //  pTsdbReadHandle->next = doFreeColumnInfoData(pTsdbReadHandle->next);
// }

// SArray* tsdbGetQueriedTableList(STsdbReader** pHandle) {
//   assert(pHandle != NULL);

//   STsdbReader* pTsdbReadHandle = (STsdbReader*)pHandle;

//   size_t  size = taosArrayGetSize(pTsdbReadHandle->pTableCheckInfo);
//   SArray* res = taosArrayInit(size, POINTER_BYTES);
//   return res;
// }

// static int32_t binarySearchForBlock(SBlock* pBlock, int32_t numOfBlocks, TSKEY skey, int32_t order) {
//   int32_t firstSlot = 0;
//   int32_t lastSlot = numOfBlocks - 1;
H
Hongze Cheng 已提交
520

H
Hongze Cheng 已提交
521
//   int32_t midSlot = firstSlot;
H
Hongze Cheng 已提交
522

H
Hongze Cheng 已提交
523 524 525
//   while (1) {
//     numOfBlocks = lastSlot - firstSlot + 1;
//     midSlot = (firstSlot + (numOfBlocks >> 1));
H
Hongze Cheng 已提交
526

H
Hongze Cheng 已提交
527
//     if (numOfBlocks == 1) break;
H
Hongze Cheng 已提交
528

H
Hongze Cheng 已提交
529 530 531 532 533 534 535 536 537 538 539
//     if (skey > pBlock[midSlot].maxKey.ts) {
//       if (numOfBlocks == 2) break;
//       if ((order == TSDB_ORDER_DESC) && (skey < pBlock[midSlot + 1].minKey.ts)) break;
//       firstSlot = midSlot + 1;
//     } else if (skey < pBlock[midSlot].minKey.ts) {
//       if ((order == TSDB_ORDER_ASC) && (skey > pBlock[midSlot - 1].maxKey.ts)) break;
//       lastSlot = midSlot - 1;
//     } else {
//       break;  // got the slot
//     }
//   }
H
Hongze Cheng 已提交
540

H
Hongze Cheng 已提交
541 542
//   return midSlot;
// }
H
Hongze Cheng 已提交
543

H
Haojun Liao 已提交
544
// Load the block index of the current file set and collect the entries that belong to queried tables.
//
// pReader:     reader providing the super-table uid and the table map used for filtering.
// pFileReader: opened reader of the file set.
// pIndexList:  output; receives a copy of every SBlockIdx whose table is being queried.
//
// Returns TSDB_CODE_SUCCESS, the error of tsdbReadBlockIdx(), or TSDB_CODE_OUT_OF_MEMORY.
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
  SArray* aBlockIdx = taosArrayInit(8, sizeof(SBlockIdx));
  if (aBlockIdx == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  int64_t st = taosGetTimestampUs();
  int32_t code = tsdbReadBlockIdx(pFileReader, aBlockIdx, NULL);
  if (code != TSDB_CODE_SUCCESS) {
    goto _end;
  }

  size_t num = taosArrayGetSize(aBlockIdx);
  if (num == 0) {
    // nothing to do; still release the temporary array (clearing it would leak it)
    goto _end;
  }

  int64_t et1 = taosGetTimestampUs();

  for (size_t i = 0; i < num; ++i) {
    SBlockIdx* pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);

    // suid check: skip entries of other super tables
    if (pBlockIdx->suid != pReader->suid) {
      continue;
    }

    // this block belongs to a table that is not queried.
    STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(uint64_t));
    if (pScanInfo == NULL) {
      continue;
    }

    if (pScanInfo->pBlockList == NULL) {
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(int32_t));
      if (pScanInfo->pBlockList == NULL) {
        code = TSDB_CODE_OUT_OF_MEMORY;
        goto _end;
      }
    }

    if (taosArrayPush(pIndexList, pBlockIdx) == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _end;
    }
  }

  int64_t et2 = taosGetTimestampUs();
  tsdbDebug("load block index for %d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
            (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0, pReader->idStr);

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

_end:
  taosArrayDestroy(aBlockIdx);
  return code;
}
H
Hongze Cheng 已提交
594

595 596
// Load the block info of every table in pIndexList and record, per table, the indexes of the blocks that
// overlap both the query time window and the query version range.
//
// pReader:          reader; its table map entries get mapData and pBlockList refreshed.
// pIndexList:       SBlockIdx entries produced by doLoadBlockIndex().
// numOfValidTables: output; reset to 0 here, then counts tables with at least one qualified block.
// numOfBlocks:      in/out; incremented for every qualified block (not reset here).
//
// Returns TSDB_CODE_SUCCESS, the error of tsdbReadBlock(), or TSDB_CODE_OUT_OF_MEMORY.
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, uint32_t* numOfValidTables,
                               int32_t* numOfBlocks) {
  size_t numOfTables = taosArrayGetSize(pIndexList);
  *numOfValidTables = 0;

  int64_t st = taosGetTimestampUs();
  size_t  size = 0;

  // drop the block info left over from the previous file set for every table
  STableBlockScanInfo* px = NULL;
  while (1) {
    px = taosHashIterate(pReader->status.pTableMap, px);
    if (px == NULL) {
      break;
    }

    tMapDataClear(&px->mapData);
    taosArrayClear(px->pBlockList);
  }

  for (size_t i = 0; i < numOfTables; ++i) {
    SBlockIdx* pBlockIdx = taosArrayGet(pIndexList, i);

    STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(int64_t));

    tMapDataReset(&pScanInfo->mapData);
    int32_t code = tsdbReadBlock(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      // do not iterate over block info that could not be read
      return code;
    }

    size += pScanInfo->mapData.nData;
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
      SBlock block = {0};
      tMapDataGetItemByIdx(&pScanInfo->mapData, j, &block, tGetBlock);

      // 1. time range check
      if (block.minKey.ts > pReader->window.ekey || block.maxKey.ts < pReader->window.skey) {
        continue;
      }

      // 2. version range check
      if (block.minVersion > pReader->verRange.maxVer || block.maxVersion < pReader->verRange.minVer) {
        continue;
      }

      void* p = taosArrayPush(pScanInfo->pBlockList, &j);
      if (p == NULL) {
        tMapDataClear(&pScanInfo->mapData);
        return TSDB_CODE_OUT_OF_MEMORY;
      }

      (*numOfBlocks) += 1;
    }

    if (pScanInfo->pBlockList != NULL && taosArrayGetSize(pScanInfo->pBlockList) > 0) {
      (*numOfValidTables) += 1;
    }
  }

  double el = (taosGetTimestampUs() - st) / 1000.0;
  // the counters are cast so that the arguments match the %d conversions
  tsdbDebug("load block of %d tables completed, blocks:%d in %d tables, size:%.2f Kb, elapsed time:%.2f ms %s",
            (int32_t)numOfTables, *numOfBlocks, (int32_t)(*numOfValidTables), size / 1000.0, el, pReader->idStr);

  pReader->cost.numOfBlocks += (*numOfBlocks);
  pReader->cost.headFileLoadTime += el;

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
660

661 662
// todo remove pblock parameter
// Mark the current file block as completely dumped and move lastKey one tick past the block's maxKey in
// the traversal direction (+1 ascending, -1 descending).
// NOTE(review): for descending scans the last row dumped is presumably the block's minKey, not maxKey -
// confirm whether maxKey is intended here.
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, SBlock* pBlock, int32_t order) {
  int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;

  pDumpInfo->allDumped = true;
  pDumpInfo->lastKey = pBlock->maxKey.ts + step;
}

669 670
// Append one column value to row `rowIndex` of an output column.
//
// pColInfoData: destination column.
// rowIndex:     destination row.
// colIndex:     slot of the column; selects the scratch buffer in pSup->buildBuf for var-length values.
// pColVal:      source value; isNull / isNone are both written as NULL.
// pSup:         provides the per-column scratch buffers.
static void doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
                         SBlockLoadSuppInfo* pSup) {
  if (IS_VAR_DATA_TYPE(pColVal->type)) {
    if (pColVal->isNull || pColVal->isNone) {
      colDataAppendNULL(pColInfoData, rowIndex);
    } else {
      // var-length data is first rebuilt as length header + payload in the scratch buffer, then appended
      varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
      memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
      colDataAppend(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
    }
  } else {
    colDataAppend(pColInfoData, rowIndex, (const char*)&pColVal->value, pColVal->isNull || pColVal->isNone);
  }
}

684 685 686 687 688
// Return the block-list entry at the iterator's current index.
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
  return (SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, pBlockIter->index);
}

689
// Return the SBlock cached inside the iterator for the current position.
static SBlock* getCurrentBlock(SDataBlockIter* pBlockIter) {
  SBlock* pCurrent = &pBlockIter->block;
  return pCurrent;
}
690

691
// Copy rows of the loaded file block (status.fileBlockData) into the reader's result block, starting at
// fBlockDumpInfo.rowIndex and walking in the scan direction, at most pReader->capacity rows.
// Result columns that are absent from the file block are filled with NULL.
//
// pBlockScanInfo is not used by the body. Always returns TSDB_CODE_SUCCESS.
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pBlockIter = &pStatus->blockIter;

  SBlockData*         pBlockData = &pStatus->fileBlockData;
  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
  SBlock*             pBlock = getCurrentBlock(pBlockIter);
  SSDataBlock*        pResBlock = pReader->pResBlock;
  int32_t             numOfOutputCols = blockDataGetNumOfCols(pResBlock);

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  SColVal cv = {0};
  int64_t st = taosGetTimestampUs();
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;

  // number of rows to copy in this call: what is left in scan direction, capped by the output capacity
  int32_t start = pDumpInfo->rowIndex;
  int32_t remain = asc ? (pBlockData->nRow - start) : (start + 1);
  if (remain > pReader->capacity) {
    remain = pReader->capacity;
  }

  // The copy loops are count-based (k < remain). A bound of the form "j < endIndex" cannot express the
  // descending case when only part of the block fits, since endIndex would lie below the start row.
  int32_t          rowIndex = 0;
  int32_t          i = 0;
  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    for (int32_t k = 0, j = start; k < remain && j >= 0 && j < pBlockData->nRow; ++k, j += step) {
      colDataAppend(pColData, rowIndex++, (const char*)&pBlockData->aTSKEY[j], false);
    }
    i += 1;
  }

  // walk the output columns and the file block columns in step
  int32_t colIndex = 0;
  int32_t num = taosArrayGetSize(pBlockData->aIdx);
  while (i < numOfOutputCols && colIndex < num) {
    rowIndex = 0;
    pColData = taosArrayGet(pResBlock->pDataBlock, i);

    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);

    if (pData->cid == pColData->info.colId) {
      for (int32_t k = 0, j = start; k < remain && j >= 0 && j < pBlockData->nRow; ++k, j += step) {
        tColDataGetValue(pData, j, &cv);
        doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
      }
      colIndex += 1;
      ASSERT(rowIndex == remain);
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNNULL(pColData, 0, remain);
    }

    i += 1;
  }

  // remaining output columns have no counterpart in the file block
  while (i < numOfOutputCols) {
    pColData = taosArrayGet(pResBlock->pDataBlock, i);
    colDataAppendNNULL(pColData, 0, remain);
    i += 1;
  }

  pResBlock->info.rows = remain;
  pDumpInfo->rowIndex += step * remain;

  // NOTE(review): the block is marked as fully dumped even when only `capacity` rows were copied -
  // confirm that callers do not expect to resume the same block.
  setBlockAllDumped(pDumpInfo, pBlock, pReader->order);

  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
  pReader->cost.blockLoadTime += elapsedTime;

  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
  tsdbDebug("%p copy file block to sdatablock, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, remain, unDumpedRows,
            pBlock->minVersion, pBlock->maxVersion, elapsedTime, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}

774 775
/**
 * Read the column data of the block the iterator currently points at into pBlockData.
 *
 * Only the columns listed in pReader->suppInfo.colIds are requested; the number of
 * columns is taken from the reader's result block. On success the dump status is reset
 * (allDumped = false) and the elapsed time is added to pReader->cost.blockLoadTime.
 *
 * @return TSDB_CODE_SUCCESS, or the error code reported by tsdbReadColData().
 */
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter,
                                   STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
  int64_t startUs = taosGetTimestampUs();

  SFileDataBlockInfo* pInfo = getCurrentBlockInfo(pBlockIter);
  SBlock*             pCurBlock = getCurrentBlock(pBlockIter);
  int32_t             numOfCols = blockDataGetNumOfCols(pReader->pResBlock);

  SBlockIdx idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
  int32_t   code = tsdbReadColData(pReader->pFileReader, &idx, pCurBlock, pReader->suppInfo.colIds, numOfCols,
                                   pBlockData, NULL, NULL);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
              ", rows:%d, %s",
              pReader, pBlockIter->index, pInfo->tbBlockIdx, pCurBlock->minKey.ts, pCurBlock->maxKey.ts,
              pCurBlock->nRow, pReader->idStr);
    return code;
  }

  // the time is reported in milliseconds
  double elapsedTime = (taosGetTimestampUs() - startUs) / 1000.0;
  pReader->cost.blockLoadTime += elapsedTime;

  // a freshly loaded block has not been handed out yet
  pReader->status.fBlockDumpInfo.allDumped = false;

  tsdbDebug("%p load file block into buffer, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
            pReader, pBlockIter->index, pInfo->tbBlockIdx, pCurBlock->minKey.ts, pCurBlock->maxKey.ts,
            pCurBlock->nRow, pCurBlock->minVersion, pCurBlock->maxVersion, elapsedTime, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
812

H
Haojun Liao 已提交
813 814 815
/**
 * Release every buffer owned by a block order supporter: the two per-table counters,
 * the wrapper array of each of the first pSup->numOfTables tables, and finally the
 * array of wrapper pointers itself.
 */
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);

  int32_t tableIdx = 0;
  while (tableIdx < pSup->numOfTables) {
    SBlockOrderWrapper* pWrappers = pSup->pDataBlockInfo[tableIdx];
    taosMemoryFreeClear(pWrappers);
    tableIdx += 1;
  }

  // the per-table buffers are gone, now drop the pointer array that referenced them
  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
824

H
Haojun Liao 已提交
825 826
/**
 * Allocate the zero-filled per-table arrays of a block order supporter.
 *
 * @param pSup         supporter to initialize
 * @param numOfTables  number of slots to allocate, must be >= 1
 * @return TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY if any allocation failed
 *         (buffers that were obtained are released before returning).
 */
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  ASSERT(numOfTables >= 1);

  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);

  bool allAllocated =
      (pSup->numOfBlocksPerTable != NULL) && (pSup->indexPerTable != NULL) && (pSup->pDataBlockInfo != NULL);
  if (allAllocated) {
    return TSDB_CODE_SUCCESS;
  }

  cleanupBlockOrderSupporter(pSup);
  return TSDB_CODE_OUT_OF_MEMORY;
}
H
Hongze Cheng 已提交
839

H
Haojun Liao 已提交
840
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
841
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
842
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
843

H
Haojun Liao 已提交
844
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
845

H
Haojun Liao 已提交
846 847
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
848

H
Haojun Liao 已提交
849 850 851 852 853 854 855
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
856

857
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
858
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
859

860 861 862 863
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

/**
 * Decode the block the iterator currently points at into pBlockIter->block.
 *
 * The owning table is looked up by uid in pBlockIter->pTableMap; the table's pBlockList
 * maps the per-table block index to the item index inside the table's mapData.
 *
 * @return always TSDB_CODE_SUCCESS
 */
static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter) {
  SFileDataBlockInfo*  pInfo = getCurrentBlockInfo(pBlockIter);
  STableBlockScanInfo* pTable = taosHashGet(pBlockIter->pTableMap, &pInfo->uid, sizeof(pInfo->uid));

  int32_t* pMapIdx = taosArrayGet(pTable->pBlockList, pInfo->tbBlockIdx);
  tMapDataGetItemByIdx(&pTable->mapData, *pMapIdx, &pBlockIter->block, tGetBlock);

#if 0
  qDebug("check file block, table uid:%"PRIu64" index:%d offset:%"PRId64", ", pTable->uid, *pMapIdx, pBlockIter->block.aSubBlock[0].offset);
#endif

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
876

H
Haojun Liao 已提交
877
/**
 * (Re)build pBlockIter->blockList, the global access order of the file blocks of all
 * tables in pReader->status.pTableMap, and position the iterator on the first block to
 * visit (index 0 for ascending scans, the last index for descending scans).
 *
 * With a single qualified table the blocks keep their per-table order. With several
 * tables the per-table lists are merged by the file offset of each block's first
 * sub-block, using a multiway merge tree driven by fileDataBlockOrderCompar().
 *
 * @param pReader      reader whose table map and scan order are used
 * @param pBlockIter   iterator to fill
 * @param numOfBlocks  total number of blocks over all tables; must match the table map
 * @return TSDB_CODE_SUCCESS, the error of initBlockOrderSupporter(), or
 *         TSDB_CODE_TDB_OUT_OF_MEMORY when a buffer or the merge tree cannot be created.
 */
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks) {
  bool asc = ASCENDING_TRAVERSE(pReader->order);

  pBlockIter->numOfBlocks = numOfBlocks;
  taosArrayClear(pBlockIter->blockList);

  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = (int32_t)taosHashGetSize(pReader->status.pTableMap);

  int64_t st = taosGetTimestampUs();

  SBlockOrderSupporter sup = {0};
  int32_t              code = initBlockOrderSupporter(&sup, numOfTables);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // step 1: collect <uid, offset> of every block, one wrapper array per non-empty table
  int32_t cnt = 0;
  void*   ptr = NULL;
  while (1) {
    ptr = taosHashIterate(pReader->status.pTableMap, ptr);
    if (ptr == NULL) {
      break;
    }

    STableBlockScanInfo* pTableScanInfo = (STableBlockScanInfo*)ptr;
    if (pTableScanInfo->pBlockList == NULL || taosArrayGetSize(pTableScanInfo->pBlockList) == 0) {
      continue;
    }

    int32_t num = (int32_t)taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = num;

    SBlockOrderWrapper* pWrappers = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * (size_t)num);
    if (pWrappers == NULL) {
      // NOTE(review): the hash iteration is abandoned here without any explicit cancel call;
      // confirm whether the hash API requires one.
      cleanupBlockOrderSupporter(&sup);
      return TSDB_CODE_TDB_OUT_OF_MEMORY;
    }

    sup.pDataBlockInfo[sup.numOfTables] = pWrappers;

    SBlock block = {0};
    for (int32_t k = 0; k < num; ++k) {
      SBlockOrderWrapper wrapper = {0};

      int32_t* mapDataIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
      tMapDataGetItemByIdx(&pTableScanInfo->mapData, *mapDataIndex, &block, tGetBlock);

      wrapper.uid = pTableScanInfo->uid;
      wrapper.offset = block.aSubBlock[0].offset;

      pWrappers[k] = wrapper;
      cnt++;
    }

    sup.numOfTables += 1;
  }

  ASSERT(numOfBlocks == cnt);

  // since there is only one table qualified, blocks are not sorted
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      taosArrayPush(pBlockIter->blockList, &blockInfo);
    }

    int64_t et = taosGetTimestampUs();
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
              pReader, cnt, (et - st) / 1000.0, pReader->idStr);

    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
    cleanupBlockOrderSupporter(&sup);
    doSetCurrentBlock(pBlockIter);
    return TSDB_CODE_SUCCESS;
  }

  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);

  ASSERT(cnt <= numOfBlocks && sup.numOfTables <= numOfTables);

  // step 2: merge the per-table lists by file offset.
  // The status is kept in a full-width integer so that no bit of an error code is lost.
  SMultiwayMergeTreeInfo* pTree = NULL;
  int32_t                 ret = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
  if (ret != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_TDB_OUT_OF_MEMORY;
  }

  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;

    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    taosArrayPush(pBlockIter->blockList, &blockInfo);

    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }

    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
  }

  int64_t et = taosGetTimestampUs();
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, cnt, (et - st) / 1000.0,
            pReader->idStr);
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);

  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
  doSetCurrentBlock(pBlockIter);

  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
993

H
Haojun Liao 已提交
994
/**
 * Move the iterator one block forward in its scan direction and decode that block.
 *
 * @return false (iterator untouched) when the iterator already stands on the last block
 *         of its direction, true otherwise.
 */
static bool blockIteratorNext(SDataBlockIter* pBlockIter) {
  if (ASCENDING_TRAVERSE(pBlockIter->order)) {
    if (pBlockIter->index >= pBlockIter->numOfBlocks - 1) {
      return false;
    }
    pBlockIter->index += 1;
  } else {
    if (pBlockIter->index <= 0) {
      return false;
    }
    pBlockIter->index -= 1;
  }

  doSetCurrentBlock(pBlockIter);
  return true;
}

1008 1009 1010
/**
 * This is an two rectangles overlap cases.
 */
1011
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SBlock* pBlock) {
1012 1013 1014 1015
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
         (pVerRange->minVer > pBlock->minVersion && pVerRange->minVer <= pBlock->maxVersion) ||
         (pVerRange->maxVer < pBlock->maxVersion && pVerRange->maxVer >= pBlock->minVersion);
H
Haojun Liao 已提交
1016
}
H
Hongze Cheng 已提交
1017

1018 1019
/**
 * Decode the block that follows pFBlockInfo in the block list of the same table, in the
 * given scan direction (next block for ascending order, previous block for descending).
 *
 * @param pFBlockInfo          current block; its tbBlockIdx is the position in the table's block list
 * @param pTableBlockScanInfo  scan info of the owning table
 * @param nextIndex            out: position of the neighbor in the table's block list;
 *                             left untouched when there is no neighbor
 * @param order                scan order
 * @return a heap-allocated SBlock the caller must release with taosMemoryFree(), or NULL
 *         when the current block is the last one in scan direction, the block list is
 *         empty, or the allocation failed.
 */
static SBlock* getNeighborBlockOfSameTable(SFileDataBlockInfo* pFBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                           int32_t* nextIndex, int32_t order) {
  bool   asc = ASCENDING_TRAVERSE(order);
  size_t numOfBlocks = taosArrayGetSize(pTableBlockScanInfo->pBlockList);

  // an empty list is checked on its own: "numOfBlocks - 1" would wrap around for an unsigned 0
  if (asc && (numOfBlocks == 0 || pFBlockInfo->tbBlockIdx >= numOfBlocks - 1)) {
    return NULL;
  }

  if (!asc && pFBlockInfo->tbBlockIdx == 0) {
    return NULL;
  }

  int32_t step = asc ? 1 : -1;
  *nextIndex = pFBlockInfo->tbBlockIdx + step;

  SBlock* pBlock = taosMemoryCalloc(1, sizeof(SBlock));
  if (pBlock == NULL) {
    // nothing to decode into; report it the same way as "no neighbor"
    return NULL;
  }

  int32_t* indexInMapdata = taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex);

  tMapDataGetItemByIdx(&pTableBlockScanInfo->mapData, *indexInMapdata, pBlock, tGetBlock);
  return pBlock;
}

/**
 * Search the iterator's block list, starting at the current position and walking in scan
 * direction, for the entry with the same uid and per-table block index as pFBlockInfo.
 *
 * @return position of the entry in pBlockIter->blockList; the entry is expected to
 *         exist (the function asserts and returns -1 otherwise).
 */
static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
  ASSERT(pBlockIter != NULL && pFBlockInfo != NULL);

  int32_t step = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;

  for (int32_t pos = pBlockIter->index; pos >= 0 && pos < pBlockIter->numOfBlocks; pos += step) {
    SFileDataBlockInfo* pCandidate = taosArrayGet(pBlockIter->blockList, pos);

    bool sameTable = (pCandidate->uid == pFBlockInfo->uid);
    if (sameTable && pCandidate->tbBlockIdx == pFBlockInfo->tbBlockIdx) {
      return pos;
    }
  }

  ASSERT(0);
  return -1;
}

1058
/**
 * Advance the iterator by 'step' and make the block stored at position 'index' the
 * current one. If that block is not already at the new iterator position it is removed
 * from its old slot and re-inserted at the new position.
 *
 * @return -1 when 'index' is outside [0, numOfBlocks), TSDB_CODE_SUCCESS otherwise.
 */
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
  bool inRange = (index >= 0) && (index < pBlockIter->numOfBlocks);
  if (!inRange) {
    return -1;
  }

  // keep a copy, the slot is removed from the list below
  SFileDataBlockInfo moved = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);
  pBlockIter->index += step;

  int32_t target = pBlockIter->index;
  if (target != index) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, target, &moved);

    SFileDataBlockInfo* pPlaced = taosArrayGet(pBlockIter->blockList, target);
    ASSERT(pPlaced->uid == moved.uid && pPlaced->tbBlockIdx == moved.tbBlockIdx);
  }

  doSetCurrentBlock(pBlockIter);
  return TSDB_CODE_SUCCESS;
}

// Two consecutive blocks of a table overlap when they share the boundary timestamp:
// for an ascending scan the current block's max key equals the neighbor's min key,
// for a descending scan the current block's min key equals the neighbor's max key.
static bool overlapWithNeighborBlock(SBlock* pBlock, SBlock* pNeighbor, int32_t order) {
  return ASCENDING_TRAVERSE(order) ? (pBlock->maxKey.ts == pNeighbor->minKey.ts)
                                   : (pBlock->minKey.ts == pNeighbor->maxKey.ts);
}
H
Hongze Cheng 已提交
1086

1087
// Tell whether a buffered key has to be returned before the given file block: the key is
// set (not TSKEY_INITIAL_VAL) and lies at or before the block's min key for an ascending
// scan, or at or after the block's max key for a descending scan.
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SBlock* pBlock) {
  if (key.ts == TSKEY_INITIAL_VAL) {
    return false;
  }

  return ASCENDING_TRAVERSE(order) ? (key.ts <= pBlock->minKey.ts) : (key.ts >= pBlock->maxKey.ts);
}
H
Hongze Cheng 已提交
1093

H
Haojun Liao 已提交
1094
// A key overlaps a file block when its timestamp lies inside the block's key range and
// the block's version range intersects the requested version range.
static bool keyOverlapFileBlock(TSDBKEY key, SBlock* pBlock, SVersionRange* pVerRange) {
  if (key.ts < pBlock->minKey.ts || key.ts > pBlock->maxKey.ts) {
    return false;
  }

  if (pBlock->maxVersion < pVerRange->minVer) {
    return false;
  }

  return pBlock->minVersion <= pVerRange->maxVer;
}

1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132
// Walk the delete skyline of a table forward from 'startIndex' and report whether any
// delete point can affect rows of pBlock:
//  - a point inside the block's key range counts when its version is not older than the
//    block's min version;
//  - a point before the block counts when its version is not older than the block's min
//    version and the following point reaches into the block (the last skyline point only
//    needs to reach the block's min key, any other one also needs a version that is not
//    older than the block's min version);
//  - the first point beyond the block's max key ends the search.
static bool doCheckforDatablockOverlapFrom(STableBlockScanInfo* pBlockScanInfo, const SBlock* pBlock,
                                           int32_t startIndex) {
  size_t num = taosArrayGetSize(pBlockScanInfo->delSkyline);

  for (int32_t i = startIndex; i < num; i += 1) {
    TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, i);
    if (p->ts >= pBlock->minKey.ts && p->ts <= pBlock->maxKey.ts) {
      if (p->version >= pBlock->minVersion) {
        return true;
      }
    } else if (p->ts < pBlock->minKey.ts) {  // p->ts < pBlock->minKey.ts
      if (p->version >= pBlock->minVersion) {
        if (i < num - 1) {
          TSDBKEY* pnext = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
          if (i + 1 == num - 1) {  // pnext is the last point
            if (pnext->ts >= pBlock->minKey.ts) {
              return true;
            }
          } else {
            if (pnext->ts >= pBlock->minKey.ts && pnext->version >= pBlock->minVersion) {
              return true;
            }
          }
        } else {  // it must be the last point
          ASSERT(p->version == 0);
        }
      }
    } else {  // (p->ts > pBlock->maxKey.ts) {
      return false;
    }
  }

  return false;
}

// Same check, started at the table's current position in the skyline (fileDelIndex).
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SBlock* pBlock) {
  return doCheckforDatablockOverlapFrom(pBlockScanInfo, pBlock, pBlockScanInfo->fileDelIndex);
}

/**
 * Tell whether the delete skyline of a table may remove rows of pBlock.
 *
 * @return false when the table has no (or an empty) skyline or the block's key range lies
 *         entirely outside the skyline; otherwise the result of the point-by-point check.
 */
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SBlock* pBlock, int32_t order) {
  if (pBlockScanInfo->delSkyline == NULL || taosArrayGetSize(pBlockScanInfo->delSkyline) == 0) {
    return false;
  }

  // ts is not overlap
  TSDBKEY* pFirst = taosArrayGet(pBlockScanInfo->delSkyline, 0);
  TSDBKEY* pLast = taosArrayGetLast(pBlockScanInfo->delSkyline);
  if (pBlock->minKey.ts > pLast->ts || pBlock->maxKey.ts < pFirst->ts) {
    return false;
  }

  // version is not overlap
  if (ASCENDING_TRAVERSE(order)) {
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock);
  }

  // Descending scan: step back from the current skyline position to the first point that
  // is not greater than the block's min key, and start the forward check from THERE.
  // Starting at fileDelIndex itself could hit a point beyond the block's max key first
  // and end the check before the points covering the block are looked at.
  int32_t index = pBlockScanInfo->fileDelIndex;
  while (1) {
    TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, index);
    if (p->ts > pBlock->minKey.ts && index > 0) {
      index -= 1;
    } else {  // find the first point that is smaller than the minKey.ts of dataBlock.
      break;
    }
  }

  return doCheckforDatablockOverlapFrom(pBlockScanInfo, pBlock, index);
}

1163 1164 1165 1166
// 1. the version of all rows should be less than the endVersion
// 2. current block should not overlap with next neighbor block
// 3. current timestamp should not be overlap with each other
// 4. output buffer should be large enough to hold all rows in current block
1167
// 5. delete info should not overlap with current block data
1168 1169
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pFBlock, SBlock* pBlock,
                                STableBlockScanInfo* pScanInfo, TSDBKEY key) {
1170 1171 1172
  int32_t neighborIndex = 0;
  SBlock* pNeighbor = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &neighborIndex, pReader->order);

1173
  // overlap with neighbor
1174 1175 1176
  bool overlapWithNeighbor = false;
  if (pNeighbor) {
    overlapWithNeighbor = overlapWithNeighborBlock(pBlock, pNeighbor, pReader->order);
1177
    taosMemoryFree(pNeighbor);
1178 1179
  }

1180
  // has duplicated ts of different version in this block
L
Liu Jicong 已提交
1181 1182
  bool hasDup = (pBlock->nSubBlock == 1) ? pBlock->hasDup : true;
  bool overlapWithDel = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);
1183

1184
  return (overlapWithNeighbor || hasDup || dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock) ||
1185
          keyOverlapFileBlock(key, pBlock, &pReader->verRange) || (pBlock->nRow > pReader->capacity) || overlapWithDel);
H
Haojun Liao 已提交
1186 1187
}

1188
/**
 * Fill the reader's result block with rows taken from the in-memory buffers (mem/imem)
 * of one table, up to endKey and at most pReader->capacity rows.
 *
 * Does nothing when neither buffer iterator has a value. Otherwise the result block's
 * time window and uid are updated, the block is flagged as composed, and the elapsed
 * time is added to pReader->cost.buildmemBlock.
 *
 * @return TSDB_CODE_SUCCESS when there is nothing to read, else the code returned by
 *         buildDataBlockFromBufImpl().
 */
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
  bool hasBufferedRows = pBlockScanInfo->iiter.hasVal || pBlockScanInfo->iter.hasVal;
  if (!hasBufferedRows) {
    return TSDB_CODE_SUCCESS;
  }

  SSDataBlock* pOut = pReader->pResBlock;

  int64_t startUs = taosGetTimestampUs();
  int32_t code = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->capacity, pReader);

  blockDataUpdateTsWindow(pOut, 0);
  pOut->info.uid = pBlockScanInfo->uid;

  setComposedBlockFlag(pReader, true);

  double elapsedTime = (taosGetTimestampUs() - startUs) / 1000.0;
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%d, brange:%" PRId64
            " - %" PRId64 " %s",
            pReader, elapsedTime, pOut->info.rows, pOut->info.window.skey, pOut->info.window.ekey, pReader->idStr);

  pReader->cost.buildmemBlock += elapsedTime;
  return code;
}

1213
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
H
Haojun Liao 已提交
1214
                                     SIterInfo* pIter, int64_t key) {
1215
  SRowMerger          merge = {0};
H
Haojun Liao 已提交
1216
  STSRow*             pTSRow = NULL;
1217 1218 1219 1220
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  TSDBKEY k = TSDBROW_KEY(pRow);
1221
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
1222
  SArray* pDelList = pBlockScanInfo->delSkyline;
1223

1224 1225 1226 1227 1228 1229 1230 1231
  // ascending order traverse
  if (ASCENDING_TRAVERSE(pReader->order)) {
    if (key < k.ts) {
      tRowMergerInit(&merge, &fRow, pReader->pSchema);

      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      tRowMergerGetRow(&merge, &pTSRow);
    } else if (k.ts < key) {  // k.ts < key
1232
      doMergeMultiRows(pRow, pBlockScanInfo->uid, pIter, pDelList, &pTSRow, pReader);
1233 1234 1235
    } else {  // k.ts == key, ascending order: file block ----> imem rows -----> mem rows
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1236 1237

      tRowMerge(&merge, pRow);
1238
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1239 1240

      tRowMergerGetRow(&merge, &pTSRow);
1241
    }
1242 1243
  } else {  // descending order scan
    if (key < k.ts) {
1244
      doMergeMultiRows(pRow, pBlockScanInfo->uid, pIter, pDelList, &pTSRow, pReader);
1245 1246
    } else if (k.ts < key) {
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
1247

1248 1249 1250 1251 1252 1253
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      tRowMergerGetRow(&merge, &pTSRow);
    } else {  // descending order: mem rows -----> imem rows ------> file block
      updateSchema(pRow, pBlockScanInfo->uid, pReader);

      tRowMergerInit(&merge, pRow, pReader->pSchema);
1254
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1255 1256 1257 1258 1259 1260

      tRowMerge(&merge, &fRow);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);

      tRowMergerGetRow(&merge, &pTSRow);
    }
1261 1262
  }

1263
  tRowMergerClear(&merge);
1264
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
H
Haojun Liao 已提交
1265 1266

  taosMemoryFree(pTSRow);
1267 1268 1269
  return TSDB_CODE_SUCCESS;
}

1270 1271 1272 1273
/**
 * Produce one output row from the three sources of a table - the current file block row
 * (timestamp 'key'), the mem row (k) and the imem row (ik) - and append it to the
 * reader's result block. Both buffer iterators must have a valid row.
 *
 * The source(s) that come first in scan direction are emitted; sources that carry the
 * same timestamp are merged into one row. The row handed to the result block and the
 * merger are released before returning.
 *
 * @return always TSDB_CODE_SUCCESS
 */
static int32_t doMergeThreeLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
  SRowMerger merge = {0};
  STSRow*    pTSRow = NULL;

  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SArray*             pDelList = pBlockScanInfo->delSkyline;

  TSDBROW* pRow = getValidRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidRow(&pBlockScanInfo->iiter, pDelList, pReader);
  ASSERT(pRow != NULL && piRow != NULL);

  int64_t  key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  uint64_t uid = pBlockScanInfo->uid;

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

  if (ASCENDING_TRAVERSE(pReader->order)) {
    if (key <= k.ts && key <= ik.ts) {
      // [1&2] key <= [k.ts && ik.ts]: the file row is first, merge in whatever shares its timestamp
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMergerInit(&merge, &fRow, pReader->pSchema);

      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);

      if (ik.ts == key) {
        tRowMerge(&merge, piRow);
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
      }

      if (k.ts == key) {
        tRowMerge(&merge, pRow);
        doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
      }

      tRowMergerGetRow(&merge, &pTSRow);
    } else if (ik.ts < k.ts) {
      // key > ik.ts || key > k.ts from here on; key == ik.ts is legal (with k.ts < key)
      // [3] ik.ts < key <= k.ts
      // [4] ik.ts < k.ts <= key
      doMergeMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader);
    } else if (k.ts < ik.ts) {
      // [5] k.ts < key   <= ik.ts
      // [6] k.ts < ik.ts <= key
      doMergeMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, &pTSRow, pReader);
    } else {
      // [7] k.ts == ik.ts < key
      ASSERT(key > ik.ts && key > k.ts);

      doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, &pTSRow);
    }
  } else {  // descending order scan
    if (k.ts >= ik.ts && k.ts >= key) {
      // [1/2] k.ts >= ik.ts && k.ts >= key: the mem row is first
      updateSchema(pRow, uid, pReader);

      tRowMergerInit(&merge, pRow, pReader->pSchema);
      // NOTE(review): 'key' (the file row timestamp) is passed here although k.ts may be
      // greater than key on this path - confirm against doMergeRowsInBuf().
      doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);

      if (ik.ts == k.ts) {
        tRowMerge(&merge, piRow);
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
      }

      if (k.ts == key) {
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMerge(&merge, &fRow);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      }

      tRowMergerGetRow(&merge, &pTSRow);
    } else if (ik.ts > key) {
      // k.ts < ik.ts || k.ts < key from here on; k.ts == ik.ts is legal (with k.ts < key)
      // [3] ik.ts > k.ts >= Key
      // [4] ik.ts > key >= k.ts
      doMergeMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader);
    } else if (key > ik.ts) {
      // [5] key > ik.ts > k.ts
      // [6] key > k.ts > ik.ts (also key > k.ts == ik.ts): the file row is first
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMergerInit(&merge, &fRow, pReader->pSchema);

      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      tRowMergerGetRow(&merge, &pTSRow);
    } else {
      // [7] key = ik.ts > k.ts
      doMergeMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader);
      STSRow* pBufRow = pTSRow;

      // NOTE(review): 'merge' has not been through tRowMergerInit() on this path - confirm
      // that tRowMerge() on a zeroed merger is intended.
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMerge(&merge, &fRow);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      tRowMergerGetRow(&merge, &pTSRow);

      // the row built from the buffer is dropped once the merger produced its replacement
      if (pBufRow != pTSRow) {
        taosMemoryFree(pBufRow);
      }
    }
  }

  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);

  // the result block took what it needs; release the temporary row and the merger
  taosMemoryFree(pTSRow);
  tRowMergerClear(&merge);
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
1401 1402
/**
 * Check whether the file block row at pDumpInfo->rowIndex may be returned: its version
 * must lie inside the reader's version range, its timestamp inside the reader's time
 * window, and hasBeenDropped() must not report it as deleted by the table's skyline.
 */
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  int64_t rowVer = pBlockData->aVersion[pDumpInfo->rowIndex];
  bool    verInRange = (rowVer >= pReader->verRange.minVer) && (rowVer <= pReader->verRange.maxVer);
  if (!verInRange) {
    return false;
  }

  int64_t rowTs = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  bool    tsInWindow = (rowTs >= pReader->window.skey) && (rowTs <= pReader->window.ekey);
  if (!tsInWindow) {
    return false;
  }

  TSDBKEY rowKey = {.ts = rowTs, .version = rowVer};
  return !hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, &rowKey, pReader->order);
}

1422
// true when ts lies outside the closed interval [pWindow->skey, pWindow->ekey]
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) {
  bool inside = (ts >= pWindow->skey) && (ts <= pWindow->ekey);
  return !inside;
}
1423

1424 1425 1426 1427 1428
/**
 * Emit one output row for the current file block row, merging it with the in-memory
 * buffers of the table when they hold data:
 *  - mem and imem both have a row: three-way merge
 *  - only imem, or only mem, has a row: two-way merge with that buffer
 *  - neither has a row: the file block alone is used
 *
 * In the file-only case a row whose direct successor (in scan direction) has a different
 * timestamp is copied straight from the block and the row index is advanced here;
 * otherwise the rows are run through the row merger.
 *
 * @return the result of the merge routine that was used, TSDB_CODE_SUCCESS for the
 *         file-only paths.
 */
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;

  int64_t  key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  TSDBROW* pRow = getValidRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);

  bool hasMem = pBlockScanInfo->iter.hasVal;
  bool hasIMem = pBlockScanInfo->iiter.hasVal;

  // mem + imem + file
  if (hasMem && hasIMem) {
    return doMergeThreeLevelRows(pReader, pBlockScanInfo);
  }

  // imem + file
  if (hasIMem) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, key);
  }

  // mem + file
  if (hasMem) {
    return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, key);
  }

  // imem & mem are all empty, only file exist

  // fast path, taken when
  // 1. the row is not the border row of the block in scan direction
  // 2. the direct next row does not carry the same timestamp
  bool asc = (pReader->order == TSDB_ORDER_ASC);
  bool desc = (pReader->order == TSDB_ORDER_DESC);
  bool hasSuccessor = (asc && pDumpInfo->rowIndex < pDumpInfo->totalRows - 1) || (desc && pDumpInfo->rowIndex > 0);
  if (hasSuccessor) {
    int32_t step = asc ? 1 : -1;
    int64_t nextKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
    if (nextKey != key) {  // merge is not needed
      doAppendRowFromBlock(pReader->pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
      pDumpInfo->rowIndex += step;
      return TSDB_CODE_SUCCESS;
    }
  }

  // slow path: fold all file rows of this timestamp into one row
  TSDBROW    fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
  STSRow*    pMerged = NULL;
  SRowMerger merger = {0};

  tRowMergerInit(&merger, &fRow, pReader->pSchema);
  doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merger);
  tRowMergerGetRow(&merger, &pMerged);
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pMerged);

  taosMemoryFree(pMerged);
  tRowMergerClear(&merger);
  return TSDB_CODE_SUCCESS;
}

1477
// Fill pReader->pResBlock with rows composed from the currently loaded file block and the in-memory buffers of
// the table described by pBlockScanInfo.
//
// Rows are produced one timestamp at a time by buildComposedDataBlockImpl() until either the file block has been
// walked completely (in scan direction) or the result block holds pReader->capacity rows. The result block is
// then stamped with the table uid and its time window, and the reader is flagged as returning a composed block.
//
// Returns TSDB_CODE_SUCCESS, or the first error reported by buildComposedDataBlockImpl().
static int32_t buildComposedDataBlock(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
  SSDataBlock* pResBlock = pReader->pResBlock;

  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  int32_t             step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;

  int64_t st = taosGetTimestampUs();

  while (1) {
    // todo: check the validity of the row in the file block
    if (!isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
      // skip the row; stop once the cursor leaves the block on either side
      pDumpInfo->rowIndex += step;

      SBlock* pSkipBlock = getCurrentBlock(&pReader->status.blockIter);
      if (pDumpInfo->rowIndex >= pSkipBlock->nRow || pDumpInfo->rowIndex < 0) {
        setBlockAllDumped(pDumpInfo, pSkipBlock, pReader->order);
        break;
      }

      continue;
    }

    // a failed merge must not be reported as a successfully built block
    int32_t code = buildComposedDataBlockImpl(pReader, pBlockScanInfo);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    SBlock* pBlock = getCurrentBlock(&pReader->status.blockIter);

    // currently loaded file data block is consumed
    if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
      setBlockAllDumped(pDumpInfo, pBlock, pReader->order);
      break;
    }

    if (pResBlock->info.rows >= pReader->capacity) {
      break;
    }
  }

  pResBlock->info.uid = pBlockScanInfo->uid;
  blockDataUpdateTsWindow(pResBlock, 0);

  setComposedBlockFlag(pReader, true);
  int64_t et = taosGetTimestampUs();

  // the window keys are signed timestamps, so they are printed with PRId64 (the uid stays unsigned)
  tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRId64 "-%" PRId64
            " rows:%d, elapsed time:%.2f ms %s",
            pReader, pBlockScanInfo->uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
            pResBlock->info.rows, (et - st) / 1000.0, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}

// Record in the reader status whether the block currently held in the result buffer was composed row by row
// (true) or stands for a whole file block returned as-is (false).
void setComposedBlockFlag(STsdbReader* pReader, bool composed) {
  pReader->status.composedDataBlock = composed;
}

1532
// Create, once per table, the iterators over the mem and imem tables of the read snapshot and build the delete
// skyline of the table.
//
// The iterators are positioned at (window.skey, minVer) for ascending scans and at (window.ekey, maxVer) for
// descending scans. pBlockScanInfo->iterInit guards against doing the work twice.
//
// Returns TSDB_CODE_SUCCESS, the error of a failed iterator creation, or the error reported while building the
// delete skyline.
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;

  TSDBKEY startKey = {0};
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = (TSDBKEY){.ts = pReader->window.skey, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pReader->window.ekey, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!ASCENDING_TRAVERSE(pReader->order));

  // mem table
  STbData* d = NULL;
  if (pReader->pReadSnap->pMem != NULL) {
    tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid, &d);
    if (d != NULL) {
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);

        tsdbDebug("%p uid:%" PRIu64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  // imem table
  STbData* di = NULL;
  if (pReader->pReadSnap->pIMem != NULL) {
    tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid, &di);
    if (di != NULL) {
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
      if (code == TSDB_CODE_SUCCESS) {
        pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);

        tsdbDebug("%p uid:%" PRIu64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
      } else {
        tsdbError("%p uid:%" PRIu64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
        return code;
      }
    }
  } else {
    tsdbDebug("%p uid:%" PRIu64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
  }

  code = initDelSkylineIterator(pBlockScanInfo, pReader, d, di);

  // the iterators exist at this point, so mark them initialized even if the skyline could not be built;
  // otherwise a later call would create them a second time
  pBlockScanInfo->iterInit = true;
  return code;
}

dengyihao's avatar
dengyihao 已提交
1596 1597
// Build the delete skyline of one table and reset the skyline cursors of its scan info.
//
// Delete records are gathered from three places: the delete file of the read snapshot (if there is one), the
// delete list hanging off the mem table data (pMemTbData) and the one hanging off the imem table data
// (piMemTbData); either table-data pointer may be NULL. When at least one record exists, the records are folded
// into pBlockScanInfo->delSkyline. The cursors (iter.index, iiter.index, fileDelIndex) start at the first skyline
// entry for ascending scans and at the last one for descending scans (-1 when there is no skyline).
//
// Does nothing when the skyline already exists. Returns TSDB_CODE_SUCCESS or an error code; on a failure before
// the skyline is built the cursors are left untouched.
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }

  int32_t code = TSDB_CODE_SUCCESS;
  STsdb*  pTsdb = pReader->pTsdb;

  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));
  if (pDelData == NULL) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  // 1. delete records persisted in the delete file
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
  if (pDelFile) {
    SDelFReader* pDelFReader = NULL;
    code = tsdbDelFReaderOpen(&pDelFReader, pDelFile, pTsdb, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    SArray* aDelIdx = taosArrayInit(4, sizeof(SDelIdx));
    if (aDelIdx == NULL) {
      // without an explicit error code this path would report success
      code = TSDB_CODE_OUT_OF_MEMORY;
      tsdbDelFReaderClose(&pDelFReader);
      goto _err;
    }

    code = tsdbReadDelIdx(pDelFReader, aDelIdx, NULL);
    if (code != TSDB_CODE_SUCCESS) {
      taosArrayDestroy(aDelIdx);
      tsdbDelFReaderClose(&pDelFReader);
      goto _err;
    }

    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
    SDelIdx* pIdx = taosArraySearch(aDelIdx, &idx, tCmprDelIdx, TD_EQ);

    if (pIdx != NULL) {
      code = tsdbReadDelData(pDelFReader, pIdx, pDelData, NULL);
    }

    taosArrayDestroy(aDelIdx);
    tsdbDelFReaderClose(&pDelFReader);

    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  // 2. delete records still held by the mem table
  SDelData* p = NULL;
  if (pMemTbData != NULL) {
    p = pMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
  }

  // 3. delete records still held by the imem table
  if (piMemTbData != NULL) {
    p = piMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
  }

  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    if (pBlockScanInfo->delSkyline == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _err;
    }

    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);

  // convert the size before subtracting: with no skyline the descending start index is -1 by plain signed
  // arithmetic instead of by unsigned wrap-around
  pBlockScanInfo->iter.index =
      ASCENDING_TRAVERSE(pReader->order) ? 0 : (int32_t)taosArrayGetSize(pBlockScanInfo->delSkyline) - 1;
  pBlockScanInfo->iiter.index = pBlockScanInfo->iter.index;
  pBlockScanInfo->fileDelIndex = pBlockScanInfo->iter.index;
  return code;

_err:
  taosArrayDestroy(pDelData);
  return code;
}

1677 1678 1679
// Return the key of the next valid buffered (mem/imem) row of the table that owns the current file block.
//
// - neither buffer has a valid row: the returned key has ts == TSKEY_INITIAL_VAL
// - only one buffer has a valid row: that row's key
// - both have one: the key that comes first in scan direction (smaller ts for ascending scans, larger ts for
//   descending scans); on equal timestamps the mem key is returned
//
// Comparing against the TSKEY_INITIAL_VAL placeholder is avoided on purpose: the result must not depend on the
// numeric value of the placeholder when only the imem buffer holds data.
static TSDBKEY getCurrentKeyInBuf(SDataBlockIter* pBlockIter, STsdbReader* pReader) {
  TSDBKEY key = {.ts = TSKEY_INITIAL_VAL};

  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(pBlockIter);
  STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));

  // this function has no error channel, so a failure here only shows up as missing buffered rows
  initMemDataIterator(pScanInfo, pReader);

  TSDBROW* pRow = getValidRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
  bool     hasMemKey = (pRow != NULL);
  if (hasMemKey) {
    key = TSDBROW_KEY(pRow);
  }

  TSDBROW* piRow = getValidRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
  if (piRow == NULL) {
    return key;
  }

  TSDBKEY ikey = TSDBROW_KEY(piRow);
  if (!hasMemKey) {
    return ikey;
  }

  if (ASCENDING_TRAVERSE(pReader->order)) {
    return (key.ts > ikey.ts) ? ikey : key;
  }

  return (ikey.ts > key.ts) ? ikey : key;
}

H
Haojun Liao 已提交
1700 1701
// Advance the fileset iterator until a fileset with at least one valid table is found or no fileset is left.
//
// For every fileset the block index is loaded; when it is not empty the file blocks are loaded as well, which
// reports the number of valid tables and fills *numOfBlocks. Returns the first load error, otherwise
// TSDB_CODE_SUCCESS (also when the filesets are exhausted).
static int32_t moveToNextFile(STsdbReader* pReader, int32_t* numOfBlocks) {
  SReaderStatus* pStatus = &pReader->status;
  int32_t        code = TSDB_CODE_SUCCESS;

  size_t  tableCount = taosHashGetSize(pStatus->pTableMap);
  SArray* pBlockIdxList = taosArrayInit(tableCount, sizeof(SBlockIdx));

  // the loop ends when there are no more data files on disk
  while (filesetIteratorNext(&pStatus->fileIter, pReader)) {
    taosArrayClear(pBlockIdxList);

    code = doLoadBlockIndex(pReader, pReader->pFileReader, pBlockIdxList);
    if (code != TSDB_CODE_SUCCESS) {
      break;
    }

    // no blocks in current file, try next files
    if (taosArrayGetSize(pBlockIdxList) == 0) {
      continue;
    }

    uint32_t numOfValidTable = 0;
    code = doLoadFileBlock(pReader, pBlockIdxList, &numOfValidTable, numOfBlocks);
    if (code != TSDB_CODE_SUCCESS || numOfValidTable > 0) {
      break;
    }
  }

  taosArrayDestroy(pBlockIdxList);
  return code;
}

1738 1739 1740
// Produce the next result block for the file block the block iterator currently points at. One of three things
// happens:
//   1. the file block has to be loaded: load it and compose the result row by row
//   2. buffered rows lie in the gap before the file block: return buffered rows up to the block boundary
//   3. otherwise: describe the whole file block in the result block info and mark the block as dumped
static int32_t doBuildDataBlock(STsdbReader* pReader) {
  SReaderStatus*  pStatus = &pReader->status;
  SDataBlockIter* pIter = &pStatus->blockIter;

  SFileDataBlockInfo*  pBlockInfo = getCurrentBlockInfo(pIter);
  STableBlockScanInfo* pTableInfo = taosHashGet(pStatus->pTableMap, &pBlockInfo->uid, sizeof(pBlockInfo->uid));
  SBlock*              pCurBlock = getCurrentBlock(pIter);

  TSDBKEY bufKey = getCurrentKeyInBuf(pIter, pReader);

  if (fileBlockShouldLoad(pReader, pBlockInfo, pCurBlock, pTableInfo, bufKey)) {
    tBlockDataReset(&pStatus->fileBlockData);
    tBlockDataClearData(&pStatus->fileBlockData);

    int32_t code = doLoadFileBlockData(pReader, pIter, pTableInfo, &pStatus->fileBlockData);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // build composed data block
    return buildComposedDataBlock(pReader, pTableInfo);
  }

  if (bufferDataInFileBlockGap(pReader->order, bufKey, pCurBlock)) {
    // data in memory that are earlier than current file block
    // todo rows in buffer should be less than the file block in asc, greater than file block in desc
    int64_t boundary = pCurBlock->maxKey.ts;
    if (ASCENDING_TRAVERSE(pReader->order)) {
      boundary = pCurBlock->minKey.ts;
    }

    return buildDataBlockFromBuf(pReader, pTableInfo, boundary);
  }

  // whole block is required, return it directly
  SDataBlockInfo* pResInfo = &pReader->pResBlock->info;
  pResInfo->rows = pCurBlock->nRow;
  pResInfo->uid = pTableInfo->uid;
  pResInfo->window = (STimeWindow){.skey = pCurBlock->minKey.ts, .ekey = pCurBlock->maxKey.ts};

  setComposedBlockFlag(pReader, false);
  setBlockAllDumped(&pStatus->fBlockDumpInfo, pCurBlock, pReader->order);

  return TSDB_CODE_SUCCESS;
}

H
Haojun Liao 已提交
1777
// Walk the table map and return buffered (mem/imem) rows of the first table that still has some.
//
// The position in the table map is kept in pStatus->pTableIter, so consecutive calls resume at the table the
// previous call stopped at. Returns TSDB_CODE_SUCCESS with rows in pReader->pResBlock, TSDB_CODE_SUCCESS after
// the last table was visited (pTableIter is NULL then), or the error of buildDataBlockFromBuf().
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
  SReaderStatus* pStatus = &pReader->status;

  // first call, or the previous pass reached the end: (re)start from the first table
  if (pStatus->pTableIter == NULL) {
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
  }

  for (; pStatus->pTableIter != NULL;
       pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter)) {
    STableBlockScanInfo* pTableInfo = pStatus->pTableIter;
    initMemDataIterator(pTableInfo, pReader);

    // no file block bounds the buffered rows here, so read up to the end of the key space
    int64_t endKey = INT64_MIN;
    if (ASCENDING_TRAVERSE(pReader->order)) {
      endKey = INT64_MAX;
    }

    int32_t code = buildDataBlockFromBuf(pReader, pTableInfo, endKey);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }

    // current table is exhausted, let's try the next table
  }

  return TSDB_CODE_SUCCESS;
}

1809
// set the correct start position in case of the first/last file block, according to the query time window
1810
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
1811
  SBlock* pBlock = getCurrentBlock(pBlockIter);
1812

1813 1814 1815
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
1816 1817 1818

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
1819
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
1820 1821
}

1822
// Move to the next fileset that has blocks to read and prepare the reader for its first block.
//
// When no fileset is left, pReader->status.loadFromFile is cleared so that the caller falls back to the
// in-memory buffers. Returns TSDB_CODE_SUCCESS or the error of moveToNextFile()/initBlockIterator().
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
  int32_t numOfBlocks = 0;
  int32_t code = moveToNextFile(pReader, &numOfBlocks);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // all data files are consumed, try data in buffer
  if (numOfBlocks == 0) {
    pReader->status.loadFromFile = false;
    return code;
  }

  // initialize the block iterator for a new fileset
  code = initBlockIterator(pReader, pBlockIter, numOfBlocks);
  if (code != TSDB_CODE_SUCCESS) {
    // the iterator is not usable, so its current block must not be dereferenced below
    return code;
  }

  // place the row cursor on the first row to read of the first block
  initBlockDumpInfo(pReader, pBlockIter);
  return code;
}

1843
// Tell whether reading of the current file block has started but not finished: the block is not flagged as
// all-dumped and the row cursor has already left its start position (row 0 for ascending scans, the last row
// for descending scans).
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
  if (pDumpInfo->allDumped) {
    return false;
  }

  if (asc) {
    return pDumpInfo->rowIndex > 0;
  }

  return pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1);
}

1848
// Produce the next non-empty result block from the data files.
//
// A partially read file block is continued; otherwise the reader moves on to the next block (or the next
// fileset) when the current one is fully dumped and builds a block from there. The loop repeats until a block
// with rows was produced, an error occurred, or the data files are exhausted (loadFromFile becomes false).
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
  SReaderStatus*      pStatus = &pReader->status;
  SDataBlockIter*     pBlockIter = &pStatus->blockIter;
  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
  bool                asc = ASCENDING_TRAVERSE(pReader->order);

  for (;;) {
    int32_t code = TSDB_CODE_SUCCESS;

    SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(pBlockIter);
    STableBlockScanInfo* pScanInfo = taosHashGet(pStatus->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));

    if (fileBlockPartiallyRead(pDumpInfo, asc)) {
      // file data block is partially loaded
      code = buildComposedDataBlock(pReader, pScanInfo);
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        // try next data block in current file
        if (blockIteratorNext(pBlockIter)) {
          // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
        } else {
          // data blocks in current file are exhausted, let's try the next file now
          code = initForFirstBlockInFile(pReader, pBlockIter);

          // error happens or all the data files are completely checked
          if ((code != TSDB_CODE_SUCCESS) || (!pStatus->loadFromFile)) {
            return code;
          }
        }
      }

      // current block is not loaded yet, or data in buffer may overlap with the file block.
      code = doBuildDataBlock(pReader);
    }

    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }
  }
}
H
refact  
Hongze Cheng 已提交
1892

1893 1894
// Pick the tsdb instance to query.
//
// For a non-rsma vnode this is the vnode's plain tsdb and *pLevel is left untouched. For an rsma vnode the
// retention levels are walked from level 0 upwards: the walk stops at the first level whose keep span still
// covers winSKey (now - keep <= winSKey), or steps back one level when a level with keep <= 0 is reached. The
// selected level is stored in *pLevel and the matching rsma tsdb is returned.
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
  if (!VND_IS_RSMA(pVnode)) {
    return VND_TSDB(pVnode);
  }

  int8_t  level = 0;
  int64_t now = taosGetTimestamp(pVnode->config.tsdbCfg.precision);

  while (level < TSDB_RETENTION_MAX) {
    SRetention* pRetention = retentions + level;

    if (pRetention->keep <= 0) {
      // this level is not configured: fall back to the previous one, if any
      if (level > 0) {
        --level;
      }
      break;
    }

    if ((now - pRetention->keep) <= winSKey) {
      break;
    }

    ++level;
  }

  const char* str = "";
  if (idStr != NULL) {
    str = idStr;
  }

  if (level == TSDB_RETENTION_L0) {
    *pLevel = TSDB_RETENTION_L0;
    tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L0, str);
    return VND_RSMA0(pVnode);
  }

  if (level == TSDB_RETENTION_L1) {
    *pLevel = TSDB_RETENTION_L1;
    tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L1, str);
    return VND_RSMA1(pVnode);
  }

  // every other value, including a walk that ran past the last level, selects level 2
  *pLevel = TSDB_RETENTION_L2;
  tsdbDebug("vgId:%d, rsma level %d is selected to query %s", TD_VID(pVnode), TSDB_RETENTION_L2, str);
  return VND_RSMA2(pVnode);
}

H
Haojun Liao 已提交
1933
// Derive the version range of a query from its condition.
//
// A startVersion of -1 means "from version 0". An endVersion of -1 (not specified by the user) or one beyond
// the vnode's applied version is clamped to the applied version. The level argument is not used.
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
  int64_t startVer = pCond->startVersion;
  if (pCond->startVersion == -1) {
    startVer = 0;
  }

  // default to the current maximum version of the vnode; only a smaller, explicit end version overrides it
  int64_t endVer = pVnode->state.applied;
  if (pCond->endVersion != -1 && pCond->endVersion <= pVnode->state.applied) {
    endVer = pCond->endVersion;
  }

  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
}

H
Hongze Cheng 已提交
1947 1948 1949 1950
// // todo not unref yet, since it is not support multi-group interpolation query
// static UNUSED_FUNC void changeQueryHandleForInterpQuery(STsdbReader* pHandle) {
//   // filter the queried time stamp in the first place
//   STsdbReader* pTsdbReadHandle = (STsdbReader*)pHandle;
H
refact  
Hongze Cheng 已提交
1951

H
Hongze Cheng 已提交
1952 1953
//   // starts from the buffer in case of descending timestamp order check data blocks
//   size_t numOfTables = taosArrayGetSize(pTsdbReadHandle->pTableCheckInfo);
H
refact  
Hongze Cheng 已提交
1954

H
Hongze Cheng 已提交
1955 1956
//   int32_t i = 0;
//   while (i < numOfTables) {
H
Haojun Liao 已提交
1957
//     STableBlockScanInfo* pCheckInfo = taosArrayGet(pTsdbReadHandle->pTableCheckInfo, i);
H
refact  
Hongze Cheng 已提交
1958

H
Hongze Cheng 已提交
1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972
//     // the first qualified table for interpolation query
//     //    if ((pTsdbReadHandle->window.skey <= pCheckInfo->pTableObj->lastKey) &&
//     //        (pCheckInfo->pTableObj->lastKey != TSKEY_INITIAL_VAL)) {
//     //      break;
//     //    }

//     i++;
//   }

//   // there are no data in all the tables
//   if (i == numOfTables) {
//     return;
//   }

H
Haojun Liao 已提交
1973
//   STableBlockScanInfo info = *(STableBlockScanInfo*)taosArrayGet(pTsdbReadHandle->pTableCheckInfo, i);
H
Hongze Cheng 已提交
1974 1975 1976 1977 1978 1979
//   taosArrayClear(pTsdbReadHandle->pTableCheckInfo);

//   info.lastKey = pTsdbReadHandle->window.skey;
//   taosArrayPush(pTsdbReadHandle->pTableCheckInfo, &info);
// }

1980
// Tell whether the row version identified by pKey is covered by the delete skyline pDelList.
//
// pDelList is an array of TSDBKEY skyline points (NULL means "nothing deleted"). *index is a cursor into that
// array which is kept between calls: it is advanced for ascending scans and moved backwards for descending
// scans while the skyline segment around pKey->ts is searched. A row counts as dropped when its timestamp lies
// inside a segment whose starting point carries a version >= pKey->version.
bool hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order) {
  ASSERT(pKey != NULL);
  if (pDelList == NULL) {
    return false;
  }

  size_t num = taosArrayGetSize(pDelList);

  if (ASCENDING_TRAVERSE(order)) {
    if (*index >= num - 1) {
      // the cursor already sits on the last skyline point
      TSDBKEY* pLast = taosArrayGetLast(pDelList);
      ASSERT(pKey->ts >= pLast->ts);

      if (pKey->ts == pLast->ts) {
        TSDBKEY* pBeforeLast = taosArrayGet(pDelList, num - 2);
        return (pBeforeLast->version >= pKey->version);
      }

      return false;
    }

    TSDBKEY* pSegStart = taosArrayGet(pDelList, *index);
    TSDBKEY* pSegEnd = taosArrayGet(pDelList, (*index) + 1);

    if (pKey->ts < pSegStart->ts) {
      return false;
    }

    if (pSegStart->ts <= pKey->ts && pSegEnd->ts >= pKey->ts && pSegStart->version >= pKey->version) {
      return true;
    }

    // move the cursor forward until the segment end lies beyond the key
    while (pSegEnd->ts <= pKey->ts && (*index) < num - 1) {
      (*index) += 1;

      if ((*index) >= num - 1) {
        break;
      }

      pSegStart = taosArrayGet(pDelList, *index);
      pSegEnd = taosArrayGet(pDelList, (*index) + 1);

      // it is not a consecutive deletion range, ignore it
      if (pSegStart->version == 0 && pSegEnd->version > 0) {
        continue;
      }

      if (pSegStart->ts <= pKey->ts && pSegEnd->ts >= pKey->ts && pSegStart->version >= pKey->version) {
        return true;
      }
    }

    return false;
  }

  // descending scan
  if (*index <= 0) {
    // the cursor already sits on (or before) the first skyline point
    TSDBKEY* pFirst = taosArrayGet(pDelList, 0);

    if (pKey->ts < pFirst->ts) {
      return false;
    }

    if (pKey->ts == pFirst->ts) {
      return pFirst->version >= pKey->version;
    }

    ASSERT(0);
    return false;
  }

  TSDBKEY* pSegEnd = taosArrayGet(pDelList, *index);
  TSDBKEY* pSegStart = taosArrayGet(pDelList, (*index) - 1);

  if (pKey->ts > pSegEnd->ts) {
    return false;
  }

  if (pSegStart->ts <= pKey->ts && pSegEnd->ts >= pKey->ts && pSegStart->version >= pKey->version) {
    return true;
  }

  // move the cursor backwards until the segment start lies before the key
  while (pSegStart->ts >= pKey->ts && (*index) > 1) {
    (*index) -= 1;

    if ((*index) >= 1) {
      pSegEnd = taosArrayGet(pDelList, *index);
      pSegStart = taosArrayGet(pDelList, (*index) - 1);

      // it is not a consecutive deletion range, ignore it
      if (pSegEnd->version > 0 && pSegStart->version == 0) {
        continue;
      }

      if (pSegStart->ts <= pKey->ts && pSegEnd->ts >= pKey->ts && pSegStart->version >= pKey->version) {
        return true;
      }
    }
  }

  return false;
}

// Return the row under the iterator if it is visible to the reader, skipping forward over invisible rows.
//
// A row is visible when its version lies inside pReader->verRange and it is not covered by the delete skyline
// pDelList. The search stops, with pIter->hasVal cleared and NULL returned, as soon as a row outside the query
// time window is met or the iterator runs out of rows. NULL is also returned when the iterator had no value to
// begin with.
TSDBROW* getValidRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
  if (!pIter->hasVal) {
    return NULL;
  }

  TSDBROW* pCandidate = tsdbTbDataIterGet(pIter->iter);

  for (;;) {
    TSDBKEY rowKey = TSDBROW_KEY(pCandidate);

    if (outOfTimeWindow(rowKey.ts, &pReader->window)) {
      pIter->hasVal = false;
      return NULL;
    }

    // the skyline is only consulted (and its cursor only moved) for rows inside the version range
    bool inVerRange = (rowKey.version <= pReader->verRange.maxVer && rowKey.version >= pReader->verRange.minVer);
    if (inVerRange && !hasBeenDropped(pDelList, &pIter->index, &rowKey, pReader->order)) {
      return pCandidate;
    }

    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      return NULL;
    }

    pCandidate = tsdbTbDataIterGet(pIter->iter);
  }
}
H
Hongze Cheng 已提交
2118

2119 2120
// Merge into pMerger every following visible row of the buffer iterator that carries the timestamp ts.
//
// The iterator is advanced first, so the row it points at on entry is not merged here. Merging stops when the
// iterator is exhausted, no visible row is left, or a row with a different timestamp is reached (the iterator
// then stays on that row). A row whose schema version differs from the reader's cached schema is merged with a
// schema fetched for that version, which is released afterwards unless it became the cached schema.
// Always returns TSDB_CODE_SUCCESS.
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
                         STsdbReader* pReader) {
  for (;;) {
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
      return TSDB_CODE_SUCCESS;
    }

    // data exists but not valid
    TSDBROW* pNextRow = getValidRow(pIter, pDelList, pReader);
    if (pNextRow == NULL) {
      return TSDB_CODE_SUCCESS;
    }

    // ts is not identical, quit
    TSDBKEY nextKey = TSDBROW_KEY(pNextRow);
    if (nextKey.ts != ts) {
      return TSDB_CODE_SUCCESS;
    }

    int32_t   sversion = TSDBROW_SVERSION(pNextRow);
    STSchema* pRowSchema = pReader->pSchema;

    if (pReader->pSchema == NULL || sversion != pReader->pSchema->version) {
      pRowSchema = NULL;
      metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pRowSchema);

      // the first schema ever fetched becomes the reader's cached schema
      if (pReader->pSchema == NULL) {
        pReader->pSchema = pRowSchema;
      }
    }

    tRowMergerAdd(pMerger, pNextRow, pRowSchema);

    if (pRowSchema != pReader->pSchema) {
      taosMemoryFree(pRowSchema);
    }
  }
}

2160
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
2161
                                          SVersionRange* pVerRange, int32_t step) {
2162 2163
  while (pBlockData->aTSKEY[rowIndex] == key && rowIndex < pBlockData->nRow && rowIndex >= 0) {
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
2164
      rowIndex += step;
2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
    tRowMerge(pMerger, &fRow);
    rowIndex += step;
  }

  return rowIndex;
}

// Outcome of a neighbor-file-block check, telling the caller whether to look at yet another neighbor block.
typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,  // the neighbor block was consumed completely, keep checking
  CHECK_FILEBLOCK_QUIT = 0x2,  // stop checking (the initial state of every check)
} CHECK_FILEBLOCK_STATE;

static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SBlock* pBlock,
2182 2183
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
2184
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
2185
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
2186

2187
  *state = CHECK_FILEBLOCK_QUIT;
2188
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
2189 2190 2191

  int32_t nextIndex = -1;
  SBlock* pNeighborBlock = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &nextIndex, pReader->order);
2192
  if (pNeighborBlock == NULL) {  // do nothing
2193 2194 2195 2196
    return 0;
  }

  bool overlap = overlapWithNeighborBlock(pBlock, pNeighborBlock, pReader->order);
2197 2198
  taosMemoryFree(pNeighborBlock);

2199
  if (overlap) {  // load next block
2200
    SReaderStatus*  pStatus = &pReader->status;
2201 2202
    SDataBlockIter* pBlockIter = &pStatus->blockIter;

2203
    // 1. find the next neighbor block in the scan block list
2204
    SFileDataBlockInfo fb = {.uid = pFBlock->uid, .tbBlockIdx = nextIndex};
2205
    int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &fb);
2206

2207
    // 2. remove it from the scan block list
2208
    setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);
2209

2210
    // 3. load the neighbor block, and set it to be the currently accessed file data block
H
Haojun Liao 已提交
2211 2212
    tBlockDataReset(&pStatus->fileBlockData);
    tBlockDataClearData(&pStatus->fileBlockData);
2213 2214 2215 2216 2217
    int32_t code = doLoadFileBlockData(pReader, pBlockIter, pScanInfo, &pStatus->fileBlockData);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2218
    // 4. check the data values
2219 2220 2221 2222
    initBlockDumpInfo(pReader, pBlockIter);

    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
H
Haojun Liao 已提交
2223
    if (pDumpInfo->rowIndex >= pDumpInfo->totalRows) {
2224 2225 2226 2227 2228 2229 2230
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

  return TSDB_CODE_SUCCESS;
}

2231 2232
// Merge into pMerger all file rows that share the timestamp of the row under the dump cursor.
//
// The cursor is moved one step in scan direction first (the row it pointed at is expected to be in pMerger
// already), then the run of equal timestamps in the current block is merged. If that run reaches the end of
// the block, the overlapping neighbor blocks of the same table are checked one after another for as long as
// each of them is consumed completely.
//
// Returns TSDB_CODE_SUCCESS, or the error of loading a neighbor block.
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                SRowMerger* pMerger) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  int32_t step = asc ? 1 : -1;

  pDumpInfo->rowIndex += step;
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) || (pDumpInfo->rowIndex >= 0 && !asc)) {
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }

  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      CHECK_FILEBLOCK_STATE st = CHECK_FILEBLOCK_QUIT;

      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
      SBlock*             pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);

      // a block that could not be loaded must surface as an error instead of silently ending the merge
      int32_t code = checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }

      if (st == CHECK_FILEBLOCK_QUIT) {
        break;
      }
    }
  }

  return TSDB_CODE_SUCCESS;
}

2262
// Make sure pReader->pSchema matches the schema version of pRow: a cached schema of the right version is kept,
// a cached schema of another version is released first, and the schema is then fetched for the row's version.
void updateSchema(TSDBROW* pRow, uint64_t uid, STsdbReader* pReader) {
  int32_t sversion = TSDBROW_SVERSION(pRow);

  if (pReader->pSchema != NULL) {
    if (pReader->pSchema->version == sversion) {
      return;  // cached schema is up to date
    }

    taosMemoryFreeClear(pReader->pSchema);
  }

  metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pSchema);
}

dengyihao's avatar
dengyihao 已提交
2273 2274
/**
 * @brief Merge the given row with the following rows of one buffer iterator into a single STSRow.
 *
 * If the reader's schema is missing or has a different version than the row, a schema for the row's
 * version is fetched. It becomes the reader's schema when the reader had none; otherwise it is a
 * temporary that is freed before returning.
 *
 * @param pRow     first row to merge
 * @param uid      table uid
 * @param pIter    buffer iterator the row came from
 * @param pDelList delete skyline forwarded to doMergeRowsInBuf()
 * @param pTSRow   output: the merged row
 * @param pReader  reader
 */
void doMergeMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, STSRow** pTSRow,
                      STsdbReader* pReader) {
  // capture key and version before the iterator is advanced
  TSDBKEY rowKey = TSDBROW_KEY(pRow);
  int32_t rowVersion = TSDBROW_SVERSION(pRow);

  STSchema* pRowSchema = pReader->pSchema;
  bool      reuseReaderSchema = (pReader->pSchema != NULL) && (rowVersion == pReader->pSchema->version);
  if (!reuseReaderSchema) {
    pRowSchema = NULL;
    metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, rowVersion, &pRowSchema);
    if (pReader->pSchema == NULL) {
      pReader->pSchema = pRowSchema;
    }
  }

  SRowMerger merger = {0};
  tRowMergerInit2(&merger, pReader->pSchema, pRow, pRowSchema);
  doMergeRowsInBuf(pIter, uid, rowKey.ts, pDelList, &merger, pReader);
  tRowMergerGetRow(&merger, pTSRow);
  tRowMergerClear(&merger);

  // a schema fetched only for this row is not kept
  if (pRowSchema != pReader->pSchema) {
    taosMemoryFree(pRowSchema);
  }
}

2300 2301
/**
 * @brief Merge one row from the mem iterator and one from the imem iterator into a single STSRow.
 *
 * In ascending order the imem row is merged first and the mem row on top of it; in descending order the
 * sequence is reversed. After each seed row the remaining rows of the same iterator are merged through
 * doMergeRowsInBuf().
 *
 * The merger is released with tRowMergerClear() once the result has been extracted, matching
 * doMergeMultiRows(); previously it was never cleared.
 *
 * @param pRow           current row of the mem iterator
 * @param piRow          current row of the imem iterator
 * @param pBlockScanInfo per-table scan state holding both iterators and the delete skyline
 * @param pReader        reader
 * @param pTSRow         output: the merged row
 */
void doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
                        STSRow** pTSRow) {
  SRowMerger merge = {0};

  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
    updateSchema(piRow, pBlockScanInfo->uid, pReader);

    tRowMergerInit(&merge, piRow, pReader->pSchema);
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);

    tRowMerge(&merge, pRow);
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
  } else {  // descending order mem --> imem
    updateSchema(pRow, pBlockScanInfo->uid, pReader);

    tRowMergerInit(&merge, pRow, pReader->pSchema);
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);

    tRowMerge(&merge, piRow);
    // use the imem row's own key for the imem iterator (the visible caller only gets here when ik.ts == k.ts)
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
  }

  tRowMergerGetRow(&merge, pTSRow);
  tRowMergerClear(&merge);
}

2328 2329
/**
 * @brief Build the next merged row from the in-memory buffers (mem and imem) of one table.
 *
 * A candidate row is dropped when its timestamp has reached endKey in the scan direction. If both
 * buffers still offer a row, the one that comes first in scan order is emitted: the smaller timestamp
 * for ascending scans, the larger one for descending scans. Equal timestamps are merged by
 * doMergeMemIMemRows(). *pTSRow is left untouched when no row qualifies.
 *
 * The buffer choice previously ignored the scan order and always emitted the smaller timestamp.
 *
 * @param pBlockScanInfo per-table scan state (iterators, delete skyline, uid)
 * @param pReader        reader
 * @param pTSRow         output: the merged row, if any
 * @param endKey         exclusive bound in scan direction
 * @return int32_t       always TSDB_CODE_SUCCESS
 */
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STSRow** pTSRow,
                            int64_t endKey) {
  TSDBROW* pRow = getValidRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
  SArray*  pDelList = pBlockScanInfo->delSkyline;

  // todo refactor
  bool asc = ASCENDING_TRAVERSE(pReader->order);
  if (pBlockScanInfo->iter.hasVal && pRow != NULL) {
    TSDBKEY k = TSDBROW_KEY(pRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      pRow = NULL;
    }
  }

  if (pBlockScanInfo->iiter.hasVal && piRow != NULL) {
    TSDBKEY k = TSDBROW_KEY(piRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      piRow = NULL;
    }
  }

  if (pBlockScanInfo->iter.hasVal && pBlockScanInfo->iiter.hasVal && pRow != NULL && piRow != NULL) {
    TSDBKEY k = TSDBROW_KEY(pRow);
    TSDBKEY ik = TSDBROW_KEY(piRow);

    if (ik.ts == k.ts) {
      doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, pTSRow);
    } else {
      // the row that comes first in scan order must be emitted first
      bool imemFirst = asc ? (ik.ts < k.ts) : (ik.ts > k.ts);
      if (imemFirst) {
        doMergeMultiRows(piRow, pBlockScanInfo->uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader);
      } else {
        doMergeMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader);
      }
    }

    return TSDB_CODE_SUCCESS;
  }

  if (pBlockScanInfo->iter.hasVal && pRow != NULL) {
    doMergeMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader);
    return TSDB_CODE_SUCCESS;
  }

  if (pBlockScanInfo->iiter.hasVal && piRow != NULL) {
    doMergeMultiRows(piRow, pBlockScanInfo->uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader);
    return TSDB_CODE_SUCCESS;
  }

  return TSDB_CODE_SUCCESS;
}

2378
/**
 * @brief Append one STSRow to the result block as a new row.
 *
 * If the first output column is the primary timestamp column it receives pTSRow->ts. The remaining
 * output columns are matched against pReader->pSchema by column id with a two-cursor walk; the walk
 * relies on both lists being ordered by column id. Output columns without a schema counterpart are
 * set to NULL.
 *
 * @param pBlock  result block; info.rows is incremented by one
 * @param pReader reader providing the schema and the copy support info
 * @param pTSRow  row to append
 * @return int32_t always TSDB_CODE_SUCCESS
 */
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow) {
  const int32_t rowPos = pBlock->info.rows;
  const int32_t numOfOutput = (int32_t)taosArrayGetSize(pBlock->pDataBlock);

  STSchema* pTSchema = pReader->pSchema;
  SColVal   value = {0};
  int32_t   outIdx = 0;
  int32_t   schIdx = 0;

  SColumnInfoData* pOutCol = taosArrayGet(pBlock->pDataBlock, outIdx);
  if (pOutCol->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppend(pOutCol, rowPos, (const char*)&pTSRow->ts, false);
    outIdx += 1;
  }

  while (outIdx < numOfOutput && schIdx < pTSchema->numOfCols) {
    pOutCol = taosArrayGet(pBlock->pDataBlock, outIdx);
    col_id_t wantedId = pOutCol->info.colId;

    if (wantedId > pTSchema->columns[schIdx].colId) {
      // schema column is not requested, skip it
      schIdx += 1;
      continue;
    }

    if (wantedId < pTSchema->columns[schIdx].colId) {
      // requested column is absent from the schema
      colDataAppendNULL(pOutCol, rowPos);
      outIdx += 1;
      continue;
    }

    tTSRowGetVal(pTSRow, pReader->pSchema, schIdx, &value);
    doCopyColVal(pOutCol, rowPos, outIdx, &value, &pReader->suppInfo);
    outIdx += 1;
    schIdx += 1;
  }

  // set null value since current column does not exist in the "pSchema"
  for (; outIdx < numOfOutput; outIdx += 1) {
    pOutCol = taosArrayGet(pBlock->pDataBlock, outIdx);
    colDataAppendNULL(pOutCol, rowPos);
  }

  pBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

2422 2423 2424 2425 2426 2427 2428 2429 2430
/**
 * @brief Append row `rowIndex` of a file block to the result block as a new row.
 *
 * If the first output column is the primary timestamp column it receives the block's key at rowIndex.
 * The remaining output columns are matched against the block's columns by column id with a three-way
 * merge, the same scheme as doAppendRowFromTSRow(): an input column that is not requested is skipped,
 * a requested column missing from the block is set to NULL. Both lists are assumed to be ordered by
 * column id.
 *
 * The earlier two-way comparison never advanced past an unrequested input column, which caused every
 * following output column to be filled with NULL.
 *
 * @param pResBlock  result block; info.rows is incremented by one
 * @param pReader    reader providing the copy support info
 * @param pBlockData loaded file block
 * @param rowIndex   row inside pBlockData to copy
 * @return int32_t   always TSDB_CODE_SUCCESS
 */
int32_t doAppendRowFromBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData, int32_t rowIndex) {
  int32_t i = 0, j = 0;
  int32_t outputRowIndex = pResBlock->info.rows;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppendInt64(pColData, outputRowIndex, &pBlockData->aTSKEY[rowIndex]);
    i += 1;
  }

  SColVal cv = {0};
  int32_t numOfInputCols = taosArrayGetSize(pBlockData->aIdx);
  int32_t numOfOutputCols = blockDataGetNumOfCols(pResBlock);

  while (i < numOfOutputCols && j < numOfInputCols) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
    SColData*        pData = tBlockDataGetColDataByIdx(pBlockData, j);

    if (pData->cid < pCol->info.colId) {
      // input column is not part of the output, move on to the next input column
      j += 1;
    } else if (pData->cid == pCol->info.colId) {
      tColDataGetValue(pData, rowIndex, &cv);
      doCopyColVal(pCol, outputRowIndex, i, &cv, pSupInfo);
      i += 1;
      j += 1;
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNULL(pCol, outputRowIndex);
      i += 1;
    }
  }

  // output columns beyond the last input column have no data in this block
  while (i < numOfOutputCols) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
    colDataAppendNULL(pCol, outputRowIndex);
    i += 1;
  }

  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

2463 2464
/**
 * @brief Fill the reader's result block with rows taken from the in-memory buffers of one table.
 *
 * Rows are fetched one at a time with tsdbGetNextRowInMem() and appended to pReader->pResBlock. The
 * loop ends when no row is returned, when both buffer iterators are drained, or when the block holds
 * `capacity` rows.
 *
 * @param pBlockScanInfo per-table scan state
 * @param endKey         bound forwarded to tsdbGetNextRowInMem()
 * @param capacity       maximum number of rows in the result block
 * @param pReader        reader
 * @return int32_t       always TSDB_CODE_SUCCESS
 */
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
  SSDataBlock* pResult = pReader->pResBlock;

  for (;;) {
    STSRow* pMerged = NULL;
    tsdbGetNextRowInMem(pBlockScanInfo, pReader, &pMerged, endKey);
    if (pMerged == NULL) {
      break;
    }

    doAppendRowFromTSRow(pResult, pReader, pMerged);
    taosMemoryFree(pMerged);

    // no data in buffer, or the block is full: return immediately
    bool drained = !pBlockScanInfo->iter.hasVal && !pBlockScanInfo->iiter.hasVal;
    if (drained || pResult->info.rows >= capacity) {
      break;
    }
  }

  ASSERT(pResult->info.rows <= capacity);
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
2490

2491
// todo refactor, use arraylist instead
H
Hongze Cheng 已提交
2492
/**
 * @brief Replace the reader's table map with a single entry for `uid`.
 *
 * The map is cleared and one zero-initialized STableBlockScanInfo (lastKey 0) is inserted. A failed
 * insertion is now reported instead of being ignored.
 *
 * @param pReader reader, must not be NULL
 * @param uid     table uid to scan
 * @return int32_t TSDB_CODE_SUCCESS, or TSDB_CODE_OUT_OF_MEMORY when the entry cannot be stored
 *                 (NOTE(review): confirm the preferred error code for a failed taosHashPut)
 */
int32_t tsdbSetTableId(STsdbReader* pReader, int64_t uid) {
  ASSERT(pReader != NULL);
  taosHashClear(pReader->status.pTableMap);

  STableBlockScanInfo info = {.lastKey = 0, .uid = uid};
  if (taosHashPut(pReader->status.pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info)) != 0) {
    return TSDB_CODE_OUT_OF_MEMORY;
  }

  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
2501 2502 2503 2504 2505 2506
/**
 * @brief Return the result of metaGetIdx() for the given meta handle.
 *
 * @param pMeta meta handle, may be NULL
 * @return void* NULL when pMeta is NULL, otherwise whatever metaGetIdx() returns
 */
void* tsdbGetIdx(SMeta* pMeta) {
  void* pIdx = NULL;
  if (pMeta != NULL) {
    pIdx = metaGetIdx(pMeta);
  }
  return pIdx;
}
dengyihao's avatar
dengyihao 已提交
2507

dengyihao's avatar
dengyihao 已提交
2508 2509 2510 2511 2512 2513
/**
 * @brief Return the result of metaGetIvtIdx() for the given meta handle.
 *
 * @param pMeta meta handle, may be NULL
 * @return void* NULL when pMeta is NULL, otherwise whatever metaGetIvtIdx() returns
 */
void* tsdbGetIvtIdx(SMeta* pMeta) {
  void* pIvtIdx = NULL;
  if (pMeta != NULL) {
    pIvtIdx = metaGetIvtIdx(pMeta);
  }
  return pIvtIdx;
}
L
Liu Jicong 已提交
2514

2515 2516 2517 2518
/**
 * @brief Return the upper bound of the reader's version range.
 *
 * @param pReader reader
 * @return uint64_t pReader->verRange.maxVer
 */
uint64_t getReaderMaxVersion(STsdbReader* pReader) {
  uint64_t maxVersion = pReader->verRange.maxVer;
  return maxVersion;
}

C
Cary Xu 已提交
2519 2520 2521 2522 2523 2524 2525 2526 2527 2528
/**
 * @brief Get all suids since suid
 *
 * Opens a super-table cursor and pushes every id it yields onto `list` until the cursor returns 0.
 *
 * @param pMeta meta handle
 * @param suid return all suids in one vnode if suid is 0
 * @param list output array receiving the ids
 * @return int32_t TSDB_CODE_FAILED when the cursor cannot be opened, otherwise TSDB_CODE_SUCCESS
 */
int32_t tsdbGetStbIdList(SMeta* pMeta, int64_t suid, SArray* list) {
  SMStbCursor* pCursor = metaOpenStbCursor(pMeta, suid);
  if (pCursor == NULL) {
    return TSDB_CODE_FAILED;
  }

  // an id of 0 marks the end of the cursor
  for (tb_uid_t stbUid = metaStbCursorNext(pCursor); stbUid != 0; stbUid = metaStbCursorNext(pCursor)) {
    taosArrayPush(list, &stbUid);
  }

  metaCloseStbCursor(pCursor);
  return TSDB_CODE_SUCCESS;
}

H
refact  
Hongze Cheng 已提交
2546
// ====================================== EXPOSED APIs ======================================
2547 2548
/**
 * @brief Create a reader for the given tables and position it on the first file block, if any.
 *
 * Steps: create the main reader; return early when the query window is empty; for
 * TIMEWINDOW_RANGE_EXTERNAL conditions create two extra one-row inner readers for the ranges before
 * and after the window (pCond is modified in place for this and is not restored); load a schema; build
 * the table map; take a read snapshot; initialize the file-set and block iterators.
 *
 * The error path logs the caller supplied `idstr`. It used to read pReader->idStr, but pReader is not
 * assigned yet when the first tsdbReaderCreate() fails and has already been released when the table
 * map cannot be created.
 *
 * @param pVnode     vnode to read from
 * @param pCond      query condition; twindows/order are overwritten for external-range queries
 * @param pTableList list of STableKeyInfo to scan
 * @param ppReader   output: the created reader; set to NULL when the table map cannot be created
 * @param idstr      identifier used in log messages
 * @return int32_t   TSDB_CODE_SUCCESS or an error code
 */
int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, SArray* pTableList, STsdbReader** ppReader,
                       const char* idstr) {
  STsdbReader* pReader = NULL;
  int32_t      numOfTables = 0;

  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, 4096, idstr);
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }

  // check for query time window
  pReader = *ppReader;
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }

  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
    STimeWindow w = pCond->twindows;
    int32_t     order = pCond->order;
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.ekey = pCond->twindows.skey;
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
      pCond->twindows.skey = pCond->twindows.ekey;
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

    // the inner readers are created with a capacity of one row
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[0], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.skey = w.ekey;
      pCond->twindows.ekey = INT64_MAX;
    } else {
      pCond->twindows.skey = INT64_MIN;
      pCond->twindows.ekey = w.ekey;
    }
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[1], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  if (pCond->suid != 0) {
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, -1);
  } else if (taosArrayGetSize(pTableList) > 0) {
    STableKeyInfo* pKey = taosArrayGet(pTableList, 0);
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, -1);
  }

  numOfTables = taosArrayGetSize(pTableList);
  pReader->status.pTableMap = createDataBlockScanInfo(pReader, pTableList->pData, numOfTables);
  if (pReader->status.pTableMap == NULL) {
    // the reader is gone after this call; nothing below may touch pReader
    tsdbReaderClose(pReader);
    pReader = NULL;
    *ppReader = NULL;

    code = TSDB_CODE_TDB_OUT_OF_MEMORY;
    goto _err;
  }

  code = tsdbTakeReadSnap(pReader->pTsdb, &pReader->pReadSnap);
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }

  if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
    SDataBlockIter* pBlockIter = &pReader->status.blockIter;

    initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader->order, pReader->idStr);
    resetDataBlockIterator(&pReader->status.blockIter, pReader->order, pReader->status.pTableMap);

    // no data in files, let's try buffer in memory
    if (pReader->status.fileIter.numOfFiles == 0) {
      pReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
  } else {
    STsdbReader*    pPrevReader = pReader->innerReader[0];
    SDataBlockIter* pBlockIter = &pPrevReader->status.blockIter;

    // NOTE(review): only pReader->pReadSnap is taken in this function; confirm that
    // pPrevReader->pReadSnap is populated elsewhere before it is dereferenced here.
    initFilesetIterator(&pPrevReader->status.fileIter, pPrevReader->pReadSnap->fs.aDFileSet, pPrevReader->order,
                        pPrevReader->idStr);
    resetDataBlockIterator(&pPrevReader->status.blockIter, pPrevReader->order, pReader->status.pTableMap);

    // no data in files, let's try buffer in memory
    if (pPrevReader->status.fileIter.numOfFiles == 0) {
      pPrevReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pPrevReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
  }

  tsdbDebug("%p total numOfTable:%d in this query %s", pReader, numOfTables, pReader->idStr);
  return code;

_err:
  // pReader may be unset or already released here, so only the caller's id string is used
  tsdbError("failed to create data reader, code:%s %s", tstrerror(code), (idstr != NULL) ? idstr : "");
  return code;
}

/**
 * @brief Release a reader and everything it owns.
 *
 * Releases, in this order: any remaining inner readers, the read snapshot, the column support buffers,
 * the file block data, the block iterator, the table map, the result block and the file reader. An
 * I/O cost summary is logged before the reader itself is freed.
 *
 * Inner readers were previously released only by tsdbNextDataBlock() once they ran dry, so closing a
 * reader early leaked them.
 *
 * @param pReader reader to close; NULL is ignored
 */
void tsdbReaderClose(STsdbReader* pReader) {
  if (pReader == NULL) {
    return;
  }

  // the recursive call is a no-op for slots that are already NULL
  tsdbReaderClose(pReader->innerReader[0]);
  pReader->innerReader[0] = NULL;
  tsdbReaderClose(pReader->innerReader[1]);
  pReader->innerReader[1] = NULL;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  tsdbUntakeReadSnap(pReader->pTsdb, pReader->pReadSnap);

  taosMemoryFreeClear(pSupInfo->plist);
  taosMemoryFree(pSupInfo->colIds);

  taosArrayDestroy(pSupInfo->pColAgg);
  for (int32_t i = 0; i < blockDataGetNumOfCols(pReader->pResBlock); ++i) {
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }
  taosMemoryFree(pSupInfo->buildBuf);
  tBlockDataClear(&pReader->status.fileBlockData, true);

  cleanupDataBlockIterator(&pReader->status.blockIter);

  // remember the table count for the summary before the map is destroyed
  size_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
  destroyBlockScanInfo(pReader->status.pTableMap);
  blockDataDestroy(pReader->pResBlock);

  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }

  SIOCostSummary* pCost = &pReader->cost;

  tsdbDebug("%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
            " SMA-time:%.2f ms, "
            "fileBlocks:%" PRId64
            ", fileBlocks-time:%.2f ms, build in-memory-block-time:%.2f ms, STableBlockScanInfo "
            "size:%.2f Kb %s",
            pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaData, pCost->smaLoadTime,
            pCost->numOfBlocks, pCost->blockLoadTime, pCost->buildmemBlock,
            numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pReader->idStr);

  taosMemoryFree(pReader->idStr);
  taosMemoryFree(pReader->pSchema);
  taosMemoryFreeClear(pReader);
}

2704
/**
 * @brief Produce the next result block of one reader.
 *
 * The result block is cleaned first. While the reader is still loading from files,
 * buildBlockFromFiles() is tried; if that yields no rows (or files are not used at all) the block is
 * built from the in-memory buffers instead.
 *
 * @param pReader reader
 * @return true if the result block holds at least one row; false otherwise or when building from
 *         files fails
 */
static bool doTsdbNextDataBlock(STsdbReader* pReader) {
  // cleanup the data that belongs to the previous data block
  SSDataBlock* pResult = pReader->pResBlock;
  blockDataCleanup(pResult);

  if (pReader->status.loadFromFile) {
    if (buildBlockFromFiles(pReader) != TSDB_CODE_SUCCESS) {
      return false;
    }

    if (pResult->info.rows > 0) {
      return true;
    }
  }

  // no (more) data in files, let's try the buffer
  buildBlockFromBufferSequentially(pReader);
  return pResult->info.rows > 0;
}

2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767
/**
 * @brief Advance the reader to its next data block.
 *
 * The sources are tried in this order: inner reader 0, the reader itself, inner reader 1.
 * pReader->step records which source delivered the block (EXTERNAL_ROWS_PREV / _MAIN / _NEXT). An
 * inner reader that delivers nothing is closed and its slot is cleared.
 *
 * @param pReader reader
 * @return true when a block is available, false when the window is empty or all sources are drained
 */
bool tsdbNextDataBlock(STsdbReader* pReader) {
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return false;
  }

  STsdbReader* pPrev = pReader->innerReader[0];
  if (pPrev != NULL) {
    if (doTsdbNextDataBlock(pPrev)) {
      pReader->step = EXTERNAL_ROWS_PREV;
      return true;
    }

    tsdbReaderClose(pPrev);
    pReader->innerReader[0] = NULL;
  }

  pReader->step = EXTERNAL_ROWS_MAIN;
  if (doTsdbNextDataBlock(pReader)) {
    return true;
  }

  STsdbReader* pNext = pReader->innerReader[1];
  if (pNext != NULL) {
    if (doTsdbNextDataBlock(pNext)) {
      pReader->step = EXTERNAL_ROWS_NEXT;
      return true;
    }

    tsdbReaderClose(pNext);
    pReader->innerReader[1] = NULL;
  }

  return false;
}

/**
 * @brief Copy rows, uid and window of the reader's result block into pDataBlockInfo.
 *
 * @param pReader        reader, must not be NULL
 * @param pDataBlockInfo destination, must not be NULL
 */
static void setBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
  ASSERT(pDataBlockInfo != NULL && pReader != NULL);

  SSDataBlock* pResult = pReader->pResBlock;
  pDataBlockInfo->rows = pResult->info.rows;
  pDataBlockInfo->uid = pResult->info.uid;
  pDataBlockInfo->window = pResult->info.window;
}

2774 2775
/**
 * @brief Report the block info of whichever reader produced the current block.
 *
 * For external-range readers the source is chosen by pReader->step: the reader itself for
 * EXTERNAL_ROWS_MAIN, inner reader 0 for EXTERNAL_ROWS_PREV, inner reader 1 otherwise. All other
 * readers report their own block.
 *
 * @param pReader        reader
 * @param pDataBlockInfo output block info
 */
void tsdbRetrieveDataBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
  STsdbReader* pSource = pReader;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL && pReader->step != EXTERNAL_ROWS_MAIN) {
    pSource = (pReader->step == EXTERNAL_ROWS_PREV) ? pReader->innerReader[0] : pReader->innerReader[1];
  }

  setBlockInfo(pSource, pDataBlockInfo);
}

2788
/**
 * @brief Load the pre-computed column statistics (SMA) of the current file block.
 *
 * No statistics are returned (*pBlockStatis = NULL, *allHave = false) for external-range readers, for
 * composed blocks, and for blocks without SMA. Otherwise the SMA is read, slot 0 of the result list is
 * filled with a timestamp aggregate built from the result block's window, and the loaded aggregates
 * are matched to the requested column ids with a two-cursor walk. *allHave is cleared again when a
 * matched column fails the IS_BSMA_ON check.
 *
 * @param pReader      reader
 * @param pBlockStatis output: list of per-column aggregates, or NULL
 * @param allHave      output: starts true once the SMA is loaded, cleared by a failed IS_BSMA_ON check
 * @return int32_t     TSDB_CODE_SUCCESS, or the error from tsdbReadBlockSma()
 */
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SColumnDataAgg*** pBlockStatis, bool* allHave) {
  int32_t code = 0;
  *allHave = false;

  // there is no statistics data for external-range readers or composed blocks
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL || pReader->status.composedDataBlock) {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }

  SFileDataBlockInfo* pBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
  SBlock*             pFileBlock = getCurrentBlock(&pReader->status.blockIter);
  int64_t             startUs = taosGetTimestampUs();
  SBlockLoadSuppInfo* pSupp = &pReader->suppInfo;

  if (!tBlockHasSma(pFileBlock)) {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }

  code = tsdbReadBlockSma(pReader->pFileReader, pFileBlock, pSupp->pColAgg, NULL);
  if (code != TSDB_CODE_SUCCESS) {
    tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pBlockInfo->uid, tstrerror(code),
              pReader->idStr);
    return code;
  }

  *allHave = true;

  // always load the first primary timestamp column data
  SColumnDataAgg* pTsAgg = &pSupp->tsColAgg;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
  pTsAgg->numOfNull = 0;
  pTsAgg->min = pReader->pResBlock->info.window.skey;
  pTsAgg->max = pReader->pResBlock->info.window.ekey;
  pSupp->plist[0] = pTsAgg;

  // match the loaded aggregates against the requested column ids
  size_t  numOfCols = blockDataGetNumOfCols(pReader->pResBlock);
  int32_t aggIdx = 0;
  int32_t colIdx = 0;
  while (colIdx < numOfCols && aggIdx < taosArrayGetSize(pSupp->pColAgg)) {
    SColumnDataAgg* pAgg = taosArrayGet(pSupp->pColAgg, aggIdx);

    if (pAgg->colId < pSupp->colIds[colIdx]) {
      aggIdx += 1;
      continue;
    }

    if (pSupp->colIds[colIdx] < pAgg->colId) {
      colIdx += 1;
      continue;
    }

    if (IS_BSMA_ON(&(pReader->pSchema->columns[aggIdx]))) {
      pSupp->plist[colIdx] = pAgg;
    } else {
      *allHave = false;
    }
    aggIdx += 1;
    colIdx += 1;
  }

  double elapsed = (taosGetTimestampUs() - startUs) / 1000.0;
  pReader->cost.smaLoadTime += elapsed;
  pReader->cost.smaData += 1;

  *pBlockStatis = pSupp->plist;

  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", elapsed time:%.2f ms, %s", 0, pBlockInfo->uid,
            elapsed, pReader->idStr);

  return code;
}

2866
/**
 * @brief Return the column array of the current result block, loading file data when needed.
 *
 * A composed block is returned as is. Otherwise the current file block is loaded into
 * status.fileBlockData and copied into the result block.
 *
 * @param pReader reader
 * @return SArray* the result block's column array, or NULL (with terrno set) when loading fails
 */
static SArray* doRetrieveDataBlock(STsdbReader* pReader) {
  SReaderStatus* pState = &pReader->status;

  if (pState->composedDataBlock) {
    return pReader->pResBlock->pDataBlock;
  }

  SFileDataBlockInfo*  pInfo = getCurrentBlockInfo(&pState->blockIter);
  STableBlockScanInfo* pTableScan = taosHashGet(pState->pTableMap, &pInfo->uid, sizeof(pInfo->uid));

  tBlockDataReset(&pState->fileBlockData);
  tBlockDataClearData(&pState->fileBlockData);

  int32_t code = doLoadFileBlockData(pReader, &pState->blockIter, pTableScan, &pState->fileBlockData);
  if (code == TSDB_CODE_SUCCESS) {
    copyBlockDataToSDataBlock(pReader, pTableScan);
    return pReader->pResBlock->pDataBlock;
  }

  tBlockDataClear(&pState->fileBlockData, 1);
  terrno = code;
  return NULL;
}

2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901
/**
 * @brief Retrieve the column data of the current block from the reader that produced it.
 *
 * For external-range readers, step EXTERNAL_ROWS_PREV selects inner reader 0 and EXTERNAL_ROWS_NEXT
 * selects inner reader 1; every other case uses the reader itself.
 *
 * @param pReader reader
 * @param pIdList not used by this function
 * @return SArray* result of doRetrieveDataBlock() for the selected reader
 */
SArray* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
  STsdbReader* pSource = pReader;

  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    if (pReader->step == EXTERNAL_ROWS_PREV) {
      pSource = pReader->innerReader[0];
    } else if (pReader->step == EXTERNAL_ROWS_NEXT) {
      pSource = pReader->innerReader[1];
    }
  }

  return doRetrieveDataBlock(pSource);
}

H
Haojun Liao 已提交
2902
/**
 * @brief Rewind an open reader so it can be scanned again with the order and window of pCond.
 *
 * Nothing is done when the reader's current window is empty. Otherwise order, type, window and the
 * timestamp aggregate are reset, the open file reader (if any) is closed, and the file-set iterator,
 * block iterator and per-table scan info are re-initialized against the existing read snapshot.
 *
 * The file reader used to be closed twice back to back and without a NULL check; it is now closed
 * once, guarded the same way as in tsdbReaderClose().
 *
 * @param pReader reader to reset
 * @param pCond   supplies the new order and time window
 * @return int32_t TSDB_CODE_SUCCESS, or the error from initForFirstBlockInFile()
 */
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return TSDB_CODE_SUCCESS;
  }

  pReader->order = pCond->order;
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
  pReader->status.loadFromFile = true;
  pReader->status.pTableIter = NULL;
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);

  // clear the timestamp aggregate and the first slot of the statistics list
  memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));
  memset(pReader->suppInfo.plist, 0, POINTER_BYTES);

  pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;

  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }

  int32_t numOfTables = taosHashGetSize(pReader->status.pTableMap);

  initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader->order, pReader->idStr);
  resetDataBlockIterator(&pReader->status.blockIter, pReader->order, pReader->status.pTableMap);
  resetDataBlockScanInfo(pReader->status.pTableMap);

  int32_t         code = 0;
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

  // no data in files, let's try buffer in memory
  if (pReader->status.fileIter.numOfFiles == 0) {
    pReader->status.loadFromFile = false;
  } else {
    code = initForFirstBlockInFile(pReader, pBlockIter);
    if (code != TSDB_CODE_SUCCESS) {
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return code;
    }
  }

  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s",
            pReader, pReader->suid, numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);

  return code;
}
H
Hongze Cheng 已提交
2947

2948 2949 2950
/**
 * @brief Map a block's row count to a histogram bucket index.
 *
 * The index is (numOfRows - startRow) / bucketRange. Two degenerate inputs are mapped to bucket 0
 * instead of causing a division by zero or a negative array index: a non-positive bucketRange, and a
 * row count below startRow.
 *
 * @param startRow    row count at which bucket 0 starts
 * @param bucketRange width of one bucket in rows
 * @param numOfRows   row count of the block
 * @return int32_t    bucket index, never negative
 */
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows) {
  if (bucketRange <= 0 || numOfRows <= startRow) {
    return 0;
  }

  return (numOfRows - startRow) / bucketRange;
}
H
Hongze Cheng 已提交
2951

2952 2953 2954 2955
/**
 * @brief Collect block distribution statistics over all file blocks visible to the reader.
 *
 * totalSize and totalRows are reset and the configured min/max rows are copied from the vnode's tsdb
 * config. The blocks of the current file set are then visited one by one; when a file set is
 * exhausted, initForFirstBlockInFile() moves on to the next one. Iteration stops when that call fails
 * or when the reader no longer loads from files. For every block the row total, min/max rows, the
 * small-block counter (fewer than 4096 rows) and the row histogram are updated.
 *
 * @param pReader         reader positioned by tsdbReaderOpen()
 * @param pTableBlockInfo statistics to update; the counters not reset above are accumulated into
 * @return int32_t        TSDB_CODE_SUCCESS or the error from initForFirstBlockInFile()
 */
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  int32_t code = TSDB_CODE_SUCCESS;
  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;

  SReaderStatus* pState = &pReader->status;

  STsdbCfg* pCfg = &pReader->pTsdb->pVnode->config.tsdbCfg;
  pTableBlockInfo->defMinRows = pCfg->minRows;
  pTableBlockInfo->defMaxRows = pCfg->maxRows;

  // the configured row range is split into 20 buckets
  int32_t bucketRange = ceil((pCfg->maxRows - pCfg->minRows) / 20.0);

  pTableBlockInfo->numOfFiles += 1;

  int32_t numOfTables = (int32_t)taosHashGetSize(pState->pTableMap);
  int     smallBlockLimit = 4096;

  SDataBlockIter* pIter = &pState->blockIter;
  pTableBlockInfo->numOfFiles += pState->fileIter.numOfFiles;

  if (pIter->numOfBlocks > 0) {
    pTableBlockInfo->numOfBlocks += pIter->numOfBlocks;
  }

  pTableBlockInfo->numOfTables = numOfTables;
  bool moreBlocks = (pIter->numOfBlocks > 0);

  for (;;) {
    if (!moreBlocks) {
      // current file set is exhausted, move to the next one
      code = initForFirstBlockInFile(pReader, pIter);
      if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
        break;
      }

      pTableBlockInfo->numOfBlocks += pIter->numOfBlocks;
      moreBlocks = (pIter->numOfBlocks > 0);
      continue;
    }

    SBlock* pCurrent = getCurrentBlock(pIter);
    int32_t rowsInBlock = pCurrent->nRow;

    pTableBlockInfo->totalRows += rowsInBlock;

    if (rowsInBlock > pTableBlockInfo->maxRows) {
      pTableBlockInfo->maxRows = rowsInBlock;
    }

    if (rowsInBlock < pTableBlockInfo->minRows) {
      pTableBlockInfo->minRows = rowsInBlock;
    }

    if (rowsInBlock < smallBlockLimit) {
      pTableBlockInfo->numOfSmallBlocks += 1;
    }

    int32_t bucket = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, rowsInBlock);
    pTableBlockInfo->blockRowsHisto[bucket]++;

    moreBlocks = blockIteratorNext(&pState->blockIter);
  }

  return code;
}
H
Hongze Cheng 已提交
3020

H
refact  
Hongze Cheng 已提交
3021
/**
 * @brief Count the rows that the reader's tables hold in the snapshot's mem and imem tables.
 *
 * Every entry of the table map is visited; pStatus->pTableIter is used as the cursor and is NULL when
 * the function returns.
 *
 * The NULL checks now test the snapshot's own pMem/pIMem pointers, which are the ones that get
 * dereferenced. Previously the live pTsdb->mem/imem pointers were tested instead.
 *
 * @param pReader reader with a read snapshot taken
 * @return int64_t total number of buffered rows
 */
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
  int64_t rows = 0;

  SReaderStatus* pStatus = &pReader->status;
  STsdbReadSnap* pSnap = pReader->pReadSnap;

  pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);

  while (pStatus->pTableIter != NULL) {
    STableBlockScanInfo* pBlockScanInfo = pStatus->pTableIter;

    STbData* d = NULL;
    if (pSnap->pMem != NULL) {
      tsdbGetTbDataFromMemTable(pSnap->pMem, pReader->suid, pBlockScanInfo->uid, &d);
      if (d != NULL) {
        rows += tsdbGetNRowsInTbData(d);
      }
    }

    STbData* di = NULL;
    if (pSnap->pIMem != NULL) {
      tsdbGetTbDataFromMemTable(pSnap->pIMem, pReader->suid, pBlockScanInfo->uid, &di);
      if (di != NULL) {
        rows += tsdbGetNRowsInTbData(di);
      }
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
  }

  return rows;
}
D
dapan1121 已提交
3052

L
Liu Jicong 已提交
3053
/**
 * @brief Look up the schema of a table and, for child tables, the uid of its super table.
 *
 * The table entry is read by uid. For a child table the super table entry is read as well and its
 * schema version is used; for any other table the normal-table schema version is used (asserted to be
 * TSDB_NORMAL_TABLE). The schema is then fetched with metaGetTbTSchema().
 *
 * @param pVnode  vnode
 * @param uid     table uid
 * @param pSchema output: result of metaGetTbTSchema()
 * @param suid    output: super table uid for child tables, 0 otherwise
 * @return int32_t TSDB_CODE_SUCCESS, or terrno (set to TSDB_CODE_TDB_INVALID_TABLE_ID) when a table
 *                 entry cannot be read
 */
int32_t tsdbGetTableSchema(SVnode* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
  int32_t     schemaVer = 1;
  SMetaReader reader = {0};

  metaReaderInit(&reader, pVnode->pMeta, 0);
  if (metaGetTableEntryByUid(&reader, uid) != TSDB_CODE_SUCCESS) {
    goto _invalid_table;
  }

  *suid = 0;

  if (mr_is_child: 0) {
  }
  if (reader.me.type != TSDB_CHILD_TABLE) {
    ASSERT(reader.me.type == TSDB_NORMAL_TABLE);
    schemaVer = reader.me.ntbEntry.schemaRow.version;
  } else {
    tDecoderClear(&reader.coder);
    *suid = reader.me.ctbEntry.suid;
    if (metaGetTableEntryByUid(&reader, *suid) != TSDB_CODE_SUCCESS) {
      goto _invalid_table;
    }
    schemaVer = reader.me.stbEntry.schemaRow.version;
  }

  metaReaderClear(&reader);
  *pSchema = metaGetTbTSchema(pVnode->pMeta, uid, schemaVer);

  return TSDB_CODE_SUCCESS;

_invalid_table:
  terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
  metaReaderClear(&reader);
  return terrno;
}
H
Hongze Cheng 已提交
3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116

/**
 * @brief Take a read snapshot of the tsdb: its mem/imem tables and its file system state.
 *
 * Under the tsdb read lock the current mem and imem tables are recorded and referenced, and the file
 * system is referenced through tsdbFSRef().
 *
 * When the lock cannot be taken or tsdbFSRef() fails, everything acquired so far is released and
 * *ppSnap is set to NULL, so the caller never sees a half-built snapshot. Previously the allocation
 * and the memtable references were left behind. A failure of the final unlock leaves the completed
 * snapshot in *ppSnap.
 *
 * @param pTsdb  tsdb instance
 * @param ppSnap output: the snapshot, or NULL when allocation, locking or tsdbFSRef() fails
 * @return int32_t 0 on success, otherwise an error code
 */
int32_t tsdbTakeReadSnap(STsdb* pTsdb, STsdbReadSnap** ppSnap) {
  int32_t code = 0;

  // alloc
  STsdbReadSnap* pSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(STsdbReadSnap));
  *ppSnap = pSnap;
  if (pSnap == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _exit;
  }

  // lock
  code = taosThreadRwlockRdlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    taosMemoryFree(pSnap);
    *ppSnap = NULL;
    goto _exit;
  }

  // take snapshot
  pSnap->pMem = pTsdb->mem;
  pSnap->pIMem = pTsdb->imem;

  if (pSnap->pMem) {
    tsdbRefMemTable(pSnap->pMem);
  }

  if (pSnap->pIMem) {
    tsdbRefMemTable(pSnap->pIMem);
  }

  // fs
  code = tsdbFSRef(pTsdb, &pSnap->fs);
  if (code) {
    taosThreadRwlockUnlock(&pTsdb->rwLock);

    // roll back the memtable references taken above
    if (pSnap->pMem) {
      tsdbUnrefMemTable(pSnap->pMem);
    }

    if (pSnap->pIMem) {
      tsdbUnrefMemTable(pSnap->pIMem);
    }

    taosMemoryFree(pSnap);
    *ppSnap = NULL;
    goto _exit;
  }

  // unlock
  code = taosThreadRwlockUnlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _exit;
  }

  tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode));
_exit:
  return code;
}

/**
 * @brief Release a read snapshot obtained from tsdbTakeReadSnap().
 *
 * Drops the references on the snapshot's mem and imem tables (when present) and on the file system
 * state, then frees the snapshot. A trace line is written even when pSnap is NULL.
 *
 * @param pTsdb tsdb instance
 * @param pSnap snapshot to release, may be NULL
 */
void tsdbUntakeReadSnap(STsdb* pTsdb, STsdbReadSnap* pSnap) {
  if (pSnap != NULL) {
    if (pSnap->pMem != NULL) {
      tsdbUnrefMemTable(pSnap->pMem);
    }

    if (pSnap->pIMem != NULL) {
      tsdbUnrefMemTable(pSnap->pIMem);
    }

    tsdbFSUnref(pTsdb, &pSnap->fs);
    taosMemoryFree(pSnap);
  }

  tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
}