tsdbRead.c 104.5 KB
Newer Older
H
hjxilinx 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Hongze Cheng 已提交
16
#include "tsdb.h"
17
#define ASCENDING_TRAVERSE(o) (o == TSDB_ORDER_ASC)
H
Hongze Cheng 已提交
18

19 20 21 22 23 24
typedef enum {
  EXTERNAL_ROWS_PREV = 0x1,
  EXTERNAL_ROWS_MAIN = 0x2,
  EXTERNAL_ROWS_NEXT = 0x3,
} EContentData;

25
typedef struct {
dengyihao's avatar
dengyihao 已提交
26
  STbDataIter* iter;
27 28 29 30
  int32_t      index;
  bool         hasVal;
} SIterInfo;

H
Haojun Liao 已提交
31
typedef struct STableBlockScanInfo {
dengyihao's avatar
dengyihao 已提交
32 33
  uint64_t  uid;
  TSKEY     lastKey;
34
  SMapData  mapData;     // block info (compressed)
dengyihao's avatar
dengyihao 已提交
35 36 37 38 39 40
  SArray*   pBlockList;  // block data index list
  SIterInfo iter;        // mem buffer skip list iterator
  SIterInfo iiter;       // imem buffer skip list iterator
  SArray*   delSkyline;  // delete info for this table
  int32_t   fileDelIndex;
  bool      iterInit;  // whether to initialize the in-memory skip list iterator or not
H
Haojun Liao 已提交
41 42 43
} STableBlockScanInfo;

typedef struct SBlockOrderWrapper {
dengyihao's avatar
dengyihao 已提交
44
  int64_t uid;
45
  int64_t offset;
H
Haojun Liao 已提交
46
} SBlockOrderWrapper;
H
Hongze Cheng 已提交
47 48

typedef struct SBlockOrderSupporter {
49 50 51 52
  SBlockOrderWrapper** pDataBlockInfo;
  int32_t*             indexPerTable;
  int32_t*             numOfBlocksPerTable;
  int32_t              numOfTables;
H
Hongze Cheng 已提交
53 54 55
} SBlockOrderSupporter;

typedef struct SIOCostSummary {
56 57 58
  int64_t numOfBlocks;
  double  blockLoadTime;
  double  buildmemBlock;
59
  int64_t headFileLoad;
60 61 62
  double  headFileLoadTime;
  int64_t smaData;
  double  smaLoadTime;
H
Hongze Cheng 已提交
63 64 65
} SIOCostSummary;

typedef struct SBlockLoadSuppInfo {
66
  SArray*          pColAgg;
67
  SColumnDataAgg   tsColAgg;
C
Cary Xu 已提交
68
  SColumnDataAgg** plist;
69 70
  int16_t*         colIds;    // column ids for loading file block data
  char**           buildBuf;  // build string tmp buffer, todo remove it later after all string format being updated.
H
Hongze Cheng 已提交
71 72
} SBlockLoadSuppInfo;

73
typedef struct SFilesetIter {
H
Hongze Cheng 已提交
74 75 76 77
  int32_t numOfFiles;  // number of total files
  int32_t index;       // current accessed index in the list
  SArray* pFileList;   // data file list
  int32_t order;
78
} SFilesetIter;
H
Haojun Liao 已提交
79 80

typedef struct SFileDataBlockInfo {
81
  // index position in STableBlockScanInfo in order to check whether neighbor block overlaps with it
dengyihao's avatar
dengyihao 已提交
82
  uint64_t uid;
83
  int32_t  tbBlockIdx;
H
Haojun Liao 已提交
84 85 86
} SFileDataBlockInfo;

typedef struct SDataBlockIter {
87 88 89 90 91
  int32_t   numOfBlocks;
  int32_t   index;
  SArray*   blockList;  // SArray<SFileDataBlockInfo>
  int32_t   order;
  SBlock    block;  // current SBlock data
92
  SHashObj* pTableMap;
H
Haojun Liao 已提交
93 94 95
} SDataBlockIter;

typedef struct SFileBlockDumpInfo {
dengyihao's avatar
dengyihao 已提交
96 97 98 99
  int32_t totalRows;
  int32_t rowIndex;
  int64_t lastKey;
  bool    allDumped;
H
Haojun Liao 已提交
100 101
} SFileBlockDumpInfo;

H
Haojun Liao 已提交
102
typedef struct SVersionRange {
dengyihao's avatar
dengyihao 已提交
103 104
  uint64_t minVer;
  uint64_t maxVer;
H
Haojun Liao 已提交
105 106
} SVersionRange;

H
Haojun Liao 已提交
107
typedef struct SReaderStatus {
dengyihao's avatar
dengyihao 已提交
108 109
  bool                 loadFromFile;  // check file stage
  SHashObj*            pTableMap;     // SHash<STableBlockScanInfo>
110
  STableBlockScanInfo* pTableIter;    // table iterator used in building in-memory buffer data blocks.
111
  SFileBlockDumpInfo   fBlockDumpInfo;
112 113 114 115 116
  SDFileSet*           pCurrentFileset;  // current opened file set
  SBlockData           fileBlockData;
  SFilesetIter         fileIter;
  SDataBlockIter       blockIter;
  bool                 composedDataBlock;  // the returned data block is a composed block or not
H
Haojun Liao 已提交
117 118
} SReaderStatus;

H
Hongze Cheng 已提交
119
struct STsdbReader {
H
Haojun Liao 已提交
120 121 122 123 124 125 126
  STsdb*             pTsdb;
  uint64_t           suid;
  int16_t            order;
  STimeWindow        window;  // the primary query time window that applies to all queries
  SSDataBlock*       pResBlock;
  int32_t            capacity;
  SReaderStatus      status;
127 128
  char*              idStr;  // query info handle, for debug purpose
  int32_t            type;   // query type: 1. retrieve all data blocks, 2. retrieve direct prev|next rows
H
Hongze Cheng 已提交
129
  SBlockLoadSuppInfo suppInfo;
H
Hongze Cheng 已提交
130
  STsdbReadSnap*     pReadSnap;
131 132 133 134
  SIOCostSummary     cost;
  STSchema*          pSchema;
  SDataFReader*      pFileReader;
  SVersionRange      verRange;
135

136 137
  int32_t      step;
  STsdbReader* innerReader[2];
H
Hongze Cheng 已提交
138
};
H
Hongze Cheng 已提交
139

H
Haojun Liao 已提交
140
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter);
141 142
static int      buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                          STsdbReader* pReader);
143
static TSDBROW* getValidRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader);
144 145
static int32_t  doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                        SRowMerger* pMerger);
146
static int32_t  doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
dengyihao's avatar
dengyihao 已提交
147
                                 STsdbReader* pReader);
148 149
static int32_t  doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow);
static int32_t  doAppendRowFromBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData, int32_t rowIndex);
150 151
static void     setComposedBlockFlag(STsdbReader* pReader, bool composed);
static void     updateSchema(TSDBROW* pRow, uint64_t uid, STsdbReader* pReader);
152
static bool     hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order);
153

dengyihao's avatar
dengyihao 已提交
154 155
static void doMergeMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, STSRow** pTSRow,
                             STsdbReader* pReader);
156 157
static void doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
                               STSRow** pTSRow);
dengyihao's avatar
dengyihao 已提交
158 159 160 161
static int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                                      STbData* piMemTbData);
static STsdb*  getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idstr,
                                   int8_t* pLevel);
162
static SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level);
H
Haojun Liao 已提交
163

164 165 166
static int32_t setColumnIdSlotList(STsdbReader* pReader, SSDataBlock* pBlock) {
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

167
  size_t numOfCols = blockDataGetNumOfCols(pBlock);
168

169
  pSupInfo->colIds = taosMemoryMalloc(numOfCols * sizeof(int16_t));
170
  pSupInfo->buildBuf = taosMemoryCalloc(numOfCols, POINTER_BYTES);
171 172 173
  if (pSupInfo->buildBuf == NULL || pSupInfo->colIds == NULL) {
    taosMemoryFree(pSupInfo->colIds);
    taosMemoryFree(pSupInfo->buildBuf);
H
Haojun Liao 已提交
174 175
    return TSDB_CODE_OUT_OF_MEMORY;
  }
H
Hongze Cheng 已提交
176

H
Haojun Liao 已提交
177 178
  for (int32_t i = 0; i < numOfCols; ++i) {
    SColumnInfoData* pCol = taosArrayGet(pBlock->pDataBlock, i);
179
    pSupInfo->colIds[i] = pCol->info.colId;
180 181 182 183

    if (IS_VAR_DATA_TYPE(pCol->info.type)) {
      pSupInfo->buildBuf[i] = taosMemoryMalloc(pCol->info.bytes);
    }
H
Haojun Liao 已提交
184
  }
H
Hongze Cheng 已提交
185

H
Haojun Liao 已提交
186 187
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
188

189
static SHashObj* createDataBlockScanInfo(STsdbReader* pTsdbReader, const STableKeyInfo* idList, int32_t numOfTables) {
H
Haojun Liao 已提交
190
  // allocate buffer in order to load data blocks from file
191
  // todo use simple hash instead, optimize the memory consumption
192 193 194
  SHashObj* pTableMap =
      taosHashInit(numOfTables, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), false, HASH_NO_LOCK);
  if (pTableMap == NULL) {
H
Haojun Liao 已提交
195 196 197
    return NULL;
  }

198 199 200 201 202
  for (int32_t j = 0; j < numOfTables; ++j) {
    STableBlockScanInfo info = {.lastKey = 0, .uid = idList[j].uid};
    if (ASCENDING_TRAVERSE(pTsdbReader->order)) {
      if (info.lastKey == INT64_MIN || info.lastKey < pTsdbReader->window.skey) {
        info.lastKey = pTsdbReader->window.skey;
H
Haojun Liao 已提交
203 204
      }

205
      ASSERT(info.lastKey >= pTsdbReader->window.skey && info.lastKey <= pTsdbReader->window.ekey);
wmmhello's avatar
wmmhello 已提交
206
    } else {
207
      info.lastKey = pTsdbReader->window.skey;
H
Haojun Liao 已提交
208
    }
wmmhello's avatar
wmmhello 已提交
209

210 211 212
    taosHashPut(pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info));
    tsdbDebug("%p check table uid:%" PRId64 " from lastKey:%" PRId64 " %s", pTsdbReader, info.uid, info.lastKey,
              pTsdbReader->idStr);
H
Haojun Liao 已提交
213 214
  }

215 216
  tsdbDebug("%p create %d tables scan-info, size:%.2f Kb, %s", pTsdbReader, numOfTables,
            (sizeof(STableBlockScanInfo) * numOfTables) / 1024.0, pTsdbReader->idStr);
217

218
  return pTableMap;
H
Hongze Cheng 已提交
219
}
H
Hongze Cheng 已提交
220

221 222 223
static void resetDataBlockScanInfo(SHashObj* pTableMap) {
  STableBlockScanInfo* p = NULL;

dengyihao's avatar
dengyihao 已提交
224
  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
225 226
    p->iterInit = false;
    p->iiter.hasVal = false;
dengyihao's avatar
dengyihao 已提交
227
    if (p->iter.iter != NULL) {
228
      p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
229 230
    }

231
    p->delSkyline = taosArrayDestroy(p->delSkyline);
232 233 234
  }
}

235 236 237 238 239 240 241 242
static void destroyBlockScanInfo(SHashObj* pTableMap) {
  STableBlockScanInfo* p = NULL;

  while ((p = taosHashIterate(pTableMap, p)) != NULL) {
    p->iterInit = false;
    p->iiter.hasVal = false;

    if (p->iter.iter != NULL) {
243
      p->iter.iter = tsdbTbDataIterDestroy(p->iter.iter);
244 245 246
    }

    if (p->iiter.iter != NULL) {
247
      p->iiter.iter = tsdbTbDataIterDestroy(p->iiter.iter);
248 249
    }

250 251
    p->delSkyline = taosArrayDestroy(p->delSkyline);
    p->pBlockList = taosArrayDestroy(p->pBlockList);
252
    tMapDataClear(&p->mapData);
253 254 255 256 257
  }

  taosHashCleanup(pTableMap);
}

258
static bool isEmptyQueryTimeWindow(STimeWindow* pWindow) {
259 260
  ASSERT(pWindow != NULL);
  return pWindow->skey > pWindow->ekey;
H
Haojun Liao 已提交
261
}
H
Hongze Cheng 已提交
262

263 264 265
// Update the query time window according to the data time to live(TTL) information, in order to avoid to return
// the expired data to client, even it is queried already.
static STimeWindow updateQueryTimeWindow(STsdb* pTsdb, STimeWindow* pWindow) {
dengyihao's avatar
dengyihao 已提交
266
  STsdbKeepCfg* pCfg = &pTsdb->keepCfg;
H
Hongze Cheng 已提交
267

268
  int64_t now = taosGetTimestamp(pCfg->precision);
dengyihao's avatar
dengyihao 已提交
269
  int64_t earilyTs = now - (tsTickPerMin[pCfg->precision] * pCfg->keep2) + 1;  // needs to add one tick
270

dengyihao's avatar
dengyihao 已提交
271
  STimeWindow win = *pWindow;
272 273 274 275 276 277
  if (win.skey < earilyTs) {
    win.skey = earilyTs;
  }

  return win;
}
H
Hongze Cheng 已提交
278

H
Haojun Liao 已提交
279
static void limitOutputBufferSize(const SQueryTableDataCond* pCond, int32_t* capacity) {
H
Haojun Liao 已提交
280 281 282 283 284 285
  int32_t rowLen = 0;
  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    rowLen += pCond->colList[i].bytes;
  }

  // make sure the output SSDataBlock size be less than 2MB.
H
Haojun Liao 已提交
286 287 288
  const int32_t TWOMB = 2 * 1024 * 1024;
  if ((*capacity) * rowLen > TWOMB) {
    (*capacity) = TWOMB / rowLen;
H
Haojun Liao 已提交
289 290 291 292
  }
}

// init file iterator
H
Hongze Cheng 已提交
293 294
static int32_t initFilesetIterator(SFilesetIter* pIter, SArray* aDFileSet, int32_t order, const char* idstr) {
  size_t numOfFileset = taosArrayGetSize(aDFileSet);
295

296 297
  pIter->index = ASCENDING_TRAVERSE(order) ? -1 : numOfFileset;
  pIter->order = order;
H
Hongze Cheng 已提交
298
  pIter->pFileList = aDFileSet;
299
  pIter->numOfFiles = numOfFileset;
H
Haojun Liao 已提交
300

H
Haojun Liao 已提交
301
  tsdbDebug("init fileset iterator, total files:%d %s", pIter->numOfFiles, idstr);
H
Haojun Liao 已提交
302 303 304
  return TSDB_CODE_SUCCESS;
}

305
static bool filesetIteratorNext(SFilesetIter* pIter, STsdbReader* pReader) {
306 307
  bool    asc = ASCENDING_TRAVERSE(pIter->order);
  int32_t step = asc ? 1 : -1;
308 309 310
  pIter->index += step;

  if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
H
Haojun Liao 已提交
311 312 313 314 315
    return false;
  }

  // check file the time range of coverage
  STimeWindow win = {0};
H
Hongze Cheng 已提交
316

317
  while (1) {
H
Haojun Liao 已提交
318 319 320
    if (pReader->pFileReader != NULL) {
      tsdbDataFReaderClose(&pReader->pFileReader);
    }
321

322
    pReader->status.pCurrentFileset = (SDFileSet*)taosArrayGet(pIter->pFileList, pIter->index);
H
Haojun Liao 已提交
323

324 325 326 327
    int32_t code = tsdbDataFReaderOpen(&pReader->pFileReader, pReader->pTsdb, pReader->status.pCurrentFileset);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
H
Haojun Liao 已提交
328

329 330
    pReader->cost.headFileLoad += 1;

331 332 333 334 335 336 337 338 339 340 341 342
    int32_t fid = pReader->status.pCurrentFileset->fid;
    tsdbFidKeyRange(fid, pReader->pTsdb->keepCfg.days, pReader->pTsdb->keepCfg.precision, &win.skey, &win.ekey);

    // current file are no longer overlapped with query time window, ignore remain files
    if ((asc && win.skey > pReader->window.ekey) || (!asc && win.ekey < pReader->window.skey)) {
      tsdbDebug("%p remain files are not qualified for qrange:%" PRId64 "-%" PRId64 ", ignore, %s", pReader,
                pReader->window.skey, pReader->window.ekey, pReader->idStr);
      return false;
    }

    if ((asc && (win.ekey < pReader->window.skey)) || ((!asc) && (win.skey > pReader->window.ekey))) {
      pIter->index += step;
343 344 345
      if ((asc && pIter->index >= pIter->numOfFiles) || ((!asc) && pIter->index < 0)) {
        return false;
      }
346 347
      continue;
    }
C
Cary Xu 已提交
348

349
    tsdbDebug("%p file found fid:%d for qrange:%" PRId64 "-%" PRId64 ", %s", pReader, fid, pReader->window.skey,
350
              pReader->window.ekey, pReader->idStr);
351 352
    return true;
  }
353

354
_err:
H
Haojun Liao 已提交
355 356 357
  return false;
}

358
static void resetDataBlockIterator(SDataBlockIter* pIter, int32_t order, SHashObj* pTableMap) {
359 360
  pIter->order = order;
  pIter->index = -1;
H
Haojun Liao 已提交
361
  pIter->numOfBlocks = -1;
362 363 364 365 366
  if (pIter->blockList == NULL) {
    pIter->blockList = taosArrayInit(4, sizeof(SFileDataBlockInfo));
  } else {
    taosArrayClear(pIter->blockList);
  }
367
  pIter->pTableMap = pTableMap;
368 369
}

L
Liu Jicong 已提交
370
static void cleanupDataBlockIterator(SDataBlockIter* pIter) { taosArrayDestroy(pIter->blockList); }
H
Haojun Liao 已提交
371

H
Haojun Liao 已提交
372
static void initReaderStatus(SReaderStatus* pStatus) {
dengyihao's avatar
dengyihao 已提交
373 374
  pStatus->pTableIter = NULL;
  pStatus->loadFromFile = true;
H
Haojun Liao 已提交
375 376
}

377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399
static SSDataBlock* createResBlock(SQueryTableDataCond* pCond, int32_t capacity) {
  SSDataBlock* pResBlock = createDataBlock();
  if (pResBlock == NULL) {
    terrno = TSDB_CODE_OUT_OF_MEMORY;
    return NULL;
  }

  for (int32_t i = 0; i < pCond->numOfCols; ++i) {
    SColumnInfoData colInfo = {{0}, 0};
    colInfo.info = pCond->colList[i];
    blockDataAppendColInfo(pResBlock, &colInfo);
  }

  int32_t code = blockDataEnsureCapacity(pResBlock, capacity);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    taosMemoryFree(pResBlock);
    return NULL;
  }

  return pResBlock;
}

400 401
static int32_t tsdbReaderCreate(SVnode* pVnode, SQueryTableDataCond* pCond, STsdbReader** ppReader, int32_t capacity,
                                const char* idstr) {
H
Haojun Liao 已提交
402
  int32_t      code = 0;
403
  int8_t       level = 0;
H
Haojun Liao 已提交
404
  STsdbReader* pReader = (STsdbReader*)taosMemoryCalloc(1, sizeof(*pReader));
H
Hongze Cheng 已提交
405 406
  if (pReader == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
H
Haojun Liao 已提交
407
    goto _end;
H
Hongze Cheng 已提交
408 409
  }

H
Haojun Liao 已提交
410
  initReaderStatus(&pReader->status);
411

L
Liu Jicong 已提交
412
  pReader->pTsdb = getTsdbByRetentions(pVnode, pCond->twindows.skey, pVnode->config.tsdbCfg.retentions, idstr, &level);
dengyihao's avatar
dengyihao 已提交
413 414
  pReader->suid = pCond->suid;
  pReader->order = pCond->order;
415
  pReader->capacity = capacity;
dengyihao's avatar
dengyihao 已提交
416 417
  pReader->idStr = (idstr != NULL) ? strdup(idstr) : NULL;
  pReader->verRange = getQueryVerRange(pVnode, pCond, level);
418
  pReader->type = pCond->type;
419
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
420

421
  ASSERT(pCond->numOfCols > 0);
H
Hongze Cheng 已提交
422

423
  limitOutputBufferSize(pCond, &pReader->capacity);
424

425 426
  // allocate buffer in order to load data blocks from file
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;
427
  pSup->pColAgg = taosArrayInit(4, sizeof(SColumnDataAgg));
428
  pSup->plist = taosMemoryCalloc(pCond->numOfCols, POINTER_BYTES);
429
  if (pSup->pColAgg == NULL || pSup->plist == NULL) {
430 431 432
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _end;
  }
H
Haojun Liao 已提交
433

434 435
  pSup->tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;

H
Haojun Liao 已提交
436 437 438 439 440 441
  code = tBlockDataInit(&pReader->status.fileBlockData);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = code;
    goto _end;
  }

442 443 444 445
  pReader->pResBlock = createResBlock(pCond, pReader->capacity);
  if (pReader->pResBlock == NULL) {
    code = terrno;
    goto _end;
H
Hongze Cheng 已提交
446
  }
H
Hongze Cheng 已提交
447

448 449
  setColumnIdSlotList(pReader, pReader->pResBlock);

H
Hongze Cheng 已提交
450 451
  *ppReader = pReader;
  return code;
H
Hongze Cheng 已提交
452

H
Haojun Liao 已提交
453 454
_end:
  tsdbReaderClose(pReader);
H
Hongze Cheng 已提交
455 456 457
  *ppReader = NULL;
  return code;
}
H
Hongze Cheng 已提交
458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490

// void tsdbResetQueryHandleForNewTable(STsdbReader* queryHandle, SQueryTableDataCond* pCond, STableListInfo* tableList,
//                                      int32_t tWinIdx) {
//   STsdbReader* pTsdbReadHandle = queryHandle;

//   pTsdbReadHandle->order = pCond->order;
//   pTsdbReadHandle->window = pCond->twindows[tWinIdx];
//   pTsdbReadHandle->type = TSDB_QUERY_TYPE_ALL;
//   pTsdbReadHandle->cur.fid = -1;
//   pTsdbReadHandle->cur.win = TSWINDOW_INITIALIZER;
//   pTsdbReadHandle->checkFiles = true;
//   pTsdbReadHandle->activeIndex = 0;  // current active table index
//   pTsdbReadHandle->locateStart = false;
//   pTsdbReadHandle->loadExternalRow = pCond->loadExternalRows;

//   if (ASCENDING_TRAVERSE(pCond->order)) {
//     assert(pTsdbReadHandle->window.skey <= pTsdbReadHandle->window.ekey);
//   } else {
//     assert(pTsdbReadHandle->window.skey >= pTsdbReadHandle->window.ekey);
//   }

//   // allocate buffer in order to load data blocks from file
//   memset(pTsdbReadHandle->suppInfo.pstatis, 0, sizeof(SColumnDataAgg));
//   memset(pTsdbReadHandle->suppInfo.plist, 0, POINTER_BYTES);

//   tsdbInitDataBlockLoadInfo(&pTsdbReadHandle->dataBlockLoadInfo);
//   tsdbInitCompBlockLoadInfo(&pTsdbReadHandle->compBlockLoadInfo);

//   SArray* pTable = NULL;
//   //  STsdbMeta* pMeta = tsdbGetMeta(pTsdbReadHandle->pTsdb);

//   //  pTsdbReadHandle->pTableCheckInfo = destroyTableCheckInfo(pTsdbReadHandle->pTableCheckInfo);

H
Haojun Liao 已提交
491
//   pTsdbReadHandle->pTableCheckInfo = NULL;  // createDataBlockScanInfo(pTsdbReadHandle, groupList, pMeta,
H
Hongze Cheng 已提交
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514
//                                             // &pTable);
//   if (pTsdbReadHandle->pTableCheckInfo == NULL) {
//     //    tsdbReaderClose(pTsdbReadHandle);
//     terrno = TSDB_CODE_TDB_OUT_OF_MEMORY;
//   }

//   //  pTsdbReadHandle->prev = doFreeColumnInfoData(pTsdbReadHandle->prev);
//   //  pTsdbReadHandle->next = doFreeColumnInfoData(pTsdbReadHandle->next);
// }

// SArray* tsdbGetQueriedTableList(STsdbReader** pHandle) {
//   assert(pHandle != NULL);

//   STsdbReader* pTsdbReadHandle = (STsdbReader*)pHandle;

//   size_t  size = taosArrayGetSize(pTsdbReadHandle->pTableCheckInfo);
//   SArray* res = taosArrayInit(size, POINTER_BYTES);
//   return res;
// }

// static int32_t binarySearchForBlock(SBlock* pBlock, int32_t numOfBlocks, TSKEY skey, int32_t order) {
//   int32_t firstSlot = 0;
//   int32_t lastSlot = numOfBlocks - 1;
H
Hongze Cheng 已提交
515

H
Hongze Cheng 已提交
516
//   int32_t midSlot = firstSlot;
H
Hongze Cheng 已提交
517

H
Hongze Cheng 已提交
518 519 520
//   while (1) {
//     numOfBlocks = lastSlot - firstSlot + 1;
//     midSlot = (firstSlot + (numOfBlocks >> 1));
H
Hongze Cheng 已提交
521

H
Hongze Cheng 已提交
522
//     if (numOfBlocks == 1) break;
H
Hongze Cheng 已提交
523

H
Hongze Cheng 已提交
524 525 526 527 528 529 530 531 532 533 534
//     if (skey > pBlock[midSlot].maxKey.ts) {
//       if (numOfBlocks == 2) break;
//       if ((order == TSDB_ORDER_DESC) && (skey < pBlock[midSlot + 1].minKey.ts)) break;
//       firstSlot = midSlot + 1;
//     } else if (skey < pBlock[midSlot].minKey.ts) {
//       if ((order == TSDB_ORDER_ASC) && (skey > pBlock[midSlot - 1].maxKey.ts)) break;
//       lastSlot = midSlot - 1;
//     } else {
//       break;  // got the slot
//     }
//   }
H
Hongze Cheng 已提交
535

H
Hongze Cheng 已提交
536 537
//   return midSlot;
// }
H
Hongze Cheng 已提交
538

H
Haojun Liao 已提交
539
static int32_t doLoadBlockIndex(STsdbReader* pReader, SDataFReader* pFileReader, SArray* pIndexList) {
540
  SArray* aBlockIdx = taosArrayInit(8, sizeof(SBlockIdx));
H
Hongze Cheng 已提交
541

542
  int64_t st = taosGetTimestampUs();
543
  int32_t code = tsdbReadBlockIdx(pFileReader, aBlockIdx, NULL);
H
Haojun Liao 已提交
544
  if (code != TSDB_CODE_SUCCESS) {
545
    goto _end;
H
Haojun Liao 已提交
546
  }
H
Hongze Cheng 已提交
547

548 549
  size_t num = taosArrayGetSize(aBlockIdx);
  if (num == 0) {
H
Hongze Cheng 已提交
550
    taosArrayClear(aBlockIdx);
H
Haojun Liao 已提交
551 552
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
553

554 555 556 557
  int64_t et1 = taosGetTimestampUs();

  SBlockIdx* pBlockIdx = NULL;
  for (int32_t i = 0; i < num; ++i) {
558
    pBlockIdx = (SBlockIdx*)taosArrayGet(aBlockIdx, i);
H
Haojun Liao 已提交
559

560
    // uid check
H
Hongze Cheng 已提交
561
    if (pBlockIdx->suid != pReader->suid) {
H
Haojun Liao 已提交
562 563 564 565
      continue;
    }

    // this block belongs to a table that is not queried.
H
Hongze Cheng 已提交
566
    void* p = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(uint64_t));
H
Haojun Liao 已提交
567 568 569 570 571 572
    if (p == NULL) {
      continue;
    }

    STableBlockScanInfo* pScanInfo = p;
    if (pScanInfo->pBlockList == NULL) {
573
      pScanInfo->pBlockList = taosArrayInit(4, sizeof(int32_t));
H
Haojun Liao 已提交
574 575
    }

H
Hongze Cheng 已提交
576
    taosArrayPush(pIndexList, pBlockIdx);
H
Haojun Liao 已提交
577
  }
H
Hongze Cheng 已提交
578

579
  int64_t et2 = taosGetTimestampUs();
580
  tsdbDebug("load block index for %d tables completed, elapsed time:%.2f ms, set blockIdx:%.2f ms, size:%.2f Kb %s",
581
            (int32_t)num, (et1 - st) / 1000.0, (et2 - et1) / 1000.0, num * sizeof(SBlockIdx) / 1024.0, pReader->idStr);
582 583 584

  pReader->cost.headFileLoadTime += (et1 - st) / 1000.0;

585
_end:
H
Hongze Cheng 已提交
586
  taosArrayDestroy(aBlockIdx);
H
Haojun Liao 已提交
587 588
  return code;
}
H
Hongze Cheng 已提交
589

590 591
static int32_t doLoadFileBlock(STsdbReader* pReader, SArray* pIndexList, uint32_t* numOfValidTables,
                               int32_t* numOfBlocks) {
H
Haojun Liao 已提交
592 593
  size_t numOfTables = taosArrayGetSize(pIndexList);
  *numOfValidTables = 0;
H
Hongze Cheng 已提交
594

595
  int64_t st = taosGetTimestampUs();
596
  size_t  size = 0;
597

598
  STableBlockScanInfo* px = NULL;
dengyihao's avatar
dengyihao 已提交
599
  while (1) {
600 601 602 603 604
    px = taosHashIterate(pReader->status.pTableMap, px);
    if (px == NULL) {
      break;
    }

605
    tMapDataClear(&px->mapData);
606 607 608
    taosArrayClear(px->pBlockList);
  }

dengyihao's avatar
dengyihao 已提交
609
  for (int32_t i = 0; i < numOfTables; ++i) {
H
Haojun Liao 已提交
610
    SBlockIdx* pBlockIdx = taosArrayGet(pIndexList, i);
H
Hongze Cheng 已提交
611

612
    STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pBlockIdx->uid, sizeof(int64_t));
H
Hongze Cheng 已提交
613

614 615
    tMapDataReset(&pScanInfo->mapData);
    tsdbReadBlock(pReader->pFileReader, pBlockIdx, &pScanInfo->mapData, NULL);
616

617 618
    size += pScanInfo->mapData.nData;
    for (int32_t j = 0; j < pScanInfo->mapData.nItem; ++j) {
H
Haojun Liao 已提交
619
      SBlock block = {0};
620
      tMapDataGetItemByIdx(&pScanInfo->mapData, j, &block, tGetBlock);
H
Hongze Cheng 已提交
621

622
      // 1. time range check
623
      if (block.minKey.ts > pReader->window.ekey || block.maxKey.ts < pReader->window.skey) {
H
Haojun Liao 已提交
624 625
        continue;
      }
H
Hongze Cheng 已提交
626

627
      // 2. version range check
628 629 630
      if (block.minVersion > pReader->verRange.maxVer || block.maxVersion < pReader->verRange.minVer) {
        continue;
      }
631

632
      void* p = taosArrayPush(pScanInfo->pBlockList, &j);
H
Haojun Liao 已提交
633
      if (p == NULL) {
634
        tMapDataClear(&pScanInfo->mapData);
H
Haojun Liao 已提交
635 636
        return TSDB_CODE_OUT_OF_MEMORY;
      }
637 638

      (*numOfBlocks) += 1;
H
Haojun Liao 已提交
639
    }
H
Hongze Cheng 已提交
640

H
Haojun Liao 已提交
641 642 643 644
    if (pScanInfo->pBlockList != NULL && taosArrayGetSize(pScanInfo->pBlockList) > 0) {
      (*numOfValidTables) += 1;
    }
  }
H
Hongze Cheng 已提交
645

646
  double el = (taosGetTimestampUs() - st) / 1000.0;
647
  tsdbDebug("load block of %d tables completed, blocks:%d in %d tables, size:%.2f Kb, elapsed time:%.2f ms %s",
648
            numOfTables, *numOfBlocks, *numOfValidTables, size / 1000.0, el, pReader->idStr);
649 650 651

  pReader->cost.numOfBlocks += (*numOfBlocks);
  pReader->cost.headFileLoadTime += el;
652

H
Haojun Liao 已提交
653 654
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
655

656 657
// todo remove pblock parameter
static void setBlockAllDumped(SFileBlockDumpInfo* pDumpInfo, SBlock* pBlock, int32_t order) {
658
  int32_t step = ASCENDING_TRAVERSE(order) ? 1 : -1;
H
Haojun Liao 已提交
659

660
  pDumpInfo->allDumped = true;
661
  pDumpInfo->lastKey = pBlock->maxKey.ts + step;
H
Haojun Liao 已提交
662 663
}

664 665
static void doCopyColVal(SColumnInfoData* pColInfoData, int32_t rowIndex, int32_t colIndex, SColVal* pColVal,
                         SBlockLoadSuppInfo* pSup) {
H
Haojun Liao 已提交
666
  if (IS_VAR_DATA_TYPE(pColVal->type)) {
667
    if (pColVal->isNull || pColVal->isNone) {
H
Haojun Liao 已提交
668 669 670 671 672 673 674
      colDataAppendNULL(pColInfoData, rowIndex);
    } else {
      varDataSetLen(pSup->buildBuf[colIndex], pColVal->value.nData);
      memcpy(varDataVal(pSup->buildBuf[colIndex]), pColVal->value.pData, pColVal->value.nData);
      colDataAppend(pColInfoData, rowIndex, pSup->buildBuf[colIndex], false);
    }
  } else {
675
    colDataAppend(pColInfoData, rowIndex, (const char*)&pColVal->value, pColVal->isNull || pColVal->isNone);
H
Haojun Liao 已提交
676
  }
H
Haojun Liao 已提交
677 678
}

679 680 681 682 683
static SFileDataBlockInfo* getCurrentBlockInfo(SDataBlockIter* pBlockIter) {
  SFileDataBlockInfo* pFBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
  return pFBlockInfo;
}

684
static SBlock* getCurrentBlock(SDataBlockIter* pBlockIter) { return &pBlockIter->block; }
685

686
static int32_t copyBlockDataToSDataBlock(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
687
  SReaderStatus*  pStatus = &pReader->status;
688
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
H
Hongze Cheng 已提交
689

690
  SBlockData*         pBlockData = &pStatus->fileBlockData;
H
Haojun Liao 已提交
691
  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
692
  SBlock*             pBlock = getCurrentBlock(pBlockIter);
H
Haojun Liao 已提交
693
  SSDataBlock*        pResBlock = pReader->pResBlock;
694
  int32_t             numOfOutputCols = blockDataGetNumOfCols(pResBlock);
H
Haojun Liao 已提交
695

H
Haojun Liao 已提交
696
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
697
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
H
Haojun Liao 已提交
698

H
Haojun Liao 已提交
699
  SColVal cv = {0};
700
  int64_t st = taosGetTimestampUs();
701 702
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
  int32_t step = asc ? 1 : -1;
703

704
  int32_t rowIndex = 0;
705 706
  int32_t remain = asc ? (pBlockData->nRow - pDumpInfo->rowIndex) : (pDumpInfo->rowIndex + 1);

707 708 709 710 711 712 713 714
  int32_t endIndex = 0;
  if (remain <= pReader->capacity) {
    endIndex = pBlockData->nRow;
  } else {
    endIndex = pDumpInfo->rowIndex + step * pReader->capacity;
    remain = pReader->capacity;
  }

715
  int32_t          i = 0;
716 717
  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
718
    for (int32_t j = pDumpInfo->rowIndex; j < endIndex && j >= 0; j += step) {
719 720 721 722 723
      colDataAppend(pColData, rowIndex++, (const char*)&pBlockData->aTSKEY[j], false);
    }
    i += 1;
  }

724 725 726
  int32_t colIndex = 0;
  int32_t num = taosArrayGetSize(pBlockData->aIdx);
  while (i < numOfOutputCols && colIndex < num) {
727 728 729
    rowIndex = 0;
    pColData = taosArrayGet(pResBlock->pDataBlock, i);

H
Hongze Cheng 已提交
730
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, colIndex);
731 732

    if (pData->cid == pColData->info.colId) {
733
      for (int32_t j = pDumpInfo->rowIndex; j < endIndex && j >= 0; j += step) {
734 735
        tColDataGetValue(pData, j, &cv);
        doCopyColVal(pColData, rowIndex++, i, &cv, pSupInfo);
H
Haojun Liao 已提交
736
      }
737
      colIndex += 1;
738
      ASSERT(rowIndex == remain);
739 740
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNNULL(pColData, 0, remain);
H
Haojun Liao 已提交
741
    }
742 743 744 745

    i += 1;
  }

746
  while (i < numOfOutputCols) {
747 748 749
    pColData = taosArrayGet(pResBlock->pDataBlock, i);
    colDataAppendNNULL(pColData, 0, remain);
    i += 1;
H
Haojun Liao 已提交
750
  }
H
Haojun Liao 已提交
751

752
  pResBlock->info.rows = remain;
753
  pDumpInfo->rowIndex += step * remain;
754 755

  setBlockAllDumped(pDumpInfo, pBlock, pReader->order);
H
Haojun Liao 已提交
756

757
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
H
Haojun Liao 已提交
758
  pReader->cost.blockLoadTime += elapsedTime;
H
Haojun Liao 已提交
759

760
  int32_t unDumpedRows = asc ? pBlock->nRow - pDumpInfo->rowIndex : pDumpInfo->rowIndex + 1;
H
Haojun Liao 已提交
761
  tsdbDebug("%p load file block into buffer, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
762
            ", rows:%d, remain:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
763 764 765 766 767 768
            pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, remain, unDumpedRows,
            pBlock->minVersion, pBlock->maxVersion, elapsedTime, pReader->idStr);

  return TSDB_CODE_SUCCESS;
}

769 770
static int32_t doLoadFileBlockData(STsdbReader* pReader, SDataBlockIter* pBlockIter,
                                   STableBlockScanInfo* pBlockScanInfo, SBlockData* pBlockData) {
771 772 773
  int64_t st = taosGetTimestampUs();

  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(pBlockIter);
774
  SBlock*             pBlock = getCurrentBlock(pBlockIter);
775

776 777
  SSDataBlock* pResBlock = pReader->pResBlock;
  int32_t      numOfCols = blockDataGetNumOfCols(pResBlock);
778 779 780 781

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

782 783 784
  SBlockIdx blockIdx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
  int32_t   code =
      tsdbReadColData(pReader->pFileReader, &blockIdx, pBlock, pSupInfo->colIds, numOfCols, pBlockData, NULL, NULL);
785 786 787 788
  if (code != TSDB_CODE_SUCCESS) {
    goto _error;
  }

789
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
790 791 792 793
  pReader->cost.blockLoadTime += elapsedTime;

  pDumpInfo->allDumped = false;
  tsdbDebug("%p load file block into buffer, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
794
            ", rows:%d, minVer:%" PRId64 ", maxVer:%" PRId64 ", elapsed time:%.2f ms, %s",
795
            pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
H
Haojun Liao 已提交
796
            pBlock->minVersion, pBlock->maxVersion, elapsedTime, pReader->idStr);
797

H
Haojun Liao 已提交
798
  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
799 800

_error:
H
Haojun Liao 已提交
801 802 803 804 805
  tsdbError("%p error occurs in loading file block, global index:%d, table index:%d, brange:%" PRId64 "-%" PRId64
            ", rows:%d, %s",
            pReader, pBlockIter->index, pFBlock->tbBlockIdx, pBlock->minKey.ts, pBlock->maxKey.ts, pBlock->nRow,
            pReader->idStr);
  return code;
H
Haojun Liao 已提交
806
}
H
Hongze Cheng 已提交
807

H
Haojun Liao 已提交
808 809 810
static void cleanupBlockOrderSupporter(SBlockOrderSupporter* pSup) {
  taosMemoryFreeClear(pSup->numOfBlocksPerTable);
  taosMemoryFreeClear(pSup->indexPerTable);
H
Hongze Cheng 已提交
811

H
Haojun Liao 已提交
812 813 814 815
  for (int32_t i = 0; i < pSup->numOfTables; ++i) {
    SBlockOrderWrapper* pBlockInfo = pSup->pDataBlockInfo[i];
    taosMemoryFreeClear(pBlockInfo);
  }
H
Hongze Cheng 已提交
816

H
Haojun Liao 已提交
817 818
  taosMemoryFreeClear(pSup->pDataBlockInfo);
}
H
Hongze Cheng 已提交
819

H
Haojun Liao 已提交
820 821
static int32_t initBlockOrderSupporter(SBlockOrderSupporter* pSup, int32_t numOfTables) {
  ASSERT(numOfTables >= 1);
H
Hongze Cheng 已提交
822

H
Haojun Liao 已提交
823
  pSup->numOfBlocksPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
824 825
  pSup->indexPerTable = taosMemoryCalloc(1, sizeof(int32_t) * numOfTables);
  pSup->pDataBlockInfo = taosMemoryCalloc(1, POINTER_BYTES * numOfTables);
H
Hongze Cheng 已提交
826

H
Haojun Liao 已提交
827 828 829 830
  if (pSup->numOfBlocksPerTable == NULL || pSup->indexPerTable == NULL || pSup->pDataBlockInfo == NULL) {
    cleanupBlockOrderSupporter(pSup);
    return TSDB_CODE_OUT_OF_MEMORY;
  }
H
Hongze Cheng 已提交
831

H
Haojun Liao 已提交
832 833
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
834

H
Haojun Liao 已提交
835
static int32_t fileDataBlockOrderCompar(const void* pLeft, const void* pRight, void* param) {
836
  int32_t leftIndex = *(int32_t*)pLeft;
H
Haojun Liao 已提交
837
  int32_t rightIndex = *(int32_t*)pRight;
H
Hongze Cheng 已提交
838

H
Haojun Liao 已提交
839
  SBlockOrderSupporter* pSupporter = (SBlockOrderSupporter*)param;
H
Hongze Cheng 已提交
840

H
Haojun Liao 已提交
841 842
  int32_t leftTableBlockIndex = pSupporter->indexPerTable[leftIndex];
  int32_t rightTableBlockIndex = pSupporter->indexPerTable[rightIndex];
H
Hongze Cheng 已提交
843

H
Haojun Liao 已提交
844 845 846 847 848 849 850
  if (leftTableBlockIndex > pSupporter->numOfBlocksPerTable[leftIndex]) {
    /* left block is empty */
    return 1;
  } else if (rightTableBlockIndex > pSupporter->numOfBlocksPerTable[rightIndex]) {
    /* right block is empty */
    return -1;
  }
H
Hongze Cheng 已提交
851

852
  SBlockOrderWrapper* pLeftBlock = &pSupporter->pDataBlockInfo[leftIndex][leftTableBlockIndex];
H
Haojun Liao 已提交
853
  SBlockOrderWrapper* pRightBlock = &pSupporter->pDataBlockInfo[rightIndex][rightTableBlockIndex];
H
Hongze Cheng 已提交
854

855 856 857 858
  return pLeftBlock->offset > pRightBlock->offset ? 1 : -1;
}

static int32_t doSetCurrentBlock(SDataBlockIter* pBlockIter) {
859
  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(pBlockIter);
860 861 862 863 864 865 866 867 868 869
  STableBlockScanInfo* pScanInfo = taosHashGet(pBlockIter->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));

  int32_t* mapDataIndex = taosArrayGet(pScanInfo->pBlockList, pFBlock->tbBlockIdx);
  tMapDataGetItemByIdx(&pScanInfo->mapData, *mapDataIndex, &pBlockIter->block, tGetBlock);

#if 0
  qDebug("check file block, table uid:%"PRIu64" index:%d offset:%"PRId64", ", pScanInfo->uid, *mapDataIndex, pBlockIter->block.aSubBlock[0].offset);
#endif

  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
870
}
H
Hongze Cheng 已提交
871

H
Haojun Liao 已提交
872
static int32_t initBlockIterator(STsdbReader* pReader, SDataBlockIter* pBlockIter, int32_t numOfBlocks) {
873
  bool asc = ASCENDING_TRAVERSE(pReader->order);
H
Haojun Liao 已提交
874

875
  pBlockIter->numOfBlocks = numOfBlocks;
876 877
  taosArrayClear(pBlockIter->blockList);

878 879
  // access data blocks according to the offset of each block in asc/desc order.
  int32_t numOfTables = (int32_t)taosHashGetSize(pReader->status.pTableMap);
H
Haojun Liao 已提交
880

881
  int64_t st = taosGetTimestampUs();
H
Haojun Liao 已提交
882

883
  SBlockOrderSupporter sup = {0};
884
  int32_t              code = initBlockOrderSupporter(&sup, numOfTables);
885 886 887
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }
H
Haojun Liao 已提交
888

889 890 891 892 893 894 895
  int32_t cnt = 0;
  void*   ptr = NULL;
  while (1) {
    ptr = taosHashIterate(pReader->status.pTableMap, ptr);
    if (ptr == NULL) {
      break;
    }
H
Haojun Liao 已提交
896

897 898 899 900
    STableBlockScanInfo* pTableScanInfo = (STableBlockScanInfo*)ptr;
    if (pTableScanInfo->pBlockList == NULL || taosArrayGetSize(pTableScanInfo->pBlockList) == 0) {
      continue;
    }
H
Haojun Liao 已提交
901

902 903
    size_t num = taosArrayGetSize(pTableScanInfo->pBlockList);
    sup.numOfBlocksPerTable[sup.numOfTables] = num;
H
Haojun Liao 已提交
904

905 906 907 908 909
    char* buf = taosMemoryMalloc(sizeof(SBlockOrderWrapper) * num);
    if (buf == NULL) {
      cleanupBlockOrderSupporter(&sup);
      return TSDB_CODE_TDB_OUT_OF_MEMORY;
    }
H
Haojun Liao 已提交
910

911
    sup.pDataBlockInfo[sup.numOfTables] = (SBlockOrderWrapper*)buf;
912
    SBlock block = {0};
913 914
    for (int32_t k = 0; k < num; ++k) {
      SBlockOrderWrapper wrapper = {0};
915 916 917 918

      int32_t* mapDataIndex = taosArrayGet(pTableScanInfo->pBlockList, k);
      tMapDataGetItemByIdx(&pTableScanInfo->mapData, *mapDataIndex, &block, tGetBlock);

919
      wrapper.uid = pTableScanInfo->uid;
920
      wrapper.offset = block.aSubBlock[0].offset;
H
Haojun Liao 已提交
921

922 923 924 925 926 927
      sup.pDataBlockInfo[sup.numOfTables][k] = wrapper;
      cnt++;
    }

    sup.numOfTables += 1;
  }
H
Haojun Liao 已提交
928

929
  ASSERT(numOfBlocks == cnt);
H
Haojun Liao 已提交
930

931 932 933 934 935
  // since there is only one table qualified, blocks are not sorted
  if (sup.numOfTables == 1) {
    for (int32_t i = 0; i < numOfBlocks; ++i) {
      SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[0][i].uid, .tbBlockIdx = i};
      taosArrayPush(pBlockIter->blockList, &blockInfo);
936
    }
937

938
    int64_t et = taosGetTimestampUs();
939 940
    tsdbDebug("%p create blocks info struct completed for one table, %d blocks not sorted, elapsed time:%.2f ms %s",
              pReader, cnt, (et - st) / 1000.0, pReader->idStr);
H
Haojun Liao 已提交
941

942
    pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
H
Haojun Liao 已提交
943
    cleanupBlockOrderSupporter(&sup);
944
    doSetCurrentBlock(pBlockIter);
945
    return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
946
  }
H
Haojun Liao 已提交
947

948 949
  tsdbDebug("%p create data blocks info struct completed, %d blocks in %d tables %s", pReader, cnt, sup.numOfTables,
            pReader->idStr);
950

951
  assert(cnt <= numOfBlocks && sup.numOfTables <= numOfTables);
H
Haojun Liao 已提交
952

953 954 955 956 957
  SMultiwayMergeTreeInfo* pTree = NULL;
  uint8_t                 ret = tMergeTreeCreate(&pTree, sup.numOfTables, &sup, fileDataBlockOrderCompar);
  if (ret != TSDB_CODE_SUCCESS) {
    cleanupBlockOrderSupporter(&sup);
    return TSDB_CODE_TDB_OUT_OF_MEMORY;
H
Haojun Liao 已提交
958
  }
H
Haojun Liao 已提交
959

960 961 962 963
  int32_t numOfTotal = 0;
  while (numOfTotal < cnt) {
    int32_t pos = tMergeTreeGetChosenIndex(pTree);
    int32_t index = sup.indexPerTable[pos]++;
H
Haojun Liao 已提交
964

965 966
    SFileDataBlockInfo blockInfo = {.uid = sup.pDataBlockInfo[pos][index].uid, .tbBlockIdx = index};
    taosArrayPush(pBlockIter->blockList, &blockInfo);
H
Haojun Liao 已提交
967

968 969 970 971
    // set data block index overflow, in order to disable the offset comparator
    if (sup.indexPerTable[pos] >= sup.numOfBlocksPerTable[pos]) {
      sup.indexPerTable[pos] = sup.numOfBlocksPerTable[pos] + 1;
    }
H
Haojun Liao 已提交
972

973 974
    numOfTotal += 1;
    tMergeTreeAdjust(pTree, tMergeTreeGetAdjustIndex(pTree));
H
Haojun Liao 已提交
975
  }
H
Haojun Liao 已提交
976

977
  int64_t et = taosGetTimestampUs();
978 979
  tsdbDebug("%p %d data blocks access order completed, elapsed time:%.2f ms %s", pReader, cnt, (et - st) / 1000.0,
            pReader->idStr);
980 981
  cleanupBlockOrderSupporter(&sup);
  taosMemoryFree(pTree);
H
Haojun Liao 已提交
982

983
  pBlockIter->index = asc ? 0 : (numOfBlocks - 1);
984 985
  doSetCurrentBlock(pBlockIter);

986
  return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
987
}
H
Hongze Cheng 已提交
988

H
Haojun Liao 已提交
989
static bool blockIteratorNext(SDataBlockIter* pBlockIter) {
990 991
  bool asc = ASCENDING_TRAVERSE(pBlockIter->order);

992
  int32_t step = asc ? 1 : -1;
993
  if ((pBlockIter->index >= pBlockIter->numOfBlocks - 1 && asc) || (pBlockIter->index <= 0 && (!asc))) {
994 995 996
    return false;
  }

997
  pBlockIter->index += step;
998 999
  doSetCurrentBlock(pBlockIter);

1000 1001 1002
  return true;
}

1003 1004 1005
/**
 * This is an two rectangles overlap cases.
 */
1006
static int32_t dataBlockPartiallyRequired(STimeWindow* pWindow, SVersionRange* pVerRange, SBlock* pBlock) {
1007 1008 1009 1010
  return (pWindow->ekey < pBlock->maxKey.ts && pWindow->ekey >= pBlock->minKey.ts) ||
         (pWindow->skey > pBlock->minKey.ts && pWindow->skey <= pBlock->maxKey.ts) ||
         (pVerRange->minVer > pBlock->minVersion && pVerRange->minVer <= pBlock->maxVersion) ||
         (pVerRange->maxVer < pBlock->maxVersion && pVerRange->maxVer >= pBlock->minVersion);
H
Haojun Liao 已提交
1011
}
H
Hongze Cheng 已提交
1012

1013 1014
static SBlock* getNeighborBlockOfSameTable(SFileDataBlockInfo* pFBlockInfo, STableBlockScanInfo* pTableBlockScanInfo,
                                           int32_t* nextIndex, int32_t order) {
1015 1016 1017
  bool asc = ASCENDING_TRAVERSE(order);
  if (asc && pFBlockInfo->tbBlockIdx >= taosArrayGetSize(pTableBlockScanInfo->pBlockList) - 1) {
    return NULL;
1018 1019
  }

1020
  if (!asc && pFBlockInfo->tbBlockIdx == 0) {
1021 1022 1023
    return NULL;
  }

1024
  int32_t step = asc ? 1 : -1;
1025
  *nextIndex = pFBlockInfo->tbBlockIdx + step;
1026

1027
  SBlock*  pBlock = taosMemoryCalloc(1, sizeof(SBlock));
1028 1029 1030 1031
  int32_t* indexInMapdata = taosArrayGet(pTableBlockScanInfo->pBlockList, *nextIndex);

  tMapDataGetItemByIdx(&pTableBlockScanInfo->mapData, *indexInMapdata, pBlock, tGetBlock);
  return pBlock;
1032 1033 1034 1035 1036
}

static int32_t findFileBlockInfoIndex(SDataBlockIter* pBlockIter, SFileDataBlockInfo* pFBlockInfo) {
  ASSERT(pBlockIter != NULL && pFBlockInfo != NULL);

1037
  int32_t step = ASCENDING_TRAVERSE(pBlockIter->order) ? 1 : -1;
1038 1039
  int32_t index = pBlockIter->index;

1040
  while (index < pBlockIter->numOfBlocks && index >= 0) {
1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
    SFileDataBlockInfo* pFBlock = taosArrayGet(pBlockIter->blockList, index);
    if (pFBlock->uid == pFBlockInfo->uid && pFBlock->tbBlockIdx == pFBlockInfo->tbBlockIdx) {
      return index;
    }

    index += step;
  }

  ASSERT(0);
  return -1;
}

1053
static int32_t setFileBlockActiveInBlockIter(SDataBlockIter* pBlockIter, int32_t index, int32_t step) {
1054 1055 1056 1057 1058
  if (index < 0 || index >= pBlockIter->numOfBlocks) {
    return -1;
  }

  SFileDataBlockInfo fblock = *(SFileDataBlockInfo*)taosArrayGet(pBlockIter->blockList, index);
1059 1060 1061 1062 1063
  pBlockIter->index += step;

  if (index != pBlockIter->index) {
    taosArrayRemove(pBlockIter->blockList, index);
    taosArrayInsert(pBlockIter->blockList, pBlockIter->index, &fblock);
1064

1065 1066 1067
    SFileDataBlockInfo* pBlockInfo = taosArrayGet(pBlockIter->blockList, pBlockIter->index);
    ASSERT(pBlockInfo->uid == fblock.uid && pBlockInfo->tbBlockIdx == fblock.tbBlockIdx);
  }
1068

1069
  doSetCurrentBlock(pBlockIter);
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
  return TSDB_CODE_SUCCESS;
}

static bool overlapWithNeighborBlock(SBlock* pBlock, SBlock* pNeighbor, int32_t order) {
  // it is the last block in current file, no chance to overlap with neighbor blocks.
  if (ASCENDING_TRAVERSE(order)) {
    return pBlock->maxKey.ts == pNeighbor->minKey.ts;
  } else {
    return pBlock->minKey.ts == pNeighbor->maxKey.ts;
  }
H
Haojun Liao 已提交
1080
}
H
Hongze Cheng 已提交
1081

1082
static bool bufferDataInFileBlockGap(int32_t order, TSDBKEY key, SBlock* pBlock) {
H
Haojun Liao 已提交
1083
  bool ascScan = ASCENDING_TRAVERSE(order);
H
Hongze Cheng 已提交
1084

1085
  return (ascScan && (key.ts != TSKEY_INITIAL_VAL && key.ts <= pBlock->minKey.ts)) ||
1086
         (!ascScan && (key.ts != TSKEY_INITIAL_VAL && key.ts >= pBlock->maxKey.ts));
H
Haojun Liao 已提交
1087
}
H
Hongze Cheng 已提交
1088

H
Haojun Liao 已提交
1089
static bool keyOverlapFileBlock(TSDBKEY key, SBlock* pBlock, SVersionRange* pVerRange) {
1090 1091
  return (key.ts >= pBlock->minKey.ts && key.ts <= pBlock->maxKey.ts) && (pBlock->maxVersion >= pVerRange->minVer) &&
         (pBlock->minVersion <= pVerRange->maxVer);
H
Haojun Liao 已提交
1092 1093
}

1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
static bool doCheckforDatablockOverlap(STableBlockScanInfo* pBlockScanInfo, const SBlock* pBlock) {
  size_t num = taosArrayGetSize(pBlockScanInfo->delSkyline);

  for (int32_t i = pBlockScanInfo->fileDelIndex; i < num; i += 1) {
    TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, i);
    if (p->ts >= pBlock->minKey.ts && p->ts <= pBlock->maxKey.ts) {
      if (p->version >= pBlock->minVersion) {
        return true;
      }
    } else if (p->ts < pBlock->minKey.ts) {  // p->ts < pBlock->minKey.ts
      if (p->version >= pBlock->minVersion) {
        if (i < num - 1) {
          TSDBKEY* pnext = taosArrayGet(pBlockScanInfo->delSkyline, i + 1);
          if (i + 1 == num - 1) {  // pnext is the last point
            if (pnext->ts >= pBlock->minKey.ts) {
              return true;
            }
          } else {
            if (pnext->ts >= pBlock->minKey.ts && pnext->version >= pBlock->minVersion) {
              return true;
            }
          }
        } else {  // it must be the last point
          ASSERT(p->version == 0);
        }
      }
    } else {  // (p->ts > pBlock->maxKey.ts) {
      return false;
    }
  }

  return false;
}

1128
static bool overlapWithDelSkyline(STableBlockScanInfo* pBlockScanInfo, const SBlock* pBlock, int32_t order) {
1129 1130 1131 1132
  if (pBlockScanInfo->delSkyline == NULL) {
    return false;
  }

1133
  // ts is not overlap
1134
  TSDBKEY* pFirst = taosArrayGet(pBlockScanInfo->delSkyline, 0);
L
Liu Jicong 已提交
1135
  TSDBKEY* pLast = taosArrayGetLast(pBlockScanInfo->delSkyline);
1136 1137 1138 1139 1140
  if (pBlock->minKey.ts > pLast->ts || pBlock->maxKey.ts < pFirst->ts) {
    return false;
  }

  // version is not overlap
1141 1142 1143 1144
  if (ASCENDING_TRAVERSE(order)) {
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock);
  } else {
    int32_t index = pBlockScanInfo->fileDelIndex;
1145
    while (1) {
1146 1147 1148 1149 1150
      TSDBKEY* p = taosArrayGet(pBlockScanInfo->delSkyline, index);
      if (p->ts > pBlock->minKey.ts && index > 0) {
        index -= 1;
      } else {  // find the first point that is smaller than the minKey.ts of dataBlock.
        break;
1151 1152 1153
      }
    }

1154 1155
    return doCheckforDatablockOverlap(pBlockScanInfo, pBlock);
  }
1156 1157
}

1158 1159 1160 1161
// 1. the version of all rows should be less than the endVersion
// 2. current block should not overlap with next neighbor block
// 3. current timestamp should not be overlap with each other
// 4. output buffer should be large enough to hold all rows in current block
1162
// 5. delete info should not overlap with current block data
1163 1164
static bool fileBlockShouldLoad(STsdbReader* pReader, SFileDataBlockInfo* pFBlock, SBlock* pBlock,
                                STableBlockScanInfo* pScanInfo, TSDBKEY key) {
1165 1166 1167
  int32_t neighborIndex = 0;
  SBlock* pNeighbor = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &neighborIndex, pReader->order);

1168
  // overlap with neighbor
1169 1170 1171
  bool overlapWithNeighbor = false;
  if (pNeighbor) {
    overlapWithNeighbor = overlapWithNeighborBlock(pBlock, pNeighbor, pReader->order);
1172
    taosMemoryFree(pNeighbor);
1173 1174
  }

1175
  // has duplicated ts of different version in this block
L
Liu Jicong 已提交
1176 1177
  bool hasDup = (pBlock->nSubBlock == 1) ? pBlock->hasDup : true;
  bool overlapWithDel = overlapWithDelSkyline(pScanInfo, pBlock, pReader->order);
1178

1179
  return (overlapWithNeighbor || hasDup || dataBlockPartiallyRequired(&pReader->window, &pReader->verRange, pBlock) ||
1180
          keyOverlapFileBlock(key, pBlock, &pReader->verRange) || (pBlock->nRow > pReader->capacity) || overlapWithDel);
H
Haojun Liao 已提交
1181 1182
}

1183
static int32_t buildDataBlockFromBuf(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, int64_t endKey) {
1184
  if (!(pBlockScanInfo->iiter.hasVal || pBlockScanInfo->iter.hasVal)) {
1185 1186
    return TSDB_CODE_SUCCESS;
  }
H
Haojun Liao 已提交
1187

1188 1189 1190
  SSDataBlock* pBlock = pReader->pResBlock;

  int64_t st = taosGetTimestampUs();
1191
  int32_t code = buildDataBlockFromBufImpl(pBlockScanInfo, endKey, pReader->capacity, pReader);
H
Haojun Liao 已提交
1192

1193
  blockDataUpdateTsWindow(pBlock, 0);
1194
  pBlock->info.uid = pBlockScanInfo->uid;
1195

1196
  setComposedBlockFlag(pReader, true);
1197

1198
  double elapsedTime = (taosGetTimestampUs() - st) / 1000.0;
1199 1200 1201 1202
  tsdbDebug("%p build data block from cache completed, elapsed time:%.2f ms, numOfRows:%d, brange: %" PRId64
            " - %" PRId64 " %s",
            pReader, elapsedTime, pBlock->info.rows, pBlock->info.window.skey, pBlock->info.window.ekey,
            pReader->idStr);
1203 1204

  pReader->cost.buildmemBlock += elapsedTime;
H
Haojun Liao 已提交
1205 1206 1207
  return code;
}

1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
static bool tryCopyDistinctRowFromFileBlock(STsdbReader* pReader, SBlockData* pBlockData, int64_t key, SFileBlockDumpInfo* pDumpInfo) {

  // opt version
  // 1. it is not a border point
  // 2. the direct next point is not an duplicated timestamp
  if ((pDumpInfo->rowIndex < pDumpInfo->totalRows - 1 && pReader->order == TSDB_ORDER_ASC) ||
      (pDumpInfo->rowIndex > 0 && pReader->order == TSDB_ORDER_DESC)) {
    int32_t step = pReader->order == TSDB_ORDER_ASC? 1:-1;

    int64_t nextKey = pBlockData->aTSKEY[pDumpInfo->rowIndex + step];
    if (nextKey != key) { // merge is not needed
      doAppendRowFromBlock(pReader->pResBlock, pReader, pBlockData, pDumpInfo->rowIndex);
      pDumpInfo->rowIndex += step;
      return true;
    }
  }

  return false;
}

1228
static int32_t doMergeBufAndFileRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo, TSDBROW* pRow,
H
Haojun Liao 已提交
1229
                                     SIterInfo* pIter, int64_t key) {
1230
  SRowMerger          merge = {0};
H
Haojun Liao 已提交
1231
  STSRow*             pTSRow = NULL;
1232 1233 1234 1235
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

  TSDBKEY k = TSDBROW_KEY(pRow);
1236
  TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
1237
  SArray* pDelList = pBlockScanInfo->delSkyline;
1238

1239 1240 1241
  // ascending order traverse
  if (ASCENDING_TRAVERSE(pReader->order)) {
    if (key < k.ts) {
1242 1243 1244 1245 1246 1247 1248 1249
      // imem & mem are all empty, only file exist
      if (tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo)) {
        return TSDB_CODE_SUCCESS;
      } else {
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
      }
1250
    } else if (k.ts < key) {  // k.ts < key
1251
      doMergeMultiRows(pRow, pBlockScanInfo->uid, pIter, pDelList, &pTSRow, pReader);
1252 1253 1254
    } else {  // k.ts == key, ascending order: file block ----> imem rows -----> mem rows
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
1255 1256

      tRowMerge(&merge, pRow);
1257
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1258 1259

      tRowMergerGetRow(&merge, &pTSRow);
1260
    }
1261 1262
  } else {  // descending order scan
    if (key < k.ts) {
1263
      doMergeMultiRows(pRow, pBlockScanInfo->uid, pIter, pDelList, &pTSRow, pReader);
1264
    } else if (k.ts < key) {
1265 1266 1267 1268 1269 1270 1271
      if (tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo)) {
        return TSDB_CODE_SUCCESS;
      } else {
        tRowMergerInit(&merge, &fRow, pReader->pSchema);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
      }
1272 1273 1274 1275
    } else {  // descending order: mem rows -----> imem rows ------> file block
      updateSchema(pRow, pBlockScanInfo->uid, pReader);

      tRowMergerInit(&merge, pRow, pReader->pSchema);
1276
      doMergeRowsInBuf(pIter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
1277 1278 1279 1280 1281 1282

      tRowMerge(&merge, &fRow);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);

      tRowMergerGetRow(&merge, &pTSRow);
    }
1283 1284
  }

1285
  tRowMergerClear(&merge);
1286
  doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
H
Haojun Liao 已提交
1287 1288

  taosMemoryFree(pTSRow);
1289 1290 1291
  return TSDB_CODE_SUCCESS;
}

1292 1293 1294 1295
static int32_t doMergeThreeLevelRows(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
  SRowMerger merge = {0};
  STSRow*    pTSRow = NULL;

1296 1297
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
dengyihao's avatar
dengyihao 已提交
1298
  SArray*             pDelList = pBlockScanInfo->delSkyline;
1299

1300 1301
  TSDBROW* pRow = getValidRow(&pBlockScanInfo->iter, pDelList, pReader);
  TSDBROW* piRow = getValidRow(&pBlockScanInfo->iiter, pDelList, pReader);
1302
  ASSERT(pRow != NULL && piRow != NULL);
H
Haojun Liao 已提交
1303

1304
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
H
Haojun Liao 已提交
1305

1306
  uint64_t uid = pBlockScanInfo->uid;
H
Haojun Liao 已提交
1307

1308 1309 1310 1311
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

  if (ASCENDING_TRAVERSE(pReader->order)) {
1312 1313
    // [1&2] key <= [k.ts && ik.ts]
    if (key <= k.ts && key <= ik.ts) {
1314 1315 1316
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
      tRowMergerInit(&merge, &fRow, pReader->pSchema);

1317
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
H
Haojun Liao 已提交
1318

1319 1320
      if (ik.ts == key) {
        tRowMerge(&merge, piRow);
1321
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1322 1323
      }

1324 1325
      if (k.ts == key) {
        tRowMerge(&merge, pRow);
1326
        doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1327 1328 1329
      }

      tRowMergerGetRow(&merge, &pTSRow);
1330
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1331
      return TSDB_CODE_SUCCESS;
1332
    } else {  // key > ik.ts || key > k.ts
1333 1334
      ASSERT(key != ik.ts);

1335
      // [3] ik.ts < key <= k.ts
1336
      // [4] ik.ts < k.ts <= key
1337
      if (ik.ts < k.ts) {
1338
        doMergeMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader);
1339
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1340 1341 1342
        return TSDB_CODE_SUCCESS;
      }

1343 1344
      // [5] k.ts < key   <= ik.ts
      // [6] k.ts < ik.ts <= key
1345
      if (k.ts < ik.ts) {
1346
        doMergeMultiRows(pRow, uid, &pBlockScanInfo->iter, pDelList, &pTSRow, pReader);
1347
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1348 1349 1350
        return TSDB_CODE_SUCCESS;
      }

1351
      // [7] k.ts == ik.ts < key
1352
      if (k.ts == ik.ts) {
1353 1354
        ASSERT(key > ik.ts && key > k.ts);

1355
        doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, &pTSRow);
1356
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1357 1358 1359
        return TSDB_CODE_SUCCESS;
      }
    }
1360 1361 1362 1363 1364 1365
  } else {  // descending order scan
    // [1/2] k.ts >= ik.ts && k.ts >= key
    if (k.ts >= ik.ts && k.ts >= key) {
      updateSchema(pRow, uid, pReader);

      tRowMergerInit(&merge, pRow, pReader->pSchema);
1366
      doMergeRowsInBuf(&pBlockScanInfo->iter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1367 1368 1369

      if (ik.ts == k.ts) {
        tRowMerge(&merge, piRow);
1370
        doMergeRowsInBuf(&pBlockScanInfo->iiter, uid, key, pBlockScanInfo->delSkyline, &merge, pReader);
1371 1372 1373 1374 1375 1376 1377 1378 1379
      }

      if (k.ts == key) {
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMerge(&merge, &fRow);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      }

      tRowMergerGetRow(&merge, &pTSRow);
1380
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1381 1382
      return TSDB_CODE_SUCCESS;
    } else {
1383
      ASSERT(ik.ts != k.ts);  // this case has been included in the previous if branch
1384 1385 1386 1387

      // [3] ik.ts > k.ts >= Key
      // [4] ik.ts > key >= k.ts
      if (ik.ts > key) {
1388
        doMergeMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader);
1389
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400
        return TSDB_CODE_SUCCESS;
      }

      // [5] key > ik.ts > k.ts
      // [6] key > k.ts > ik.ts
      if (key > ik.ts) {
        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMergerInit(&merge, &fRow, pReader->pSchema);

        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
1401
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1402 1403 1404 1405 1406
        return TSDB_CODE_SUCCESS;
      }

      //[7] key = ik.ts > k.ts
      if (key == ik.ts) {
1407
        doMergeMultiRows(piRow, uid, &pBlockScanInfo->iiter, pDelList, &pTSRow, pReader);
1408 1409 1410 1411 1412

        TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
        tRowMerge(&merge, &fRow);
        doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
        tRowMergerGetRow(&merge, &pTSRow);
1413
        doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1414 1415 1416 1417 1418 1419
        return TSDB_CODE_SUCCESS;
      }
    }
  }

  ASSERT(0);
S
Shengliang Guan 已提交
1420
  return -1;
1421 1422
}

dengyihao's avatar
dengyihao 已提交
1423 1424
static bool isValidFileBlockRow(SBlockData* pBlockData, SFileBlockDumpInfo* pDumpInfo,
                                STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435
  // check for version and time range
  int64_t ver = pBlockData->aVersion[pDumpInfo->rowIndex];
  if (ver > pReader->verRange.maxVer || ver < pReader->verRange.minVer) {
    return false;
  }

  int64_t ts = pBlockData->aTSKEY[pDumpInfo->rowIndex];
  if (ts > pReader->window.ekey || ts < pReader->window.skey) {
    return false;
  }

1436
  TSDBKEY k = {.ts = ts, .version = ver};
1437
  if (hasBeenDropped(pBlockScanInfo->delSkyline, &pBlockScanInfo->fileDelIndex, &k, pReader->order)) {
1438 1439 1440
    return false;
  }

1441 1442 1443
  return true;
}

1444
static bool outOfTimeWindow(int64_t ts, STimeWindow* pWindow) { return (ts > pWindow->ekey) || (ts < pWindow->skey); }
1445

1446 1447 1448 1449 1450
static int32_t buildComposedDataBlockImpl(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
  SBlockData*         pBlockData = &pReader->status.fileBlockData;

  int64_t  key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
1451 1452
  TSDBROW* pRow = getValidRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
1453

1454
  if (pBlockScanInfo->iter.hasVal && pBlockScanInfo->iiter.hasVal) {
1455
    return doMergeThreeLevelRows(pReader, pBlockScanInfo);
1456
  } else {
1457
    // imem + file
1458
    if (pBlockScanInfo->iiter.hasVal) {
H
Haojun Liao 已提交
1459
      return doMergeBufAndFileRows(pReader, pBlockScanInfo, piRow, &pBlockScanInfo->iiter, key);
1460 1461
    }

1462
    // mem + file
1463
    if (pBlockScanInfo->iter.hasVal) {
H
Haojun Liao 已提交
1464
      return doMergeBufAndFileRows(pReader, pBlockScanInfo, pRow, &pBlockScanInfo->iter, key);
H
Haojun Liao 已提交
1465
    }
1466

1467
    // imem & mem are all empty, only file exist
1468 1469 1470 1471
    if (tryCopyDistinctRowFromFileBlock(pReader, pBlockData, key, pDumpInfo)) {
      return TSDB_CODE_SUCCESS;
    } else {
      TSDBROW fRow = tsdbRowFromBlockData(pBlockData, pDumpInfo->rowIndex);
1472

1473 1474
      STSRow*    pTSRow = NULL;
      SRowMerger merge = {0};
H
Haojun Liao 已提交
1475

1476 1477 1478 1479
      tRowMergerInit(&merge, &fRow, pReader->pSchema);
      doMergeRowsInFileBlocks(pBlockData, pBlockScanInfo, pReader, &merge);
      tRowMergerGetRow(&merge, &pTSRow);
      doAppendRowFromTSRow(pReader->pResBlock, pReader, pTSRow);
1480

1481 1482 1483 1484
      taosMemoryFree(pTSRow);
      tRowMergerClear(&merge);
      return TSDB_CODE_SUCCESS;
    }
1485 1486 1487
  }
}

1488
static int32_t buildComposedDataBlock(STsdbReader* pReader, STableBlockScanInfo* pBlockScanInfo) {
1489 1490
  SSDataBlock* pResBlock = pReader->pResBlock;

1491
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
1492 1493
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
  int32_t             step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
1494

1495 1496
  int64_t st = taosGetTimestampUs();

1497
  while (1) {
1498 1499
    // todo check the validate of row in file block
    {
1500
      if (!isValidFileBlockRow(pBlockData, pDumpInfo, pBlockScanInfo, pReader)) {
1501 1502
        pDumpInfo->rowIndex += step;

1503
        SBlock* pBlock = getCurrentBlock(&pReader->status.blockIter);
1504 1505 1506 1507 1508 1509 1510 1511 1512
        if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
          setBlockAllDumped(pDumpInfo, pBlock, pReader->order);
          break;
        }

        continue;
      }
    }

1513
    buildComposedDataBlockImpl(pReader, pBlockScanInfo);
1514
    SBlock* pBlock = getCurrentBlock(&pReader->status.blockIter);
1515

1516 1517 1518 1519 1520 1521 1522 1523
    // currently loaded file data block is consumed
    if (pDumpInfo->rowIndex >= pBlock->nRow || pDumpInfo->rowIndex < 0) {
      setBlockAllDumped(pDumpInfo, pBlock, pReader->order);
      break;
    }

    if (pResBlock->info.rows >= pReader->capacity) {
      break;
1524 1525 1526 1527
    }
  }

  pResBlock->info.uid = pBlockScanInfo->uid;
1528 1529
  blockDataUpdateTsWindow(pResBlock, 0);

1530
  setComposedBlockFlag(pReader, true);
1531
  int64_t et = taosGetTimestampUs();
1532

1533 1534 1535 1536
  tsdbDebug("%p uid:%" PRIu64 ", composed data block created, brange:%" PRIu64 "-%" PRIu64
            " rows:%d, elapsed time:%.2f ms %s",
            pReader, pBlockScanInfo->uid, pResBlock->info.window.skey, pResBlock->info.window.ekey,
            pResBlock->info.rows, (et - st) / 1000.0, pReader->idStr);
1537

1538 1539 1540 1541 1542
  return TSDB_CODE_SUCCESS;
}

void setComposedBlockFlag(STsdbReader* pReader, bool composed) { pReader->status.composedDataBlock = composed; }

1543
static int32_t initMemDataIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader) {
1544 1545 1546 1547
  if (pBlockScanInfo->iterInit) {
    return TSDB_CODE_SUCCESS;
  }

1548
  int32_t code = TSDB_CODE_SUCCESS;
1549 1550 1551 1552 1553 1554 1555 1556 1557

  TSDBKEY startKey = {0};
  if (ASCENDING_TRAVERSE(pReader->order)) {
    startKey = (TSDBKEY){.ts = pReader->window.skey, .version = pReader->verRange.minVer};
  } else {
    startKey = (TSDBKEY){.ts = pReader->window.ekey, .version = pReader->verRange.maxVer};
  }

  int32_t backward = (!ASCENDING_TRAVERSE(pReader->order));
1558 1559

  STbData* d = NULL;
H
Hongze Cheng 已提交
1560 1561
  if (pReader->pReadSnap->pMem != NULL) {
    tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid, &d);
1562
    if (d != NULL) {
1563
      code = tsdbTbDataIterCreate(d, &startKey, backward, &pBlockScanInfo->iter.iter);
1564
      if (code == TSDB_CODE_SUCCESS) {
1565
        pBlockScanInfo->iter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iter.iter) != NULL);
1566

H
Haojun Liao 已提交
1567
        tsdbDebug("%p uid:%" PRId64 ", check data in mem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
1568 1569
                  "-%" PRId64 " %s",
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, d->minKey, d->maxKey, pReader->idStr);
1570
      } else {
1571 1572
        tsdbError("%p uid:%" PRId64 ", failed to create iterator for imem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
1573
        return code;
1574 1575
      }
    }
H
Haojun Liao 已提交
1576
  } else {
1577
    tsdbDebug("%p uid:%" PRId64 ", no data in mem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
H
Haojun Liao 已提交
1578 1579
  }

1580
  STbData* di = NULL;
H
Hongze Cheng 已提交
1581 1582
  if (pReader->pReadSnap->pIMem != NULL) {
    tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid, &di);
1583
    if (di != NULL) {
1584
      code = tsdbTbDataIterCreate(di, &startKey, backward, &pBlockScanInfo->iiter.iter);
1585
      if (code == TSDB_CODE_SUCCESS) {
1586
        pBlockScanInfo->iiter.hasVal = (tsdbTbDataIterGet(pBlockScanInfo->iiter.iter) != NULL);
1587

H
Haojun Liao 已提交
1588
        tsdbDebug("%p uid:%" PRId64 ", check data in imem from skey:%" PRId64 ", order:%d, ts range in buf:%" PRId64
1589
                  "-%" PRId64 " %s",
1590
                  pReader, pBlockScanInfo->uid, startKey.ts, pReader->order, di->minKey, di->maxKey, pReader->idStr);
1591
      } else {
1592 1593
        tsdbError("%p uid:%" PRId64 ", failed to create iterator for mem, code:%s, %s", pReader, pBlockScanInfo->uid,
                  tstrerror(code), pReader->idStr);
1594
        return code;
1595 1596
      }
    }
H
Haojun Liao 已提交
1597 1598
  } else {
    tsdbDebug("%p uid:%" PRId64 ", no data in imem, %s", pReader, pBlockScanInfo->uid, pReader->idStr);
1599 1600
  }

1601 1602
  initDelSkylineIterator(pBlockScanInfo, pReader, d, di);

1603
  pBlockScanInfo->iterInit = true;
H
Haojun Liao 已提交
1604 1605 1606
  return TSDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
1607 1608
int32_t initDelSkylineIterator(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STbData* pMemTbData,
                               STbData* piMemTbData) {
1609 1610 1611
  if (pBlockScanInfo->delSkyline != NULL) {
    return TSDB_CODE_SUCCESS;
  }
1612

1613 1614 1615
  int32_t code = 0;
  STsdb*  pTsdb = pReader->pTsdb;

1616 1617
  SArray* pDelData = taosArrayInit(4, sizeof(SDelData));

H
Hongze Cheng 已提交
1618
  SDelFile* pDelFile = pReader->pReadSnap->fs.pDelFile;
1619 1620 1621
  if (pDelFile) {
    SDelFReader* pDelFReader = NULL;
    code = tsdbDelFReaderOpen(&pDelFReader, pDelFile, pTsdb, NULL);
1622
    if (code != TSDB_CODE_SUCCESS) {
1623 1624 1625 1626 1627
      goto _err;
    }

    SArray* aDelIdx = taosArrayInit(4, sizeof(SDelIdx));
    if (aDelIdx == NULL) {
1628
      tsdbDelFReaderClose(&pDelFReader);
1629 1630 1631
      goto _err;
    }

1632
    code = tsdbReadDelIdx(pDelFReader, aDelIdx, NULL);
1633 1634 1635
    if (code != TSDB_CODE_SUCCESS) {
      taosArrayDestroy(aDelIdx);
      tsdbDelFReaderClose(&pDelFReader);
1636 1637
      goto _err;
    }
1638

1639 1640 1641
    SDelIdx  idx = {.suid = pReader->suid, .uid = pBlockScanInfo->uid};
    SDelIdx* pIdx = taosArraySearch(aDelIdx, &idx, tCmprDelIdx, TD_EQ);

H
Haojun Liao 已提交
1642 1643
    if (pIdx != NULL) {
      code = tsdbReadDelData(pDelFReader, pIdx, pDelData, NULL);
1644 1645 1646 1647 1648 1649 1650
    }

    taosArrayDestroy(aDelIdx);
    tsdbDelFReaderClose(&pDelFReader);

    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
1651
    }
1652
  }
1653

1654 1655 1656 1657 1658 1659 1660
  SDelData* p = NULL;
  if (pMemTbData != NULL) {
    p = pMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
1661 1662
  }

1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676
  if (piMemTbData != NULL) {
    p = piMemTbData->pHead;
    while (p) {
      taosArrayPush(pDelData, p);
      p = p->pNext;
    }
  }

  if (taosArrayGetSize(pDelData) > 0) {
    pBlockScanInfo->delSkyline = taosArrayInit(4, sizeof(TSDBKEY));
    code = tsdbBuildDeleteSkyline(pDelData, 0, (int32_t)(taosArrayGetSize(pDelData) - 1), pBlockScanInfo->delSkyline);
  }

  taosArrayDestroy(pDelData);
dengyihao's avatar
dengyihao 已提交
1677 1678
  pBlockScanInfo->iter.index =
      ASCENDING_TRAVERSE(pReader->order) ? 0 : taosArrayGetSize(pBlockScanInfo->delSkyline) - 1;
1679 1680
  pBlockScanInfo->iiter.index = pBlockScanInfo->iter.index;
  pBlockScanInfo->fileDelIndex = pBlockScanInfo->iter.index;
1681 1682
  return code;

1683 1684 1685
_err:
  taosArrayDestroy(pDelData);
  return code;
1686 1687
}

1688 1689 1690
static TSDBKEY getCurrentKeyInBuf(SDataBlockIter* pBlockIter, STsdbReader* pReader) {
  TSDBKEY key = {.ts = TSKEY_INITIAL_VAL};

1691
  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(pBlockIter);
1692 1693
  STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));

1694 1695
  initMemDataIterator(pScanInfo, pReader);
  TSDBROW* pRow = getValidRow(&pScanInfo->iter, pScanInfo->delSkyline, pReader);
1696
  if (pRow != NULL) {
1697 1698 1699
    key = TSDBROW_KEY(pRow);
  }

1700
  pRow = getValidRow(&pScanInfo->iiter, pScanInfo->delSkyline, pReader);
1701
  if (pRow != NULL) {
1702 1703 1704 1705 1706 1707 1708 1709 1710
    TSDBKEY k = TSDBROW_KEY(pRow);
    if (key.ts > k.ts) {
      key = k;
    }
  }

  return key;
}

H
Haojun Liao 已提交
1711 1712
static int32_t moveToNextFile(STsdbReader* pReader, int32_t* numOfBlocks) {
  SReaderStatus* pStatus = &pReader->status;
1713

1714
  size_t  numOfTables = taosHashGetSize(pReader->status.pTableMap);
1715
  SArray* pIndexList = taosArrayInit(numOfTables, sizeof(SBlockIdx));
H
Haojun Liao 已提交
1716 1717

  while (1) {
1718
    bool hasNext = filesetIteratorNext(&pStatus->fileIter, pReader);
1719
    if (!hasNext) {  // no data files on disk
H
Haojun Liao 已提交
1720 1721 1722
      break;
    }

H
Haojun Liao 已提交
1723
    taosArrayClear(pIndexList);
H
Haojun Liao 已提交
1724 1725
    int32_t code = doLoadBlockIndex(pReader, pReader->pFileReader, pIndexList);
    if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
1726
      taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
1727 1728 1729 1730 1731 1732 1733
      return code;
    }

    if (taosArrayGetSize(pIndexList) > 0) {
      uint32_t numOfValidTable = 0;
      code = doLoadFileBlock(pReader, pIndexList, &numOfValidTable, numOfBlocks);
      if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
1734
        taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
1735 1736 1737 1738 1739 1740 1741 1742 1743 1744
        return code;
      }

      if (numOfValidTable > 0) {
        break;
      }
    }
    // no blocks in current file, try next files
  }

H
Haojun Liao 已提交
1745
  taosArrayDestroy(pIndexList);
H
Haojun Liao 已提交
1746 1747 1748
  return TSDB_CODE_SUCCESS;
}

1749 1750 1751
static int32_t doBuildDataBlock(STsdbReader* pReader) {
  int32_t code = TSDB_CODE_SUCCESS;

1752
  SReaderStatus*  pStatus = &pReader->status;
1753 1754
  SDataBlockIter* pBlockIter = &pStatus->blockIter;

1755 1756
  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(pBlockIter);
  STableBlockScanInfo* pScanInfo = taosHashGet(pStatus->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
1757

1758
  SBlock* pBlock = getCurrentBlock(pBlockIter);
1759 1760 1761

  TSDBKEY key = getCurrentKeyInBuf(pBlockIter, pReader);
  if (fileBlockShouldLoad(pReader, pFBlock, pBlock, pScanInfo, key)) {
H
Haojun Liao 已提交
1762 1763
    tBlockDataReset(&pStatus->fileBlockData);
    tBlockDataClearData(&pStatus->fileBlockData);
1764
    code = doLoadFileBlockData(pReader, pBlockIter, pScanInfo, &pStatus->fileBlockData);
1765 1766 1767 1768 1769
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // build composed data block
1770
    code = buildComposedDataBlock(pReader, pScanInfo);
1771 1772
  } else if (bufferDataInFileBlockGap(pReader->order, key, pBlock)) {
    // data in memory that are earlier than current file block
1773
    // todo rows in buffer should be less than the file block in asc, greater than file block in desc
1774
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? pBlock->minKey.ts : pBlock->maxKey.ts;
1775
    code = buildDataBlockFromBuf(pReader, pScanInfo, endKey);
1776
  } else {  // whole block is required, return it directly
1777
    SDataBlockInfo* pInfo = &pReader->pResBlock->info;
1778 1779 1780
    pInfo->rows = pBlock->nRow;
    pInfo->uid = pScanInfo->uid;
    pInfo->window = (STimeWindow){.skey = pBlock->minKey.ts, .ekey = pBlock->maxKey.ts};
1781
    setComposedBlockFlag(pReader, false);
1782
    setBlockAllDumped(&pStatus->fBlockDumpInfo, pBlock, pReader->order);
1783 1784 1785 1786 1787
  }

  return code;
}

H
Haojun Liao 已提交
1788
static int32_t buildBlockFromBufferSequentially(STsdbReader* pReader) {
1789 1790
  SReaderStatus* pStatus = &pReader->status;

1791
  while (1) {
1792 1793 1794
    if (pStatus->pTableIter == NULL) {
      pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
      if (pStatus->pTableIter == NULL) {
H
Haojun Liao 已提交
1795
        return TSDB_CODE_SUCCESS;
1796 1797 1798 1799
      }
    }

    STableBlockScanInfo* pBlockScanInfo = pStatus->pTableIter;
1800
    initMemDataIterator(pBlockScanInfo, pReader);
1801

1802
    int64_t endKey = (ASCENDING_TRAVERSE(pReader->order)) ? INT64_MAX : INT64_MIN;
1803
    int32_t code = buildDataBlockFromBuf(pReader, pBlockScanInfo, endKey);
H
Haojun Liao 已提交
1804 1805 1806 1807
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

1808
    if (pReader->pResBlock->info.rows > 0) {
H
Haojun Liao 已提交
1809
      return TSDB_CODE_SUCCESS;
1810 1811 1812 1813 1814
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
    if (pStatus->pTableIter == NULL) {
H
Haojun Liao 已提交
1815
      return TSDB_CODE_SUCCESS;
1816 1817 1818 1819
    }
  }
}

1820
// set the correct start position in case of the first/last file block, according to the query time window
1821
static void initBlockDumpInfo(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
1822
  SBlock* pBlock = getCurrentBlock(pBlockIter);
1823

1824 1825 1826
  SReaderStatus* pStatus = &pReader->status;

  SFileBlockDumpInfo* pDumpInfo = &pStatus->fBlockDumpInfo;
1827 1828 1829

  pDumpInfo->totalRows = pBlock->nRow;
  pDumpInfo->allDumped = false;
1830
  pDumpInfo->rowIndex = ASCENDING_TRAVERSE(pReader->order) ? 0 : pBlock->nRow - 1;
1831 1832
}

1833
static int32_t initForFirstBlockInFile(STsdbReader* pReader, SDataBlockIter* pBlockIter) {
1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847
  int32_t numOfBlocks = 0;
  int32_t code = moveToNextFile(pReader, &numOfBlocks);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  // all data files are consumed, try data in buffer
  if (numOfBlocks == 0) {
    pReader->status.loadFromFile = false;
    return code;
  }

  // initialize the block iterator for a new fileset
  code = initBlockIterator(pReader, pBlockIter, numOfBlocks);
1848 1849

  // set the correct start position according to the query time window
1850
  initBlockDumpInfo(pReader, pBlockIter);
1851 1852 1853
  return code;
}

1854
static bool fileBlockPartiallyRead(SFileBlockDumpInfo* pDumpInfo, bool asc) {
1855 1856
  return (!pDumpInfo->allDumped) &&
         ((pDumpInfo->rowIndex > 0 && asc) || (pDumpInfo->rowIndex < (pDumpInfo->totalRows - 1) && (!asc)));
1857 1858
}

1859
static int32_t buildBlockFromFiles(STsdbReader* pReader) {
H
Haojun Liao 已提交
1860
  int32_t code = TSDB_CODE_SUCCESS;
1861 1862
  bool    asc = ASCENDING_TRAVERSE(pReader->order);

1863 1864
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

1865
  while (1) {
1866
    SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
1867 1868
    STableBlockScanInfo* pScanInfo = taosHashGet(pReader->status.pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));

1869 1870
    SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

1871
    if (fileBlockPartiallyRead(pDumpInfo, asc)) {  // file data block is partially loaded
1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886
      code = buildComposedDataBlock(pReader, pScanInfo);
    } else {
      // current block are exhausted, try the next file block
      if (pDumpInfo->allDumped) {
        // try next data block in current file
        bool hasNext = blockIteratorNext(&pReader->status.blockIter);
        if (hasNext) {  // check for the next block in the block accessed order list
          initBlockDumpInfo(pReader, pBlockIter);
        } else {  // data blocks in current file are exhausted, let's try the next file now
          code = initForFirstBlockInFile(pReader, pBlockIter);

          // error happens or all the data files are completely checked
          if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
            return code;
          }
1887
        }
H
Haojun Liao 已提交
1888
      }
1889 1890 1891

      // current block is not loaded yet, or data in buffer may overlap with the file block.
      code = doBuildDataBlock(pReader);
1892 1893
    }

1894 1895 1896 1897 1898 1899 1900 1901
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pReader->pResBlock->info.rows > 0) {
      return TSDB_CODE_SUCCESS;
    }
  }
1902
}
H
refact  
Hongze Cheng 已提交
1903

1904 1905
static STsdb* getTsdbByRetentions(SVnode* pVnode, TSKEY winSKey, SRetention* retentions, const char* idStr,
                                  int8_t* pLevel) {
1906
  if (VND_IS_RSMA(pVnode)) {
1907
    int8_t  level = 0;
1908 1909
    int64_t now = taosGetTimestamp(pVnode->config.tsdbCfg.precision);

1910
    for (int8_t i = 0; i < TSDB_RETENTION_MAX; ++i) {
1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923
      SRetention* pRetention = retentions + level;
      if (pRetention->keep <= 0) {
        if (level > 0) {
          --level;
        }
        break;
      }
      if ((now - pRetention->keep) <= winSKey) {
        break;
      }
      ++level;
    }

1924 1925
    int32_t     vgId = TD_VID(pVnode);
    const char* str = (idStr != NULL) ? idStr : "";
1926 1927

    if (level == TSDB_RETENTION_L0) {
1928
      *pLevel = TSDB_RETENTION_L0;
1929
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", vgId, TSDB_RETENTION_L0, str);
1930 1931
      return VND_RSMA0(pVnode);
    } else if (level == TSDB_RETENTION_L1) {
1932
      *pLevel = TSDB_RETENTION_L1;
1933
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", vgId, TSDB_RETENTION_L1, str);
1934 1935
      return VND_RSMA1(pVnode);
    } else {
1936
      *pLevel = TSDB_RETENTION_L2;
1937
      tsdbDebug("vgId:%d, rsma level %d is selected to query %s", vgId, TSDB_RETENTION_L2, str);
1938 1939 1940 1941 1942 1943 1944
      return VND_RSMA2(pVnode);
    }
  }

  return VND_TSDB(pVnode);
}

H
Haojun Liao 已提交
1945
SVersionRange getQueryVerRange(SVnode* pVnode, SQueryTableDataCond* pCond, int8_t level) {
L
Liu Jicong 已提交
1946
  int64_t startVer = (pCond->startVersion == -1) ? 0 : pCond->startVersion;
H
Haojun Liao 已提交
1947 1948

  int64_t endVer = 0;
L
Liu Jicong 已提交
1949 1950
  if (pCond->endVersion ==
      -1) {  // user not specified end version, set current maximum version of vnode as the endVersion
H
Haojun Liao 已提交
1951 1952
    endVer = pVnode->state.applied;
  } else {
L
Liu Jicong 已提交
1953
    endVer = (pCond->endVersion > pVnode->state.applied) ? pVnode->state.applied : pCond->endVersion;
1954 1955
  }

H
Haojun Liao 已提交
1956
  return (SVersionRange){.minVer = startVer, .maxVer = endVer};
1957 1958
}

H
Hongze Cheng 已提交
1959 1960 1961 1962
// // todo not unref yet, since it is not support multi-group interpolation query
// static UNUSED_FUNC void changeQueryHandleForInterpQuery(STsdbReader* pHandle) {
//   // filter the queried time stamp in the first place
//   STsdbReader* pTsdbReadHandle = (STsdbReader*)pHandle;
H
refact  
Hongze Cheng 已提交
1963

H
Hongze Cheng 已提交
1964 1965
//   // starts from the buffer in case of descending timestamp order check data blocks
//   size_t numOfTables = taosArrayGetSize(pTsdbReadHandle->pTableCheckInfo);
H
refact  
Hongze Cheng 已提交
1966

H
Hongze Cheng 已提交
1967 1968
//   int32_t i = 0;
//   while (i < numOfTables) {
H
Haojun Liao 已提交
1969
//     STableBlockScanInfo* pCheckInfo = taosArrayGet(pTsdbReadHandle->pTableCheckInfo, i);
H
refact  
Hongze Cheng 已提交
1970

H
Hongze Cheng 已提交
1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984
//     // the first qualified table for interpolation query
//     //    if ((pTsdbReadHandle->window.skey <= pCheckInfo->pTableObj->lastKey) &&
//     //        (pCheckInfo->pTableObj->lastKey != TSKEY_INITIAL_VAL)) {
//     //      break;
//     //    }

//     i++;
//   }

//   // there are no data in all the tables
//   if (i == numOfTables) {
//     return;
//   }

H
Haojun Liao 已提交
1985
//   STableBlockScanInfo info = *(STableBlockScanInfo*)taosArrayGet(pTsdbReadHandle->pTableCheckInfo, i);
H
Hongze Cheng 已提交
1986 1987 1988 1989 1990 1991
//   taosArrayClear(pTsdbReadHandle->pTableCheckInfo);

//   info.lastKey = pTsdbReadHandle->window.skey;
//   taosArrayPush(pTsdbReadHandle->pTableCheckInfo, &info);
// }

1992
bool hasBeenDropped(const SArray* pDelList, int32_t* index, TSDBKEY* pKey, int32_t order) {
1993 1994 1995 1996
  ASSERT(pKey != NULL);
  if (pDelList == NULL) {
    return false;
  }
L
Liu Jicong 已提交
1997 1998 1999
  size_t  num = taosArrayGetSize(pDelList);
  bool    asc = ASCENDING_TRAVERSE(order);
  int32_t step = asc ? 1 : -1;
2000

2001 2002 2003 2004 2005 2006
  if (asc) {
    if (*index >= num - 1) {
      TSDBKEY* last = taosArrayGetLast(pDelList);
      ASSERT(pKey->ts >= last->ts);

      if (pKey->ts > last->ts) {
2007
        return false;
2008 2009 2010
      } else if (pKey->ts == last->ts) {
        TSDBKEY* prev = taosArrayGet(pDelList, num - 2);
        return (prev->version >= pKey->version);
2011 2012
      }
    } else {
2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pNext = taosArrayGet(pDelList, (*index) + 1);

      if (pKey->ts < pCurrent->ts) {
        return false;
      }

      if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version) {
        return true;
      }

      while (pNext->ts <= pKey->ts && (*index) < num - 1) {
        (*index) += 1;

        if ((*index) < num - 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pNext = taosArrayGet(pDelList, (*index) + 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version == 0 && pNext->version > 0) {
            continue;
          }

          if (pCurrent->ts <= pKey->ts && pNext->ts >= pKey->ts && pCurrent->version >= pKey->version) {
            return true;
          }
        }
      }

      return false;
2043 2044
    }
  } else {
2045 2046
    if (*index <= 0) {
      TSDBKEY* pFirst = taosArrayGet(pDelList, 0);
2047

2048 2049 2050 2051 2052 2053 2054
      if (pKey->ts < pFirst->ts) {
        return false;
      } else if (pKey->ts == pFirst->ts) {
        return pFirst->version >= pKey->version;
      } else {
        ASSERT(0);
      }
2055
    } else {
2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082
      TSDBKEY* pCurrent = taosArrayGet(pDelList, *index);
      TSDBKEY* pPrev = taosArrayGet(pDelList, (*index) - 1);

      if (pKey->ts > pCurrent->ts) {
        return false;
      }

      if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
        return true;
      }

      while (pPrev->ts >= pKey->ts && (*index) > 1) {
        (*index) += step;

        if ((*index) >= 1) {
          pCurrent = taosArrayGet(pDelList, *index);
          pPrev = taosArrayGet(pDelList, (*index) - 1);

          // it is not a consecutive deletion range, ignore it
          if (pCurrent->version > 0 && pPrev->version == 0) {
            continue;
          }

          if (pPrev->ts <= pKey->ts && pCurrent->ts >= pKey->ts && pPrev->version >= pKey->version) {
            return true;
          }
        }
2083 2084 2085 2086 2087
      }

      return false;
    }
  }
2088 2089

  return false;
2090 2091 2092 2093
}

TSDBROW* getValidRow(SIterInfo* pIter, const SArray* pDelList, STsdbReader* pReader) {
  if (!pIter->hasVal) {
H
Haojun Liao 已提交
2094 2095
    return NULL;
  }
H
Hongze Cheng 已提交
2096

2097
  TSDBROW* pRow = tsdbTbDataIterGet(pIter->iter);
2098
  TSDBKEY  key = TSDBROW_KEY(pRow);
2099
  if (outOfTimeWindow(key.ts, &pReader->window)) {
2100
    pIter->hasVal = false;
H
Haojun Liao 已提交
2101 2102
    return NULL;
  }
H
Hongze Cheng 已提交
2103

2104
  // it is a valid data version
dengyihao's avatar
dengyihao 已提交
2105
  if ((key.version <= pReader->verRange.maxVer && key.version >= pReader->verRange.minVer) &&
2106
      (!hasBeenDropped(pDelList, &pIter->index, &key, pReader->order))) {
H
Haojun Liao 已提交
2107 2108
    return pRow;
  }
H
Hongze Cheng 已提交
2109

2110
  while (1) {
2111 2112
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
H
Haojun Liao 已提交
2113 2114
      return NULL;
    }
H
Hongze Cheng 已提交
2115

2116
    pRow = tsdbTbDataIterGet(pIter->iter);
H
Hongze Cheng 已提交
2117

H
Haojun Liao 已提交
2118
    key = TSDBROW_KEY(pRow);
2119
    if (outOfTimeWindow(key.ts, &pReader->window)) {
2120
      pIter->hasVal = false;
H
Haojun Liao 已提交
2121 2122
      return NULL;
    }
H
Hongze Cheng 已提交
2123

dengyihao's avatar
dengyihao 已提交
2124
    if (key.version <= pReader->verRange.maxVer && key.version >= pReader->verRange.minVer &&
2125
        (!hasBeenDropped(pDelList, &pIter->index, &key, pReader->order))) {
H
Haojun Liao 已提交
2126 2127 2128 2129
      return pRow;
    }
  }
}
H
Hongze Cheng 已提交
2130

2131 2132
int32_t doMergeRowsInBuf(SIterInfo* pIter, uint64_t uid, int64_t ts, SArray* pDelList, SRowMerger* pMerger,
                         STsdbReader* pReader) {
H
Haojun Liao 已提交
2133
  while (1) {
2134 2135
    pIter->hasVal = tsdbTbDataIterNext(pIter->iter);
    if (!pIter->hasVal) {
H
Haojun Liao 已提交
2136 2137
      break;
    }
H
Hongze Cheng 已提交
2138

2139
    // data exists but not valid
2140
    TSDBROW* pRow = getValidRow(pIter, pDelList, pReader);
2141 2142 2143 2144 2145
    if (pRow == NULL) {
      break;
    }

    // ts is not identical, quit
H
Haojun Liao 已提交
2146
    TSDBKEY k = TSDBROW_KEY(pRow);
2147
    if (k.ts != ts) {
H
Haojun Liao 已提交
2148 2149 2150
      break;
    }

2151 2152 2153 2154 2155 2156 2157 2158 2159
    int32_t   sversion = TSDBROW_SVERSION(pRow);
    STSchema* pTSchema = NULL;
    if (sversion != pReader->pSchema->version) {
      metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pTSchema);
    } else {
      pTSchema = pReader->pSchema;
    }

    tRowMergerAdd(pMerger, pRow, pTSchema);
M
Minglei Jin 已提交
2160 2161 2162 2163

    if (sversion != pReader->pSchema->version) {
      taosMemoryFree(pTSchema);
    }
H
Haojun Liao 已提交
2164 2165 2166 2167 2168
  }

  return TSDB_CODE_SUCCESS;
}

2169
static int32_t doMergeRowsInFileBlockImpl(SBlockData* pBlockData, int32_t rowIndex, int64_t key, SRowMerger* pMerger,
2170
                                          SVersionRange* pVerRange, int32_t step) {
2171 2172
  while (pBlockData->aTSKEY[rowIndex] == key && rowIndex < pBlockData->nRow && rowIndex >= 0) {
    if (pBlockData->aVersion[rowIndex] > pVerRange->maxVer || pBlockData->aVersion[rowIndex] < pVerRange->minVer) {
2173
      rowIndex += step;
2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190
      continue;
    }

    TSDBROW fRow = tsdbRowFromBlockData(pBlockData, rowIndex);
    tRowMerge(pMerger, &fRow);
    rowIndex += step;
  }

  return rowIndex;
}

typedef enum {
  CHECK_FILEBLOCK_CONT = 0x1,
  CHECK_FILEBLOCK_QUIT = 0x2,
} CHECK_FILEBLOCK_STATE;

static int32_t checkForNeighborFileBlock(STsdbReader* pReader, STableBlockScanInfo* pScanInfo, SBlock* pBlock,
2191 2192
                                         SFileDataBlockInfo* pFBlock, SRowMerger* pMerger, int64_t key,
                                         CHECK_FILEBLOCK_STATE* state) {
2193
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;
2194
  SBlockData*         pBlockData = &pReader->status.fileBlockData;
2195

2196
  *state = CHECK_FILEBLOCK_QUIT;
2197
  int32_t step = ASCENDING_TRAVERSE(pReader->order) ? 1 : -1;
2198 2199 2200

  int32_t nextIndex = -1;
  SBlock* pNeighborBlock = getNeighborBlockOfSameTable(pFBlock, pScanInfo, &nextIndex, pReader->order);
2201
  if (pNeighborBlock == NULL) {  // do nothing
2202 2203 2204 2205
    return 0;
  }

  bool overlap = overlapWithNeighborBlock(pBlock, pNeighborBlock, pReader->order);
2206 2207
  taosMemoryFree(pNeighborBlock);

2208
  if (overlap) {  // load next block
2209
    SReaderStatus*  pStatus = &pReader->status;
2210 2211
    SDataBlockIter* pBlockIter = &pStatus->blockIter;

2212
    // 1. find the next neighbor block in the scan block list
2213
    SFileDataBlockInfo fb = {.uid = pFBlock->uid, .tbBlockIdx = nextIndex};
2214
    int32_t            neighborIndex = findFileBlockInfoIndex(pBlockIter, &fb);
2215

2216
    // 2. remove it from the scan block list
2217
    setFileBlockActiveInBlockIter(pBlockIter, neighborIndex, step);
2218

2219
    // 3. load the neighbor block, and set it to be the currently accessed file data block
H
Haojun Liao 已提交
2220 2221
    tBlockDataReset(&pStatus->fileBlockData);
    tBlockDataClearData(&pStatus->fileBlockData);
2222 2223 2224 2225 2226
    int32_t code = doLoadFileBlockData(pReader, pBlockIter, pScanInfo, &pStatus->fileBlockData);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

2227
    // 4. check the data values
2228 2229 2230 2231
    initBlockDumpInfo(pReader, pBlockIter);

    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
H
Haojun Liao 已提交
2232
    if (pDumpInfo->rowIndex >= pDumpInfo->totalRows) {
2233 2234 2235 2236 2237 2238 2239
      *state = CHECK_FILEBLOCK_CONT;
    }
  }

  return TSDB_CODE_SUCCESS;
}

2240 2241
int32_t doMergeRowsInFileBlocks(SBlockData* pBlockData, STableBlockScanInfo* pScanInfo, STsdbReader* pReader,
                                SRowMerger* pMerger) {
2242 2243
  SFileBlockDumpInfo* pDumpInfo = &pReader->status.fBlockDumpInfo;

2244
  bool    asc = ASCENDING_TRAVERSE(pReader->order);
2245
  int64_t key = pBlockData->aTSKEY[pDumpInfo->rowIndex];
2246
  int32_t step = asc ? 1 : -1;
2247

2248
  pDumpInfo->rowIndex += step;
2249
  if ((pDumpInfo->rowIndex <= pBlockData->nRow - 1 && asc) ||(pDumpInfo->rowIndex >= 0 && !asc)) {
2250 2251 2252
    pDumpInfo->rowIndex =
        doMergeRowsInFileBlockImpl(pBlockData, pDumpInfo->rowIndex, key, pMerger, &pReader->verRange, step);
  }
2253

2254 2255 2256 2257
  // all rows are consumed, let's try next file block
  if ((pDumpInfo->rowIndex >= pBlockData->nRow && asc) || (pDumpInfo->rowIndex < 0 && !asc)) {
    while (1) {
      CHECK_FILEBLOCK_STATE st;
2258

2259
      SFileDataBlockInfo* pFileBlockInfo = getCurrentBlockInfo(&pReader->status.blockIter);
2260
      SBlock*             pCurrentBlock = getCurrentBlock(&pReader->status.blockIter);
2261 2262 2263
      checkForNeighborFileBlock(pReader, pScanInfo, pCurrentBlock, pFileBlockInfo, pMerger, key, &st);
      if (st == CHECK_FILEBLOCK_QUIT) {
        break;
2264
      }
2265
    }
H
Haojun Liao 已提交
2266
  }
2267

H
Haojun Liao 已提交
2268 2269 2270
  return TSDB_CODE_SUCCESS;
}

2271
void updateSchema(TSDBROW* pRow, uint64_t uid, STsdbReader* pReader) {
2272 2273 2274
  int32_t sversion = TSDBROW_SVERSION(pRow);

  if (pReader->pSchema == NULL) {
M
Minglei Jin 已提交
2275
    metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pSchema);
2276 2277
  } else if (pReader->pSchema->version != sversion) {
    taosMemoryFreeClear(pReader->pSchema);
M
Minglei Jin 已提交
2278
    metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pReader->pSchema);
2279 2280 2281
  }
}

dengyihao's avatar
dengyihao 已提交
2282 2283
void doMergeMultiRows(TSDBROW* pRow, uint64_t uid, SIterInfo* pIter, SArray* pDelList, STSRow** pTSRow,
                      STsdbReader* pReader) {
2284 2285 2286
  SRowMerger merge = {0};

  TSDBKEY k = TSDBROW_KEY(pRow);
2287 2288 2289 2290 2291 2292 2293 2294
  // updateSchema(pRow, uid, pReader);
  int32_t   sversion = TSDBROW_SVERSION(pRow);
  STSchema* pTSchema = NULL;
  if (sversion != pReader->pSchema->version) {
    metaGetTbTSchemaEx(pReader->pTsdb->pVnode->pMeta, pReader->suid, uid, sversion, &pTSchema);
  } else {
    pTSchema = pReader->pSchema;
  }
H
Haojun Liao 已提交
2295

2296 2297
  tRowMergerInit2(&merge, pReader->pSchema, pRow, pTSchema);
  doMergeRowsInBuf(pIter, uid, k.ts, pDelList, &merge, pReader);
2298
  tRowMergerGetRow(&merge, pTSRow);
2299
  tRowMergerClear(&merge);
M
Minglei Jin 已提交
2300 2301 2302 2303

  if (sversion != pReader->pSchema->version) {
    taosMemoryFree(pTSchema);
  }
2304 2305
}

2306 2307
void doMergeMemIMemRows(TSDBROW* pRow, TSDBROW* piRow, STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader,
                        STSRow** pTSRow) {
H
Haojun Liao 已提交
2308 2309
  SRowMerger merge = {0};

2310 2311 2312
  TSDBKEY k = TSDBROW_KEY(pRow);
  TSDBKEY ik = TSDBROW_KEY(piRow);

2313 2314 2315 2316
  if (ASCENDING_TRAVERSE(pReader->order)) {  // ascending order imem --> mem
    updateSchema(piRow, pBlockScanInfo->uid, pReader);

    tRowMergerInit(&merge, piRow, pReader->pSchema);
2317
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, ik.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2318

2319
    tRowMerge(&merge, pRow);
2320
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2321 2322
  } else {
    updateSchema(pRow, pBlockScanInfo->uid, pReader);
2323

2324
    tRowMergerInit(&merge, pRow, pReader->pSchema);
2325
    doMergeRowsInBuf(&pBlockScanInfo->iter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2326 2327

    tRowMerge(&merge, piRow);
2328
    doMergeRowsInBuf(&pBlockScanInfo->iiter, pBlockScanInfo->uid, k.ts, pBlockScanInfo->delSkyline, &merge, pReader);
2329
  }
2330 2331 2332 2333

  tRowMergerGetRow(&merge, pTSRow);
}

2334 2335
int32_t tsdbGetNextRowInMem(STableBlockScanInfo* pBlockScanInfo, STsdbReader* pReader, STSRow** pTSRow,
                            int64_t endKey) {
2336 2337
  TSDBROW* pRow = getValidRow(&pBlockScanInfo->iter, pBlockScanInfo->delSkyline, pReader);
  TSDBROW* piRow = getValidRow(&pBlockScanInfo->iiter, pBlockScanInfo->delSkyline, pReader);
dengyihao's avatar
dengyihao 已提交
2338
  SArray*  pDelList = pBlockScanInfo->delSkyline;
H
Haojun Liao 已提交
2339

2340 2341
  // todo refactor
  bool asc = ASCENDING_TRAVERSE(pReader->order);
2342
  if (pBlockScanInfo->iter.hasVal) {
2343 2344 2345 2346 2347 2348
    TSDBKEY k = TSDBROW_KEY(pRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      pRow = NULL;
    }
  }

2349
  if (pBlockScanInfo->iiter.hasVal) {
2350 2351 2352 2353 2354 2355
    TSDBKEY k = TSDBROW_KEY(piRow);
    if ((k.ts >= endKey && asc) || (k.ts <= endKey && !asc)) {
      piRow = NULL;
    }
  }

2356
  if (pBlockScanInfo->iter.hasVal && pBlockScanInfo->iiter.hasVal && pRow != NULL && piRow != NULL) {
2357
    TSDBKEY k = TSDBROW_KEY(pRow);
2358
    TSDBKEY ik = TSDBROW_KEY(piRow);
H
Haojun Liao 已提交
2359

2360
    if (ik.ts < k.ts) {  // ik.ts < k.ts
2361
      doMergeMultiRows(piRow, pBlockScanInfo->uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader);
2362
    } else if (k.ts < ik.ts) {
2363
      doMergeMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader);
2364 2365
    } else {  // ik.ts == k.ts
      doMergeMemIMemRows(pRow, piRow, pBlockScanInfo, pReader, pTSRow);
H
Haojun Liao 已提交
2366
    }
2367 2368

    return TSDB_CODE_SUCCESS;
H
Haojun Liao 已提交
2369 2370
  }

2371 2372
  if (pBlockScanInfo->iter.hasVal && pRow != NULL) {
    doMergeMultiRows(pRow, pBlockScanInfo->uid, &pBlockScanInfo->iter, pDelList, pTSRow, pReader);
H
Haojun Liao 已提交
2373 2374 2375
    return TSDB_CODE_SUCCESS;
  }

2376 2377
  if (pBlockScanInfo->iiter.hasVal && piRow != NULL) {
    doMergeMultiRows(piRow, pBlockScanInfo->uid, &pBlockScanInfo->iiter, pDelList, pTSRow, pReader);
H
Haojun Liao 已提交
2378 2379 2380 2381 2382 2383
    return TSDB_CODE_SUCCESS;
  }

  return TSDB_CODE_SUCCESS;
}

2384
int32_t doAppendRowFromTSRow(SSDataBlock* pBlock, STsdbReader* pReader, STSRow* pTSRow) {
2385 2386 2387
  int32_t numOfRows = pBlock->info.rows;
  int32_t numOfCols = (int32_t)taosArrayGetSize(pBlock->pDataBlock);

2388
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;
2389
  STSchema*           pSchema = pReader->pSchema;
2390

2391
  SColVal colVal = {0};
2392
  int32_t i = 0, j = 0;
H
Haojun Liao 已提交
2393

2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413
  SColumnInfoData* pColInfoData = taosArrayGet(pBlock->pDataBlock, i);
  if (pColInfoData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppend(pColInfoData, numOfRows, (const char*)&pTSRow->ts, false);
    i += 1;
  }

  while (i < numOfCols && j < pSchema->numOfCols) {
    pColInfoData = taosArrayGet(pBlock->pDataBlock, i);
    col_id_t colId = pColInfoData->info.colId;

    if (colId == pSchema->columns[j].colId) {
      tTSRowGetVal(pTSRow, pReader->pSchema, j, &colVal);
      doCopyColVal(pColInfoData, numOfRows, i, &colVal, pSupInfo);
      i += 1;
      j += 1;
    } else if (colId < pSchema->columns[j].colId) {
      colDataAppendNULL(pColInfoData, numOfRows);
      i += 1;
    } else if (colId > pSchema->columns[j].colId) {
      j += 1;
2414
    }
2415 2416
  }

2417
  // set null value since current column does not exist in the "pSchema"
2418
  while (i < numOfCols) {
2419 2420 2421 2422 2423
    pColInfoData = taosArrayGet(pBlock->pDataBlock, i);
    colDataAppendNULL(pColInfoData, numOfRows);
    i += 1;
  }

2424 2425 2426 2427
  pBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

2428 2429 2430 2431 2432 2433 2434 2435 2436
int32_t doAppendRowFromBlock(SSDataBlock* pResBlock, STsdbReader* pReader, SBlockData* pBlockData, int32_t rowIndex) {
  int32_t i = 0, j = 0;
  int32_t outputRowIndex = pResBlock->info.rows;

  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

  SColumnInfoData* pColData = taosArrayGet(pResBlock->pDataBlock, i);
  if (pColData->info.colId == PRIMARYKEY_TIMESTAMP_COL_ID) {
    colDataAppendInt64(pColData, outputRowIndex, &pBlockData->aTSKEY[rowIndex]);
2437
    i += 1;
2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448
  }

  SColVal cv = {0};
  int32_t numOfInputCols = taosArrayGetSize(pBlockData->aIdx);
  int32_t numOfOutputCols = blockDataGetNumOfCols(pResBlock);

  while(i < numOfOutputCols && j < numOfInputCols) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
    SColData* pData = tBlockDataGetColDataByIdx(pBlockData, j);

    if (pData->cid == pCol->info.colId) {
2449 2450
      tColDataGetValue(pData, rowIndex, &cv);
      doCopyColVal(pCol, outputRowIndex, i, &cv, pSupInfo);
2451 2452 2453 2454 2455 2456 2457 2458 2459 2460
      j += 1;
    } else {  // the specified column does not exist in file block, fill with null data
      colDataAppendNULL(pCol, outputRowIndex);
    }

    i += 1;
  }

  while (i < numOfOutputCols) {
    SColumnInfoData* pCol = taosArrayGet(pResBlock->pDataBlock, i);
2461
    colDataAppendNULL(pCol, outputRowIndex);
2462 2463 2464 2465 2466 2467 2468
    i += 1;
  }

  pResBlock->info.rows += 1;
  return TSDB_CODE_SUCCESS;
}

2469 2470
int32_t buildDataBlockFromBufImpl(STableBlockScanInfo* pBlockScanInfo, int64_t endKey, int32_t capacity,
                                  STsdbReader* pReader) {
H
Haojun Liao 已提交
2471 2472 2473 2474
  SSDataBlock* pBlock = pReader->pResBlock;

  do {
    STSRow* pTSRow = NULL;
2475
    tsdbGetNextRowInMem(pBlockScanInfo, pReader, &pTSRow, endKey);
2476 2477
    if (pTSRow == NULL) {
      break;
H
Haojun Liao 已提交
2478 2479
    }

2480
    doAppendRowFromTSRow(pBlock, pReader, pTSRow);
2481
    taosMemoryFree(pTSRow);
H
Haojun Liao 已提交
2482 2483

    // no data in buffer, return immediately
2484
    if (!(pBlockScanInfo->iter.hasVal || pBlockScanInfo->iiter.hasVal)) {
H
Haojun Liao 已提交
2485 2486 2487
      break;
    }

2488
    if (pBlock->info.rows >= capacity) {
H
Haojun Liao 已提交
2489 2490 2491 2492
      break;
    }
  } while (1);

2493
  ASSERT(pBlock->info.rows <= capacity);
H
Haojun Liao 已提交
2494 2495
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
2496

2497
// todo refactor, use arraylist instead
H
Hongze Cheng 已提交
2498
int32_t tsdbSetTableId(STsdbReader* pReader, int64_t uid) {
2499 2500 2501 2502 2503
  ASSERT(pReader != NULL);
  taosHashClear(pReader->status.pTableMap);

  STableBlockScanInfo info = {.lastKey = 0, .uid = uid};
  taosHashPut(pReader->status.pTableMap, &info.uid, sizeof(uint64_t), &info, sizeof(info));
H
Hongze Cheng 已提交
2504 2505 2506
  return TDB_CODE_SUCCESS;
}

dengyihao's avatar
dengyihao 已提交
2507 2508 2509 2510 2511 2512
void* tsdbGetIdx(SMeta* pMeta) {
  if (pMeta == NULL) {
    return NULL;
  }
  return metaGetIdx(pMeta);
}
dengyihao's avatar
dengyihao 已提交
2513

dengyihao's avatar
dengyihao 已提交
2514 2515 2516 2517 2518 2519
void* tsdbGetIvtIdx(SMeta* pMeta) {
  if (pMeta == NULL) {
    return NULL;
  }
  return metaGetIvtIdx(pMeta);
}
L
Liu Jicong 已提交
2520

2521 2522 2523 2524
uint64_t getReaderMaxVersion(STsdbReader *pReader) {
  return pReader->verRange.maxVer;
}

C
Cary Xu 已提交
2525 2526 2527 2528 2529 2530 2531 2532 2533 2534
/**
 * @brief Get all suids since suid
 *
 * @param pMeta
 * @param suid return all suids in one vnode if suid is 0
 * @param list
 * @return int32_t
 */
int32_t tsdbGetStbIdList(SMeta* pMeta, int64_t suid, SArray* list) {
  SMStbCursor* pCur = metaOpenStbCursor(pMeta, suid);
L
Liu Jicong 已提交
2535
  if (!pCur) {
C
Cary Xu 已提交
2536 2537
    return TSDB_CODE_FAILED;
  }
C
Cary Xu 已提交
2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551

  while (1) {
    tb_uid_t id = metaStbCursorNext(pCur);
    if (id == 0) {
      break;
    }

    taosArrayPush(list, &id);
  }

  metaCloseStbCursor(pCur);
  return TSDB_CODE_SUCCESS;
}

H
refact  
Hongze Cheng 已提交
2552
// ====================================== EXPOSED APIs ======================================
2553 2554
int32_t tsdbReaderOpen(SVnode* pVnode, SQueryTableDataCond* pCond, SArray* pTableList, STsdbReader** ppReader,
                       const char* idstr) {
2555 2556
  int32_t code = tsdbReaderCreate(pVnode, pCond, ppReader, 4096, idstr);
  if (code != TSDB_CODE_SUCCESS) {
H
Haojun Liao 已提交
2557 2558
    goto _err;
  }
H
Hongze Cheng 已提交
2559

2560
  // check for query time window
H
Haojun Liao 已提交
2561
  STsdbReader* pReader = *ppReader;
2562
  if (isEmptyQueryTimeWindow(&pReader->window)) {
H
Haojun Liao 已提交
2563 2564 2565
    tsdbDebug("%p query window not overlaps with the data set, no result returned, %s", pReader, pReader->idStr);
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
2566

2567 2568 2569
  if (pCond->type == TIMEWINDOW_RANGE_EXTERNAL) {
    // update the SQueryTableDataCond to create inner reader
    STimeWindow w = pCond->twindows;
2570
    int32_t     order = pCond->order;
2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588
    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.ekey = pCond->twindows.skey;
      pCond->twindows.skey = INT64_MIN;
      pCond->order = TSDB_ORDER_DESC;
    } else {
      pCond->twindows.skey = pCond->twindows.ekey;
      pCond->twindows.ekey = INT64_MAX;
      pCond->order = TSDB_ORDER_ASC;
    }

    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[0], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }

    if (order == TSDB_ORDER_ASC) {
      pCond->twindows.skey = w.ekey;
      pCond->twindows.ekey = INT64_MAX;
2589
    } else {
2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605
      pCond->twindows.skey = INT64_MIN;
      pCond->twindows.ekey = w.ekey;
    }
    code = tsdbReaderCreate(pVnode, pCond, &pReader->innerReader[1], 1, idstr);
    if (code != TSDB_CODE_SUCCESS) {
      goto _err;
    }
  }

  if (pCond->suid != 0) {
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pReader->suid, -1);
  } else if (taosArrayGetSize(pTableList) > 0) {
    STableKeyInfo* pKey = taosArrayGet(pTableList, 0);
    pReader->pSchema = metaGetTbTSchema(pReader->pTsdb->pVnode->pMeta, pKey->uid, -1);
  }

2606 2607
  int32_t numOfTables = taosArrayGetSize(pTableList);
  pReader->status.pTableMap = createDataBlockScanInfo(pReader, pTableList->pData, numOfTables);
H
Haojun Liao 已提交
2608 2609 2610
  if (pReader->status.pTableMap == NULL) {
    tsdbReaderClose(pReader);
    *ppReader = NULL;
H
Haojun Liao 已提交
2611

H
Haojun Liao 已提交
2612 2613 2614
    code = TSDB_CODE_TDB_OUT_OF_MEMORY;
    goto _err;
  }
H
Hongze Cheng 已提交
2615

H
Hongze Cheng 已提交
2616
  code = tsdbTakeReadSnap(pReader->pTsdb, &pReader->pReadSnap);
2617 2618 2619
  if (code != TSDB_CODE_SUCCESS) {
    goto _err;
  }
H
Hongze Cheng 已提交
2620

2621 2622
  if (pReader->type == TIMEWINDOW_RANGE_CONTAINED) {
    SDataBlockIter* pBlockIter = &pReader->status.blockIter;
2623

2624
    initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader->order, pReader->idStr);
2625
    resetDataBlockIterator(&pReader->status.blockIter, pReader->order, pReader->status.pTableMap);
2626 2627 2628 2629 2630 2631 2632 2633 2634 2635

    // no data in files, let's try buffer in memory
    if (pReader->status.fileIter.numOfFiles == 0) {
      pReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
2636
  } else {
2637
    STsdbReader*    pPrevReader = pReader->innerReader[0];
2638 2639
    SDataBlockIter* pBlockIter = &pPrevReader->status.blockIter;

2640 2641
    initFilesetIterator(&pPrevReader->status.fileIter, pPrevReader->pReadSnap->fs.aDFileSet, pPrevReader->order,
                        pPrevReader->idStr);
2642
    resetDataBlockIterator(&pPrevReader->status.blockIter, pPrevReader->order, pReader->status.pTableMap);
2643 2644 2645 2646 2647 2648 2649 2650 2651

    // no data in files, let's try buffer in memory
    if (pPrevReader->status.fileIter.numOfFiles == 0) {
      pPrevReader->status.loadFromFile = false;
    } else {
      code = initForFirstBlockInFile(pPrevReader, pBlockIter);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
2652 2653 2654
    }
  }

2655
  tsdbDebug("%p total numOfTable:%d in this query %s", pReader, numOfTables, pReader->idStr);
H
Hongze Cheng 已提交
2656
  return code;
H
Hongze Cheng 已提交
2657 2658

_err:
2659
  tsdbError("failed to create data reader, code: %s %s", tstrerror(code), pReader->idStr);
H
Hongze Cheng 已提交
2660
  return code;
H
refact  
Hongze Cheng 已提交
2661 2662 2663
}

void tsdbReaderClose(STsdbReader* pReader) {
2664 2665
  if (pReader == NULL) {
    return;
2666
  }
H
refact  
Hongze Cheng 已提交
2667

2668 2669
  SBlockLoadSuppInfo* pSupInfo = &pReader->suppInfo;

H
Hongze Cheng 已提交
2670
  tsdbUntakeReadSnap(pReader->pTsdb, pReader->pReadSnap);
H
Hongze Cheng 已提交
2671

2672 2673 2674 2675
  taosMemoryFreeClear(pSupInfo->plist);
  taosMemoryFree(pSupInfo->colIds);

  taosArrayDestroy(pSupInfo->pColAgg);
L
Liu Jicong 已提交
2676
  for (int32_t i = 0; i < blockDataGetNumOfCols(pReader->pResBlock); ++i) {
2677 2678 2679 2680 2681
    if (pSupInfo->buildBuf[i] != NULL) {
      taosMemoryFreeClear(pSupInfo->buildBuf[i]);
    }
  }
  taosMemoryFree(pSupInfo->buildBuf);
H
Haojun Liao 已提交
2682
  tBlockDataClear(&pReader->status.fileBlockData, true);
2683 2684

  cleanupDataBlockIterator(&pReader->status.blockIter);
2685 2686

  size_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
2687
  destroyBlockScanInfo(pReader->status.pTableMap);
2688
  blockDataDestroy(pReader->pResBlock);
2689

H
Haojun Liao 已提交
2690 2691 2692
  if (pReader->pFileReader != NULL) {
    tsdbDataFReaderClose(&pReader->pFileReader);
  }
H
refact  
Hongze Cheng 已提交
2693

2694
  SIOCostSummary* pCost = &pReader->cost;
H
refact  
Hongze Cheng 已提交
2695

2696 2697 2698 2699 2700
  tsdbDebug("%p :io-cost summary: head-file:%" PRIu64 ", head-file time:%.2f ms, SMA:%" PRId64
            " SMA-time:%.2f ms, "
            "fileBlocks:%" PRId64
            ", fileBlocks-time:%.2f ms, build in-memory-block-time:%.2f ms, STableBlockScanInfo "
            "size:%.2f Kb %s",
2701
            pReader, pCost->headFileLoad, pCost->headFileLoadTime, pCost->smaData, pCost->smaLoadTime,
2702
            pCost->numOfBlocks, pCost->blockLoadTime, pCost->buildmemBlock,
2703
            numOfTables * sizeof(STableBlockScanInfo) / 1000.0, pReader->idStr);
H
refact  
Hongze Cheng 已提交
2704

2705 2706 2707
  taosMemoryFree(pReader->idStr);
  taosMemoryFree(pReader->pSchema);
  taosMemoryFreeClear(pReader);
H
refact  
Hongze Cheng 已提交
2708 2709
}

2710
static bool doTsdbNextDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
2711
  // cleanup the data that belongs to the previous data block
2712 2713
  SSDataBlock* pBlock = pReader->pResBlock;
  blockDataCleanup(pBlock);
H
Hongze Cheng 已提交
2714

2715
  SReaderStatus* pStatus = &pReader->status;
H
Haojun Liao 已提交
2716

2717 2718 2719 2720 2721
  if (pStatus->loadFromFile) {
    int32_t code = buildBlockFromFiles(pReader);
    if (code != TSDB_CODE_SUCCESS) {
      return false;
    }
2722

2723 2724 2725
    if (pBlock->info.rows > 0) {
      return true;
    } else {
H
Haojun Liao 已提交
2726
      buildBlockFromBufferSequentially(pReader);
2727
      return pBlock->info.rows > 0;
H
Haojun Liao 已提交
2728
    }
2729 2730 2731
  } else {  // no data in files, let's try the buffer
    buildBlockFromBufferSequentially(pReader);
    return pBlock->info.rows > 0;
H
Haojun Liao 已提交
2732
  }
2733

2734
  return false;
H
refact  
Hongze Cheng 已提交
2735 2736
}

2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773
bool tsdbNextDataBlock(STsdbReader* pReader) {
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return false;
  }

  if (pReader->innerReader[0] != NULL) {
    bool ret = doTsdbNextDataBlock(pReader->innerReader[0]);
    if (ret) {
      pReader->step = EXTERNAL_ROWS_PREV;
      return ret;
    }

    tsdbReaderClose(pReader->innerReader[0]);
    pReader->innerReader[0] = NULL;
  }

  pReader->step = EXTERNAL_ROWS_MAIN;
  bool ret = doTsdbNextDataBlock(pReader);
  if (ret) {
    return ret;
  }

  if (pReader->innerReader[1] != NULL) {
    bool ret1 = doTsdbNextDataBlock(pReader->innerReader[1]);
    if (ret1) {
      pReader->step = EXTERNAL_ROWS_NEXT;
      return ret1;
    }

    tsdbReaderClose(pReader->innerReader[1]);
    pReader->innerReader[1] = NULL;
  }

  return false;
}

static void setBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
2774 2775 2776 2777
  ASSERT(pDataBlockInfo != NULL && pReader != NULL);
  pDataBlockInfo->rows = pReader->pResBlock->info.rows;
  pDataBlockInfo->uid = pReader->pResBlock->info.uid;
  pDataBlockInfo->window = pReader->pResBlock->info.window;
H
Hongze Cheng 已提交
2778 2779
}

2780 2781
void tsdbRetrieveDataBlockInfo(STsdbReader* pReader, SDataBlockInfo* pDataBlockInfo) {
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
2782
    if (pReader->step == EXTERNAL_ROWS_MAIN) {
2783
      setBlockInfo(pReader, pDataBlockInfo);
2784
    } else if (pReader->step == EXTERNAL_ROWS_PREV) {
2785 2786 2787 2788 2789 2790 2791 2792 2793
      setBlockInfo(pReader->innerReader[0], pDataBlockInfo);
    } else {
      setBlockInfo(pReader->innerReader[1], pDataBlockInfo);
    }
  } else {
    setBlockInfo(pReader, pDataBlockInfo);
  }
}

2794
int32_t tsdbRetrieveDatablockSMA(STsdbReader* pReader, SColumnDataAgg*** pBlockStatis, bool* allHave) {
H
Hongze Cheng 已提交
2795
  int32_t code = 0;
2796
  *allHave = false;
H
Hongze Cheng 已提交
2797

2798
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
2799 2800 2801 2802
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }

2803
  // there is no statistics data for composed block
2804 2805 2806 2807
  if (pReader->status.composedDataBlock) {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
2808

2809
  SFileDataBlockInfo* pFBlock = getCurrentBlockInfo(&pReader->status.blockIter);
H
Hongze Cheng 已提交
2810

2811
  SBlock* pBlock = getCurrentBlock(&pReader->status.blockIter);
2812
  int64_t stime = taosGetTimestampUs();
H
Hongze Cheng 已提交
2813

2814 2815
  SBlockLoadSuppInfo* pSup = &pReader->suppInfo;

2816
  if (tBlockHasSma(pBlock)) {
2817
    code = tsdbReadBlockSma(pReader->pFileReader, pBlock, pSup->pColAgg, NULL);
2818
    if (code != TSDB_CODE_SUCCESS) {
2819 2820
      tsdbDebug("vgId:%d, failed to load block SMA for uid %" PRIu64 ", code:%s, %s", 0, pFBlock->uid, tstrerror(code),
                pReader->idStr);
2821 2822
      return code;
    }
2823 2824 2825
  } else {
    *pBlockStatis = NULL;
    return TSDB_CODE_SUCCESS;
2826
  }
H
Hongze Cheng 已提交
2827

2828
  *allHave = true;
H
Hongze Cheng 已提交
2829

2830 2831
  // always load the first primary timestamp column data
  SColumnDataAgg* pTsAgg = &pSup->tsColAgg;
2832

2833 2834
  pTsAgg->numOfNull = 0;
  pTsAgg->colId = PRIMARYKEY_TIMESTAMP_COL_ID;
2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850
  pTsAgg->min = pReader->pResBlock->info.window.skey;
  pTsAgg->max = pReader->pResBlock->info.window.ekey;
  pSup->plist[0] = pTsAgg;

  // update the number of NULL data rows
  size_t numOfCols = blockDataGetNumOfCols(pReader->pResBlock);

  int32_t i = 0, j = 0;
  while (j < numOfCols && i < taosArrayGetSize(pSup->pColAgg)) {
    SColumnDataAgg* pAgg = taosArrayGet(pSup->pColAgg, i);
    if (pAgg->colId == pSup->colIds[j]) {
      if (IS_BSMA_ON(&(pReader->pSchema->columns[i]))) {
        pSup->plist[j] = pAgg;
      } else {
        *allHave = false;
      }
2851 2852
      i += 1;
      j += 1;
2853 2854 2855 2856 2857 2858 2859
    } else if (pAgg->colId < pSup->colIds[j]) {
      i += 1;
    } else if (pSup->colIds[j] < pAgg->colId) {
      j += 1;
    }
  }

2860
  double elapsed = (taosGetTimestampUs() - stime) / 1000.0;
2861
  pReader->cost.smaLoadTime += elapsed;
2862
  pReader->cost.smaData += 1;
2863 2864 2865

  *pBlockStatis = pSup->plist;

2866
  tsdbDebug("vgId:%d, succeed to load block SMA for uid %" PRIu64 ", elapsed time:%.2f ms, %s", 0, pFBlock->uid,
2867 2868
            elapsed, pReader->idStr);

H
Hongze Cheng 已提交
2869
  return code;
H
Hongze Cheng 已提交
2870 2871
}

2872
static SArray* doRetrieveDataBlock(STsdbReader* pReader) {
H
Haojun Liao 已提交
2873 2874 2875
  SReaderStatus* pStatus = &pReader->status;

  if (pStatus->composedDataBlock) {
2876
    return pReader->pResBlock->pDataBlock;
2877
  }
2878

2879
  SFileDataBlockInfo*  pFBlock = getCurrentBlockInfo(&pStatus->blockIter);
2880
  STableBlockScanInfo* pBlockScanInfo = taosHashGet(pStatus->pTableMap, &pFBlock->uid, sizeof(pFBlock->uid));
2881

H
Haojun Liao 已提交
2882 2883 2884
  tBlockDataReset(&pStatus->fileBlockData);
  tBlockDataClearData(&pStatus->fileBlockData);
  int32_t code = doLoadFileBlockData(pReader, &pStatus->blockIter, pBlockScanInfo, &pStatus->fileBlockData);
2885
  if (code != TSDB_CODE_SUCCESS) {
H
Hongze Cheng 已提交
2886
    tBlockDataClear(&pStatus->fileBlockData, 1);
H
Haojun Liao 已提交
2887

2888 2889
    terrno = code;
    return NULL;
2890
  }
2891 2892 2893

  copyBlockDataToSDataBlock(pReader, pBlockScanInfo);
  return pReader->pResBlock->pDataBlock;
H
Hongze Cheng 已提交
2894 2895
}

2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907
SArray* tsdbRetrieveDataBlock(STsdbReader* pReader, SArray* pIdList) {
  if (pReader->type == TIMEWINDOW_RANGE_EXTERNAL) {
    if (pReader->step == EXTERNAL_ROWS_PREV) {
      return doRetrieveDataBlock(pReader->innerReader[0]);
    } else if (pReader->step == EXTERNAL_ROWS_NEXT) {
      return doRetrieveDataBlock(pReader->innerReader[1]);
    }
  }

  return doRetrieveDataBlock(pReader);
}

H
Haojun Liao 已提交
2908
int32_t tsdbReaderReset(STsdbReader* pReader, SQueryTableDataCond* pCond) {
2909 2910 2911
  if (isEmptyQueryTimeWindow(&pReader->window)) {
    return TSDB_CODE_SUCCESS;
  }
H
Hongze Cheng 已提交
2912

L
Liu Jicong 已提交
2913
  pReader->order = pCond->order;
2914
  pReader->type = TIMEWINDOW_RANGE_CONTAINED;
2915
  pReader->status.loadFromFile = true;
dengyihao's avatar
dengyihao 已提交
2916
  pReader->status.pTableIter = NULL;
H
Haojun Liao 已提交
2917
  pReader->window = updateQueryTimeWindow(pReader->pTsdb, &pCond->twindows);
H
Hongze Cheng 已提交
2918

2919
  // allocate buffer in order to load data blocks from file
2920
  memset(&pReader->suppInfo.tsColAgg, 0, sizeof(SColumnDataAgg));
2921 2922
  memset(pReader->suppInfo.plist, 0, POINTER_BYTES);

2923
  pReader->suppInfo.tsColAgg.colId = PRIMARYKEY_TIMESTAMP_COL_ID;
2924
  tsdbDataFReaderClose(&pReader->pFileReader);
2925

2926
  int32_t numOfTables = taosHashGetSize(pReader->status.pTableMap);
L
Liu Jicong 已提交
2927 2928
  tsdbDataFReaderClose(&pReader->pFileReader);

H
Hongze Cheng 已提交
2929
  initFilesetIterator(&pReader->status.fileIter, pReader->pReadSnap->fs.aDFileSet, pReader->order, pReader->idStr);
2930
  resetDataBlockIterator(&pReader->status.blockIter, pReader->order, pReader->status.pTableMap);
2931
  resetDataBlockScanInfo(pReader->status.pTableMap);
2932

2933
  int32_t         code = 0;
2934 2935
  SDataBlockIter* pBlockIter = &pReader->status.blockIter;

2936 2937 2938 2939 2940 2941
  // no data in files, let's try buffer in memory
  if (pReader->status.fileIter.numOfFiles == 0) {
    pReader->status.loadFromFile = false;
  } else {
    code = initForFirstBlockInFile(pReader, pBlockIter);
    if (code != TSDB_CODE_SUCCESS) {
2942 2943
      tsdbError("%p reset reader failed, numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s", pReader,
                numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);
2944 2945 2946
      return code;
    }
  }
H
Hongze Cheng 已提交
2947

dengyihao's avatar
dengyihao 已提交
2948 2949
  tsdbDebug("%p reset reader, suid:%" PRIu64 ", numOfTables:%d, query range:%" PRId64 " - %" PRId64 " in query %s",
            pReader, pReader->suid, numOfTables, pReader->window.skey, pReader->window.ekey, pReader->idStr);
2950

2951
  return code;
H
Hongze Cheng 已提交
2952
}
H
Hongze Cheng 已提交
2953

2954 2955 2956
static int32_t getBucketIndex(int32_t startRow, int32_t bucketRange, int32_t numOfRows) {
  return (numOfRows - startRow) / bucketRange;
}
H
Hongze Cheng 已提交
2957

2958 2959 2960 2961
int32_t tsdbGetFileBlocksDistInfo(STsdbReader* pReader, STableBlockDistInfo* pTableBlockInfo) {
  int32_t code = TSDB_CODE_SUCCESS;
  pTableBlockInfo->totalSize = 0;
  pTableBlockInfo->totalRows = 0;
H
Hongze Cheng 已提交
2962

2963 2964
  // find the start data block in file
  SReaderStatus* pStatus = &pReader->status;
H
Hongze Cheng 已提交
2965

2966 2967 2968
  STsdbCfg* pc = &pReader->pTsdb->pVnode->config.tsdbCfg;
  pTableBlockInfo->defMinRows = pc->minRows;
  pTableBlockInfo->defMaxRows = pc->maxRows;
H
Hongze Cheng 已提交
2969

2970
  int32_t bucketRange = ceil((pc->maxRows - pc->minRows) / 20.0);
H
Hongze Cheng 已提交
2971

2972
  pTableBlockInfo->numOfFiles += 1;
H
Hongze Cheng 已提交
2973

2974 2975
  int32_t numOfTables = (int32_t)taosHashGetSize(pStatus->pTableMap);
  int     defaultRows = 4096;
H
Hongze Cheng 已提交
2976

2977 2978
  SDataBlockIter* pBlockIter = &pStatus->blockIter;
  pTableBlockInfo->numOfFiles += pStatus->fileIter.numOfFiles;
H
Haojun Liao 已提交
2979 2980 2981 2982

  if (pBlockIter->numOfBlocks > 0) {
    pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
  }
H
Hongze Cheng 已提交
2983

2984
  pTableBlockInfo->numOfTables = numOfTables;
H
Haojun Liao 已提交
2985
  bool hasNext = (pBlockIter->numOfBlocks > 0);
H
Hongze Cheng 已提交
2986

2987 2988
  while (true) {
    if (hasNext) {
H
Haojun Liao 已提交
2989
      SBlock* pBlock = getCurrentBlock(pBlockIter);
H
Hongze Cheng 已提交
2990

2991 2992
      int32_t numOfRows = pBlock->nRow;
      pTableBlockInfo->totalRows += numOfRows;
H
Hongze Cheng 已提交
2993

2994 2995 2996
      if (numOfRows > pTableBlockInfo->maxRows) {
        pTableBlockInfo->maxRows = numOfRows;
      }
H
refact  
Hongze Cheng 已提交
2997

2998 2999 3000
      if (numOfRows < pTableBlockInfo->minRows) {
        pTableBlockInfo->minRows = numOfRows;
      }
H
refact  
Hongze Cheng 已提交
3001

3002 3003 3004
      if (numOfRows < defaultRows) {
        pTableBlockInfo->numOfSmallBlocks += 1;
      }
H
refact  
Hongze Cheng 已提交
3005

3006 3007
      int32_t bucketIndex = getBucketIndex(pTableBlockInfo->defMinRows, bucketRange, numOfRows);
      pTableBlockInfo->blockRowsHisto[bucketIndex]++;
3008 3009

      hasNext = blockIteratorNext(&pStatus->blockIter);
3010 3011 3012 3013 3014
    } else {
      code = initForFirstBlockInFile(pReader, pBlockIter);
      if ((code != TSDB_CODE_SUCCESS) || (pReader->status.loadFromFile == false)) {
        break;
      }
H
refact  
Hongze Cheng 已提交
3015

3016
      pTableBlockInfo->numOfBlocks += pBlockIter->numOfBlocks;
3017
      hasNext = (pBlockIter->numOfBlocks > 0);
3018
    }
H
refact  
Hongze Cheng 已提交
3019

H
Hongze Cheng 已提交
3020 3021
    //    tsdbDebug("%p %d blocks found in file for %d table(s), fid:%d, %s", pReader, numOfBlocks, numOfTables,
    //              pReader->pFileGroup->fid, pReader->idStr);
3022
  }
H
Hongze Cheng 已提交
3023

H
refact  
Hongze Cheng 已提交
3024 3025
  return code;
}
H
Hongze Cheng 已提交
3026

H
refact  
Hongze Cheng 已提交
3027
int64_t tsdbGetNumOfRowsInMemTable(STsdbReader* pReader) {
3028
  int64_t rows = 0;
H
Hongze Cheng 已提交
3029

3030 3031
  SReaderStatus* pStatus = &pReader->status;
  pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, NULL);
H
Hongze Cheng 已提交
3032

3033 3034 3035 3036 3037
  while (pStatus->pTableIter != NULL) {
    STableBlockScanInfo* pBlockScanInfo = pStatus->pTableIter;

    STbData* d = NULL;
    if (pReader->pTsdb->mem != NULL) {
3038
      tsdbGetTbDataFromMemTable(pReader->pReadSnap->pMem, pReader->suid, pBlockScanInfo->uid, &d);
3039 3040 3041 3042 3043 3044 3045
      if (d != NULL) {
        rows += tsdbGetNRowsInTbData(d);
      }
    }

    STbData* di = NULL;
    if (pReader->pTsdb->imem != NULL) {
3046
      tsdbGetTbDataFromMemTable(pReader->pReadSnap->pIMem, pReader->suid, pBlockScanInfo->uid, &di);
3047 3048 3049 3050 3051 3052 3053 3054
      if (di != NULL) {
        rows += tsdbGetNRowsInTbData(di);
      }
    }

    // current table is exhausted, let's try the next table
    pStatus->pTableIter = taosHashIterate(pStatus->pTableMap, pStatus->pTableIter);
  }
H
Hongze Cheng 已提交
3055

H
refact  
Hongze Cheng 已提交
3056
  return rows;
H
Hongze Cheng 已提交
3057
}
D
dapan1121 已提交
3058

L
Liu Jicong 已提交
3059
int32_t tsdbGetTableSchema(SVnode* pVnode, int64_t uid, STSchema** pSchema, int64_t* suid) {
D
dapan1121 已提交
3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071
  int32_t sversion = 1;

  SMetaReader mr = {0};
  metaReaderInit(&mr, pVnode->pMeta, 0);
  int32_t code = metaGetTableEntryByUid(&mr, uid);
  if (code != TSDB_CODE_SUCCESS) {
    terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
    metaReaderClear(&mr);
    return terrno;
  }

  *suid = 0;
L
Liu Jicong 已提交
3072

D
dapan1121 已提交
3073
  if (mr.me.type == TSDB_CHILD_TABLE) {
D
dapan1121 已提交
3074
    tDecoderClear(&mr.coder);
D
dapan1121 已提交
3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089
    *suid = mr.me.ctbEntry.suid;
    code = metaGetTableEntryByUid(&mr, *suid);
    if (code != TSDB_CODE_SUCCESS) {
      terrno = TSDB_CODE_TDB_INVALID_TABLE_ID;
      metaReaderClear(&mr);
      return terrno;
    }
    sversion = mr.me.stbEntry.schemaRow.version;
  } else {
    ASSERT(mr.me.type == TSDB_NORMAL_TABLE);
    sversion = mr.me.ntbEntry.schemaRow.version;
  }

  metaReaderClear(&mr);
  *pSchema = metaGetTbTSchema(pVnode->pMeta, uid, sversion);
L
Liu Jicong 已提交
3090

D
dapan1121 已提交
3091 3092
  return TSDB_CODE_SUCCESS;
}
H
Hongze Cheng 已提交
3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122

int32_t tsdbTakeReadSnap(STsdb* pTsdb, STsdbReadSnap** ppSnap) {
  int32_t code = 0;

  // alloc
  *ppSnap = (STsdbReadSnap*)taosMemoryCalloc(1, sizeof(STsdbReadSnap));
  if (*ppSnap == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    goto _exit;
  }

  // lock
  code = taosThreadRwlockRdlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _exit;
  }

  // take snapshot
  (*ppSnap)->pMem = pTsdb->mem;
  (*ppSnap)->pIMem = pTsdb->imem;

  if ((*ppSnap)->pMem) {
    tsdbRefMemTable((*ppSnap)->pMem);
  }

  if ((*ppSnap)->pIMem) {
    tsdbRefMemTable((*ppSnap)->pIMem);
  }

H
Hongze Cheng 已提交
3123
  // fs
H
Hongze Cheng 已提交
3124 3125 3126 3127 3128
  code = tsdbFSRef(pTsdb, &(*ppSnap)->fs);
  if (code) {
    taosThreadRwlockUnlock(&pTsdb->rwLock);
    goto _exit;
  }
H
Hongze Cheng 已提交
3129 3130 3131 3132 3133 3134 3135 3136

  // unlock
  code = taosThreadRwlockUnlock(&pTsdb->rwLock);
  if (code) {
    code = TAOS_SYSTEM_ERROR(code);
    goto _exit;
  }

S
Shengliang Guan 已提交
3137
  tsdbTrace("vgId:%d, take read snapshot", TD_VID(pTsdb->pVnode));
H
Hongze Cheng 已提交
3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151
_exit:
  return code;
}

void tsdbUntakeReadSnap(STsdb* pTsdb, STsdbReadSnap* pSnap) {
  if (pSnap) {
    if (pSnap->pMem) {
      tsdbUnrefMemTable(pSnap->pMem);
    }

    if (pSnap->pIMem) {
      tsdbUnrefMemTable(pSnap->pIMem);
    }

H
Hongze Cheng 已提交
3152
    tsdbFSUnref(pTsdb, &pSnap->fs);
H
Hongze Cheng 已提交
3153
    taosMemoryFree(pSnap);
H
Hongze Cheng 已提交
3154
  }
H
Hongze Cheng 已提交
3155

S
Shengliang Guan 已提交
3156
  tsdbTrace("vgId:%d, untake read snapshot", TD_VID(pTsdb->pVnode));
H
Hongze Cheng 已提交
3157
}