queryExecutor.c 206.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#include "os.h"

#include "hash.h"
#include "hashfunc.h"
19
#include "taosmsg.h"
20
#include "tlog.h"
21 22 23 24
#include "tlosertree.h"
#include "tscompression.h"
#include "ttime.h"

H
hjxilinx 已提交
25 26
#include "qast.h"

27
#include "qresultBuf.h"
28 29
#include "queryExecutor.h"
#include "queryUtil.h"
H
hjxilinx 已提交
30 31
#include "query.h"
#include "tsdbMain.h"   //todo use TableId instead of STable object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49

// Default size of an internal buffer. NOTE(review): presumably in bytes; no user is visible in this part of the file.
#define DEFAULT_INTERN_BUF_SIZE 16384L

/**
 * Check whether the primary timestamp column is loaded by default, i.e. whether it is
 * the first entry of the query's column list. If it is not, the program has to load
 * the primary timestamp column explicitly.
 */
#define PRIMARY_TSCOL_LOADED(query) ((query)->colList[0].data.colId == PRIMARYKEY_TIMESTAMP_COL_INDEX)

// True when p and s share at least one set bit. Despite the name, this is an overlap test, not an equality test.
#define Q_STATUS_EQUAL(p, s) (((p) & (s)) != 0)
// True when the TSDB_COL_TAG bit is set in the column flag f.
#define TSDB_COL_IS_TAG(f) (((f)&TSDB_COL_TAG) != 0)
// True when the query order maps to the ascending forward step.
#define QUERY_IS_ASC_QUERY(q) (GET_FORWARD_DIRECTION_FACTOR((q)->order.order) == QUERY_ASC_FORWARD_STEP)

// Scan-flag helpers. IS_MASTER_SCAN compares only the lowest bit of scanFlag with MASTER_SCAN,
// whereas IS_SUPPLEMENT_SCAN compares the whole flag value with SUPPLEMENTARY_SCAN.
#define IS_MASTER_SCAN(runtime) (((runtime)->scanFlag & 1u) == MASTER_SCAN)
#define IS_SUPPLEMENT_SCAN(runtime) ((runtime)->scanFlag == SUPPLEMENTARY_SCAN)
#define SET_SUPPLEMENT_SCAN_FLAG(runtime) ((runtime)->scanFlag = SUPPLEMENTARY_SCAN)
#define SET_MASTER_SCAN_FLAG(runtime) ((runtime)->scanFlag = MASTER_SCAN)

50
#define GET_QINFO_ADDR(x) ((void*)((char *)(x)-offsetof(SQInfo, runtimeEnv)))
51

52
#define GET_COL_DATA_POS(query, index, step) ((query)->pos + (index) * (step))
53 54 55

/* byte width and data type of the source column referenced by the colidx-th select expression */
#define GET_COLUMN_BYTES(query, colidx) \
56
  ((query)->colList[(query)->pSelectExpr[colidx].pBase.colInfo.colIndex].info.bytes)
57
#define GET_COLUMN_TYPE(query, colidx) \
58
  ((query)->colList[(query)->pSelectExpr[colidx].pBase.colInfo.colIndex].info.type)
59 60 61 62 63 64 65 66

/*
 * Row buffers for the data points that surround an interpolation point.
 * NOTE(review): the only visible user is the compiled-out body of getNeighborPoints(),
 * where element [0] of each array is read as a TSKEY; confirm the layout before relying on it.
 */
typedef struct SPointInterpoSupporter {
  int32_t numOfCols;   // presumably the number of entries in each array below -- TODO confirm
  char ** pPrevPoint;  // row located directly before the requested point
  char ** pNextPoint;  // row located at or directly after the requested point
} SPointInterpoSupporter;

/*
 * Execution status flags of a query. Every value occupies its own bit, so several
 * flags can be combined with `|` and tested with Q_STATUS_EQUAL().
 */
typedef enum {
  /* set when the query starts to execute */
  QUERY_NOT_COMPLETED = 0x1u,

  /*
   * The result output buffer is full and the current query is paused.
   * This status only exists for group-by queries and for
   * diff/add/division/multiply queries.
   */
  QUERY_RESBUF_FULL = 0x2u,

  /*
   * The query is over:
   * 1. used by queries that produce a single result row, e.g. count/sum/first/last/avg;
   * 2. also set once all data within the queried time window has been handled.
   */
  QUERY_COMPLETED = 0x4u,

  /*
   * The result has not been completely returned to the client yet; usually used for
   * interval queries with the interpolation option.
   */
  QUERY_OVER = 0x8u,
} vnodeQueryStatus;

static void setQueryStatus(SQuery *pQuery, int8_t status);
88
/* A query is an interval query when its interval time is positive. */
bool isIntervalQuery(SQuery *pQuery) {
  if (pQuery->intervalTime > 0) {
    return true;
  }

  return false;
}
89 90 91 92 93 94 95

// Outcome codes of a timestamp-join comparison.
// NOTE(review): no user of these constants is visible in this part of the file; the
// meaning is inferred from the names only -- confirm against the join code.
enum {
  TS_JOIN_TS_EQUAL = 0,
  TS_JOIN_TS_NOT_EQUALS = 1,
  TS_JOIN_TAG_NOT_EQUALS = 2,
};

96
static int32_t mergeIntoGroupResultImpl(SQInfo *pQInfo, SArray* group);
97 98
static void setWindowResOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, SWindowResult *pResult);

99 100 101
static void resetMergeResultBuf(SQuery *pQuery, SQLFunctionCtx *pCtx, SResultInfo *pResultInfo);
static bool functionNeedToExecute(SQueryRuntimeEnv *pRuntimeEnv, SQLFunctionCtx *pCtx, int32_t functionId);
static void getNextTimeWindow(SQuery *pQuery, STimeWindow *pTimeWindow);
102

103
static void setExecParams(SQuery *pQuery, SQLFunctionCtx *pCtx, void *inputData, TSKEY *tsCol, int32_t size,
104 105 106
                          int32_t functionId, SDataStatis *pStatis, bool hasNull, void *param, int32_t scanFlag);
static void initCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv);
static void destroyMeterQueryInfo(STableQueryInfo *pTableQueryInfo, int32_t numOfCols);
107 108 109 110
static void resetCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv);
static bool hasMainOutput(SQuery *pQuery);
static void createTableDataInfo(SQInfo* pQInfo);

111
static int32_t setAdditionalInfo(SQInfo *pQInfo, STable* pTable, STableQueryInfo *pTableQueryInfo);
112
static int32_t flushFromResultBuf(SQInfo *pQInfo);
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218

/*
 * Locate the data points that surround the point of an interpolation query.
 *
 * NOTE: the whole implementation is compiled out with `#if 0`, so this function
 * currently ignores all of its arguments and always returns true. The disabled body
 * also refers to `pSupporter`, which is not declared in this function, so it cannot be
 * re-enabled without further changes.
 */
bool getNeighborPoints(SQInfo *pQInfo, void *pMeterObj, SPointInterpoSupporter *pPointInterpSupporter) {
#if 0
  SQueryRuntimeEnv *pRuntimeEnv = &pSupporter->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  if (!isPointInterpoQuery(pQuery)) {
    return false;
  }

  /*
   * for interpolate point query, points that are directly before/after the specified point are required
   */
  if (isFirstLastRowQuery(pQuery)) {
    assert(!QUERY_IS_ASC_QUERY(pQuery));
  } else {
    assert(QUERY_IS_ASC_QUERY(pQuery));
  }
  assert(pPointInterpSupporter != NULL && pQuery->skey == pQuery->ekey);

  SCacheBlock *pBlock = NULL;

  qTrace("QInfo:%p get next data point, fileId:%d, slot:%d, pos:%d", GET_QINFO_ADDR(pQuery), pQuery->fileId,
         pQuery->slot, pQuery->pos);

  // save the point that is directly after or equals to the specified point
  getOneRowFromDataBlock(pRuntimeEnv, pPointInterpSupporter->pNextPoint, pQuery->pos);

  /*
   * 1. for last_row query, return immediately.
   * 2. the specified timestamp equals to the required key, interpolation according to neighbor points is not necessary
   *    for interp query.
   */
  TSKEY actualKey = *(TSKEY *)pPointInterpSupporter->pNextPoint[0];
  if (isFirstLastRowQuery(pQuery) || actualKey == pQuery->skey) {
    setQueryStatus(pQuery, QUERY_NOT_COMPLETED);

    /*
     * the retrieved ts may not equals to pMeterObj->lastKey due to cache re-allocation
     * set the pQuery->ekey/pQuery->skey/pQuery->lastKey to be the new value.
     */
    if (pQuery->ekey != actualKey) {
      pQuery->skey = actualKey;
      pQuery->ekey = actualKey;
      pQuery->lastKey = actualKey;
      pSupporter->rawSKey = actualKey;
      pSupporter->rawEKey = actualKey;
    }
    return true;
  }

  /* the qualified point is not the first point in data block */
  if (pQuery->pos > 0) {
    int32_t prevPos = pQuery->pos - 1;

    /* save the point that is directly before the specified point */
    getOneRowFromDataBlock(pRuntimeEnv, pPointInterpSupporter->pPrevPoint, prevPos);
  } else {
    __block_search_fn_t searchFn = vnodeSearchKeyFunc[pMeterObj->searchAlgorithm];

//    savePointPosition(&pRuntimeEnv->startPos, pQuery->fileId, pQuery->slot, pQuery->pos);

    // backwards movement would not set the pQuery->pos correct. We need to set it manually later.
    moveToNextBlock(pRuntimeEnv, QUERY_DESC_FORWARD_STEP, searchFn, true);

    /*
     * no previous data exists.
     * reset the status and load the data block that contains the qualified point
     */
    if (Q_STATUS_EQUAL(pQuery->over, QUERY_NO_DATA_TO_CHECK)) {
      dTrace("QInfo:%p no previous data block, start fileId:%d, slot:%d, pos:%d, qrange:%" PRId64 "-%" PRId64
             ", out of range",
             GET_QINFO_ADDR(pQuery), pRuntimeEnv->startPos.fileId, pRuntimeEnv->startPos.slot,
             pRuntimeEnv->startPos.pos, pQuery->skey, pQuery->ekey);

      // no result, return immediately
      setQueryStatus(pQuery, QUERY_COMPLETED);
      return false;
    } else {  // prev has been located
      if (pQuery->fileId >= 0) {
        pQuery->pos = pQuery->pBlock[pQuery->slot].numOfPoints - 1;
        getOneRowFromDataBlock(pRuntimeEnv, pPointInterpSupporter->pPrevPoint, pQuery->pos);

        qTrace("QInfo:%p get prev data point, fileId:%d, slot:%d, pos:%d, pQuery->pos:%d", GET_QINFO_ADDR(pQuery),
               pQuery->fileId, pQuery->slot, pQuery->pos, pQuery->pos);
      } else {
        // moveToNextBlock make sure there is a available cache block, if exists
        assert(vnodeIsDatablockLoaded(pRuntimeEnv, pMeterObj, -1, true) == DISK_BLOCK_NO_NEED_TO_LOAD);
        pBlock = &pRuntimeEnv->cacheBlock;

        pQuery->pos = pBlock->numOfPoints - 1;
        getOneRowFromDataBlock(pRuntimeEnv, pPointInterpSupporter->pPrevPoint, pQuery->pos);

        qTrace("QInfo:%p get prev data point, fileId:%d, slot:%d, pos:%d, pQuery->pos:%d", GET_QINFO_ADDR(pQuery),
               pQuery->fileId, pQuery->slot, pBlock->numOfPoints - 1, pQuery->pos);
      }
    }
  }

  pQuery->skey = *(TSKEY *)pPointInterpSupporter->pPrevPoint[0];
  pQuery->ekey = *(TSKEY *)pPointInterpSupporter->pNextPoint[0];
  pQuery->lastKey = pQuery->skey;
#endif
  // reached unconditionally while the body above stays disabled
  return true;
}

219
/*
 * Check whether the row at position elemPos passes the column filters of the query.
 *
 * A row is rejected as soon as one filtered column holds a NULL value or is accepted by
 * none of that column's filter elements. Within one column a single accepting filter
 * element is enough.
 *
 * @return true when every filtered column accepts the row
 */
bool vnodeDoFilterData(SQuery *pQuery, int32_t elemPos) {
  for (int32_t col = 0; col < pQuery->numOfFilterCols; ++col) {
    SSingleColumnFilterInfo *pInfo = &pQuery->pFilterInfo[col];
    char *pValue = pInfo->pData + pInfo->info.info.bytes * elemPos;

    if (isNull(pValue, pInfo->info.info.type)) {
      return false;
    }

    // stop scanning the filter elements at the first one that accepts the value
    int32_t numOfFilters = pInfo->numOfFilters;
    bool    accepted = false;
    for (int32_t f = 0; f < numOfFilters && !accepted; ++f) {
      SColumnFilterElem *pFilter = &pInfo->pFilters[f];
      if (pFilter->fp(pFilter, pValue, pValue)) {
        accepted = true;
      }
    }

    if (!accepted) {
      return false;
    }
  }

  return true;
}

246
/*
 * Count the row at `index` as read and decide whether it belongs to the result.
 *
 * A row that passes the column filters is still dropped while the query has a positive
 * remaining offset; each dropped row decreases that offset by one.
 *
 * @param numOfActualRead  counter that is incremented for every row examined
 * @return true when the row passes the filters and the offset is exhausted
 */
bool vnodeFilterData(SQuery *pQuery, int32_t *numOfActualRead, int32_t index) {
  *numOfActualRead += 1;

  if (vnodeDoFilterData(pQuery, index)) {
    if (pQuery->limit.offset <= 0) {
      return true;
    }

    // qualified row, but it is consumed by the offset
    pQuery->limit.offset--;
  }

  return false;
}

/*
 * Return the largest number of results produced by any output column.
 *
 * When the query has a main output, the ts/tag/tagprj columns are ignored, because they
 * cannot decide the number of output rows on their own.
 */
int64_t getNumOfResult(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  bool    mainOutput = hasMainOutput(pQuery);

  int64_t numOfRows = 0;
  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    int32_t fid = pQuery->pSelectExpr[col].pBase.functionId;
    bool    auxiliary = (fid == TSDB_FUNC_TS || fid == TSDB_FUNC_TAG || fid == TSDB_FUNC_TAGPRJ);
    if (mainOutput && auxiliary) {
      continue;
    }

    SResultInfo *pInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[col]);
    if (pInfo == NULL) {
      continue;
    }

    if (numOfRows < pInfo->numOfRes) {
      numOfRows = pInfo->numOfRes;
    }
  }

  assert(numOfRows >= 0);
  return numOfRows;
}

/* Map a group index to its group result id: ids start at 200000 and are 10000 apart. */
static int32_t getGroupResultId(int32_t groupIndex) {
  const int32_t firstId = 200000;
  const int32_t stride = 10000;

  return firstId + groupIndex * stride;
}

/*
 * Tell whether the group-by clause contains a normal (non-tag) column.
 * A NULL or empty group-by expression yields false.
 */
bool isGroupbyNormalCol(SSqlGroupbyExpr *pGroupbyExpr) {
  if (pGroupbyExpr == NULL || pGroupbyExpr->numOfGroupCols == 0) {
    return false;
  }

  for (int32_t i = 0; i < pGroupbyExpr->numOfGroupCols; ++i) {
    SColIndex *pCol = &pGroupbyExpr->columnInfo[i];
    if (pCol->flag != TSDB_COL_NORMAL) {
      continue;
    }

    // with more than one group-by column, the normal column must not have column index 0
    assert(pGroupbyExpr->numOfGroupCols <= 1 || pCol->colIndex > 0);
    return true;
  }

  return false;
}

/*
 * Return the data type of the first normal column in the group-by clause.
 *
 * The column id is taken from the group-by expression and resolved against the column
 * list of the query. TSDB_DATA_TYPE_NULL is returned when there is no normal group-by
 * column or when its id is not part of the column list.
 */
int16_t getGroupbyColumnType(SQuery *pQuery, SSqlGroupbyExpr *pGroupbyExpr) {
  assert(pGroupbyExpr != NULL);

  // -2 marks "no normal column found"
  int32_t targetId = -2;
  for (int32_t g = 0; g < pGroupbyExpr->numOfGroupCols; ++g) {
    SColIndex *pCol = &pGroupbyExpr->columnInfo[g];
    if (pCol->flag == TSDB_COL_NORMAL) {
      targetId = pCol->colId;
      break;
    }
  }

  for (int32_t c = 0; c < pQuery->numOfCols; ++c) {
    if (targetId == pQuery->colList[c].info.colId) {
      return pQuery->colList[c].info.type;
    }
  }

  return TSDB_DATA_TYPE_NULL;
}

/*
 * Tell whether the query combines at least one selectivity function with at least one
 * TSDB_FUNC_TAG_DUMMY / TSDB_FUNC_TS_DUMMY output column.
 */
bool isSelectivityWithTagsQuery(SQuery *pQuery) {
  bool tagSeen = false;
  bool selectivitySeen = false;

  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    int32_t fid = pQuery->pSelectExpr[i].pBase.functionId;

    if (fid == TSDB_FUNC_TAG_DUMMY || fid == TSDB_FUNC_TS_DUMMY) {
      tagSeen = true;
    } else if ((aAggs[fid].nStatus & TSDB_FUNCSTATE_SELECTIVITY) != 0) {
      selectivitySeen = true;
    }
  }

  return selectivitySeen && tagSeen;
}

/* A query is a TS-comp query when its first output expression is TSDB_FUNC_TS_COMP. */
bool isTSCompQuery(SQuery *pQuery) {
  if (pQuery->pSelectExpr[0].pBase.functionId == TSDB_FUNC_TS_COMP) {
    return true;
  }

  return false;
}

/*
 * Trim the rows of the current batch so that the total number of returned rows does
 * not exceed the LIMIT of the query.
 *
 * @return true when the batch was trimmed (the query is then marked QUERY_COMPLETED),
 *         false when there is no positive limit or the limit is not exceeded
 */
bool doRevisedResultsByLimit(SQInfo *pQInfo) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  if (pQuery->limit.limit <= 0) {
    return false;
  }

  if (pQuery->rec.total + pQuery->rec.rows <= pQuery->limit.limit) {
    return false;
  }

  // keep only as many rows as are still missing to reach the limit
  pQuery->rec.rows = pQuery->limit.limit - pQuery->rec.total;
  assert(pQuery->rec.rows > 0);

  setQueryStatus(pQuery, QUERY_COMPLETED);
  return true;
}

/* Tell whether any output column of the query is a top or bottom function. */
static bool isTopBottomQuery(SQuery *pQuery) {
  int32_t numOfOutput = pQuery->numOfOutputCols;

  for (int32_t col = 0; col < numOfOutput; ++col) {
    int32_t fid = pQuery->pSelectExpr[col].pBase.functionId;

    // timestamp columns are skipped, everything else is checked for top/bottom
    if (fid != TSDB_FUNC_TS && (fid == TSDB_FUNC_TOP || fid == TSDB_FUNC_BOTTOM)) {
      return true;
    }
  }

  return false;
}

392
/*
 * Find the statistics entry that belongs to the source column of output expression `index`.
 *
 * The entry is looked up by column id rather than by position, since the file block may
 * be out of date, i.e. the newest table schema may differ from the schema of the block.
 *
 * @return the matching entry of pStatis, or NULL for tag columns (no field info exists
 *         for them) and for columns without a statistics entry
 */
static SDataStatis *getStatisInfo(SQuery *pQuery, SDataStatis *pStatis, SDataBlockInfo *pDataBlockInfo, int32_t index) {
  SColIndex *pCol = &pQuery->pSelectExpr[index].pBase.colInfo;
  if (TSDB_COL_IS_TAG(pCol->flag)) {
    return NULL;
  }

  SDataStatis *pFound = NULL;
  for (int32_t i = 0; i < pDataBlockInfo->numOfCols && pFound == NULL; ++i) {
    if (pStatis[i].colId == pCol->colId) {
      pFound = &pStatis[i];
    }
  }

  return pFound;
}

412 413 414 415 416 417 418 419
/**
 * @param pQuery
 * @param col
 * @param pDataBlockInfo
 * @param pStatis
 * @param pColStatis
 * @return
 */
420
static bool hasNullValue(SQuery *pQuery, int32_t col, SDataBlockInfo *pDataBlockInfo, SDataStatis *pStatis,
421
                         SDataStatis **pColStatis) {
422
  SColIndex* pColIndex = &pQuery->pSelectExpr[col].pBase.colInfo;
423
  if (TSDB_COL_IS_TAG(pColIndex->flag)) {
424 425
    return false;
  }
426 427 428 429 430 431 432 433 434 435 436
  
  // query on primary timestamp column, not null value at all
  if (pColIndex->colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) {
    return false;
  }
  
  *pColStatis = NULL;
  if (pStatis != NULL) {
    *pColStatis = getStatisInfo(pQuery, pStatis, pDataBlockInfo, col);
  }
  
437 438 439
  if ((*pColStatis) != NULL && (*pColStatis)->numOfNull == 0) {
    return false;
  }
440

441 442 443 444 445 446
  return true;
}

/*
 * Look up the window result that belongs to the given key, creating a new one when the
 * key has not been seen before.
 *
 * For a new key the result array is enlarged when it is full, the next free slot is
 * assigned to the key, and the key-to-slot mapping is stored in the hash list. In both
 * cases pWindowResInfo->curIndex is set to the slot of the key.
 *
 * @param pData  key bytes
 * @param bytes  length of the key in bytes
 * @return the window result of the key, or NULL when the result array is full and
 *         cannot be enlarged (out of memory); the existing results stay untouched then
 */
static SWindowResult *doSetTimeWindowFromKey(SQueryRuntimeEnv *pRuntimeEnv, SWindowResInfo *pWindowResInfo, char *pData,
                                             int16_t bytes) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  int32_t *p1 = (int32_t *)taosHashGet(pWindowResInfo->hashList, pData, bytes);
  if (p1 != NULL) {
    pWindowResInfo->curIndex = *p1;
  } else {  // more than the capacity, reallocate the resources
    if (pWindowResInfo->size >= pWindowResInfo->capacity) {
      int64_t oldCap = pWindowResInfo->capacity;
      // doubling a zero capacity would request zero bytes from realloc
      int64_t newCap = (oldCap > 0) ? oldCap * 2 : 1;

      SWindowResult *t = realloc(pWindowResInfo->pResult, newCap * sizeof(SWindowResult));
      if (t == NULL) {
        // the old array is still valid; report the failure instead of writing past its end
        return NULL;
      }

      pWindowResInfo->pResult = t;
      memset(&pWindowResInfo->pResult[oldCap], 0, sizeof(SWindowResult) * (newCap - oldCap));

      for (int32_t i = (int32_t)oldCap; i < newCap; ++i) {
        SPosInfo pos = {-1, -1};
        createQueryResultInfo(pQuery, &pWindowResInfo->pResult[i], pRuntimeEnv->stableQuery, &pos);
      }

      pWindowResInfo->capacity = newCap;
    }

    // add a new result set for a new group
    pWindowResInfo->curIndex = pWindowResInfo->size++;
    taosHashPut(pWindowResInfo->hashList, pData, bytes, (char *)&pWindowResInfo->curIndex, sizeof(int32_t));
  }

  return getWindowResult(pWindowResInfo, pWindowResInfo->curIndex);
}

/*
 * Get the time window that covers the timestamp `ts`.
 *
 * The search starts from the current window of pWindowResInfo, or, when no window is
 * active yet (curIndex == -1), from the window that begins at prevSKey. If that window
 * does not cover ts, its start is moved backwards or forwards in whole multiples of the
 * sliding time until it does.
 */
static STimeWindow getActiveTimeWindow(SWindowResInfo *pWindowResInfo, int64_t ts, SQuery *pQuery) {
  STimeWindow win = {0};

  if (pWindowResInfo->curIndex != -1) {
    int32_t slot = curTimeWindow(pWindowResInfo);
    win = getWindowResult(pWindowResInfo, slot)->window;
  } else {  // the first window, from the previous stored value
    win.skey = pWindowResInfo->prevSKey;
    win.ekey = win.skey + pQuery->intervalTime - 1;
  }

  bool covered = (win.skey <= ts) && (win.ekey >= ts);
  if (!covered) {
    int64_t start = win.skey;

    // window starts after ts: move it back by enough sliding steps
    if (start > ts) {
      start -= ((start - ts + pQuery->slidingTime - 1) / pQuery->slidingTime) * pQuery->slidingTime;
    }

    // window ends before ts: move it forward by enough sliding steps
    int64_t end = start + pQuery->intervalTime - 1;
    if (end < ts) {
      start += ((ts - end + pQuery->slidingTime - 1) / pQuery->slidingTime) * pQuery->slidingTime;
    }

    win.skey = start;
    win.ekey = win.skey + pQuery->intervalTime - 1;
  }

  /*
   * Query border check: skey is used as the index value of the time window and is
   * therefore never bounded by the query time range. Only ekey is adjusted.
   */
  if (QUERY_IS_ASC_QUERY(pQuery) && win.ekey > pQuery->window.ekey) {
    win.ekey = pQuery->window.ekey;
  }

  assert(ts >= win.skey && ts <= win.ekey && win.skey != 0);

  return win;
}

/*
 * Assign a position in the disk-based result buffer to a window result that does not
 * have one yet.
 *
 * The last buffer page of group `sid` is used as long as it holds fewer than
 * numOfRowsPerPage elements; otherwise, or when the group has no page at all, a new
 * page is requested.
 *
 * @return 0 on success or when the window result already has a page, -1 when no page
 *         could be obtained
 */
static int32_t addNewWindowResultBuf(SWindowResult *pWindowRes, SDiskbasedResultBuf *pResultBuf, int32_t sid,
                                     int32_t numOfRowsPerPage) {
  if (pWindowRes->pos.pageId != -1) {
    return 0;
  }

  // in the first scan, new space is needed for the results
  int32_t    pageId = -1;
  tFilePage *pPage = NULL;

  SIDList list = getDataBufPagesIdList(pResultBuf, sid);
  if (list.size != 0) {
    pageId = getLastPageId(&list);
    pPage = getResultBufferPageById(pResultBuf, pageId);

    if (pPage->numOfElems >= numOfRowsPerPage) {  // last page is full
      pPage = getNewDataBuf(pResultBuf, sid, &pageId);

      // number of elements must be 0 for a newly allocated buffer
      assert(pPage == NULL || pPage->numOfElems == 0);
    }
  } else {
    pPage = getNewDataBuf(pResultBuf, sid, &pageId);
  }

  if (pPage == NULL) {
    return -1;
  }

  // reserve the next row of the page for this window result
  if (pWindowRes->pos.pageId == -1) {
    pWindowRes->pos.pageId = pageId;
    pWindowRes->pos.rowId = pPage->numOfElems++;
  }

  return 0;
}

/*
 * Make the result of the given time window the current output target.
 *
 * The window result is looked up (or created) by the start key of the window, gets a
 * result buffer position when it has none, and is then installed as the output buffer
 * of the runtime environment.
 *
 * @return TSDB_CODE_SUCCESS on success, -1 when the window result or its buffer
 *         position could not be obtained
 */
static int32_t setWindowOutputBufByKey(SQueryRuntimeEnv *pRuntimeEnv, SWindowResInfo *pWindowResInfo, int32_t sid,
                                       STimeWindow *win) {
  assert(win->skey <= win->ekey);
  SDiskbasedResultBuf *pBuf = pRuntimeEnv->pResultBuf;

  SWindowResult *pRes = doSetTimeWindowFromKey(pRuntimeEnv, pWindowResInfo, (char *)&win->skey, TSDB_KEYSIZE);
  if (pRes == NULL) {
    return -1;
  }

  // no result buffer assigned yet: add a new one
  if (pRes->pos.pageId == -1 && addNewWindowResultBuf(pRes, pBuf, sid, pRuntimeEnv->numOfRowsPerPage) != 0) {
    return -1;
  }

  // set the time window for the current result
  pRes->window = *win;

  setWindowResOutputBuf(pRuntimeEnv, pRes);
  initCtxOutputBuf(pRuntimeEnv);

  return TSDB_CODE_SUCCESS;
}

/* Return the status of the window result stored in `slot`; the slot must be in [0, size). */
static SWindowStatus *getTimeWindowResStatus(SWindowResInfo *pWindowResInfo, int32_t slot) {
  assert(slot >= 0 && slot < pWindowResInfo->size);

  SWindowResult *pResult = pWindowResInfo->pResult + slot;
  return &pResult->status;
}

/*
 * Count the rows between position `pos` and the position that searchFn reports for
 * `ekey`, following the scan order.
 *
 * The row at the reported position is included only when its timestamp equals ekey.
 * 0 is returned when searchFn reports a negative position.
 *
 * NOTE(review): `pos` is declared int16_t, while getNumOfRowsInTimeWindow() passes an
 * int32_t row index; confirm that block row counts always fit.
 */
static int32_t getForwardStepsInBlock(int32_t numOfPoints, __block_search_fn_t searchFn, TSKEY ekey, int16_t pos,
                                      int16_t order, int64_t *pData) {
  int32_t endPos = searchFn((char *)pData, numOfPoints, ekey, order);
  if (endPos < 0) {
    return 0;
  }

  int32_t steps = 0;
  if (order == TSDB_ORDER_ASC) {
    steps = endPos - pos;
  } else {
    steps = pos - endPos;
  }
  assert(steps >= 0);

  // the element at endPos equals the key, so it has to be read as well
  if (pData[endPos] == ekey) {
    steps += 1;
  }

  return steps;
}

/**
 * Close the time windows that lie completely behind `lastKey` and update the query status.
 *
 * NOTE: the query status is only set while scanFlag equals MASTER_SCAN, and only for
 * interval queries. Nothing happens when no window result exists yet.
 *
 * When lastKey has reached the end of the query range, all windows are closed and the
 * status becomes QUERY_COMPLETED | QUERY_RESBUF_FULL. Otherwise the first window that is
 * still open becomes the current one, and QUERY_RESBUF_FULL is set once the number of
 * closed windows exceeds the threshold.
 */
static void doCheckQueryCompleted(SQueryRuntimeEnv *pRuntimeEnv, TSKEY lastKey, SWindowResInfo *pWindowResInfo) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  if (pRuntimeEnv->scanFlag != MASTER_SCAN || (!isIntervalQuery(pQuery))) {
    return;
  }

  // no qualified results exist, abort check
  if (pWindowResInfo->size == 0) {
    return;
  }

  bool asc = QUERY_IS_ASC_QUERY(pQuery);
  bool rangeDone = asc ? (lastKey >= pQuery->window.ekey) : (lastKey <= pQuery->window.ekey);

  if (rangeDone) {  // query completed
    closeAllTimeWindow(pWindowResInfo);

    pWindowResInfo->curIndex = pWindowResInfo->size - 1;
    setQueryStatus(pQuery, QUERY_COMPLETED | QUERY_RESBUF_FULL);
  } else {  // set the current index to be the last unclosed window
    int32_t idx = 0;
    int64_t openSkey = 0;  // stays 0 while no open window has been found

    for (idx = 0; idx < pWindowResInfo->size; ++idx) {
      SWindowResult *pRes = &pWindowResInfo->pResult[idx];
      if (pRes->status.closed) {
        continue;
      }

      bool passed = asc ? (pRes->window.ekey <= lastKey) : (pRes->window.skey >= lastKey);
      if (!passed) {
        openSkey = pRes->window.skey;
        break;
      }

      closeTimeWindow(pWindowResInfo, idx);
    }

    if (openSkey != 0) {
      pWindowResInfo->curIndex = idx;
    } else {  // all windows are closed, use the last one
      assert(idx == pWindowResInfo->size);
      pWindowResInfo->curIndex = pWindowResInfo->size - 1;
    }

    pWindowResInfo->prevSKey = pWindowResInfo->pResult[pWindowResInfo->curIndex].window.skey;

    // the number of completed slots is larger than the threshold, dump to client immediately
    int32_t numOfClosed = numOfClosedTimeWindow(pWindowResInfo);
    if (numOfClosed > pWindowResInfo->threshold) {
      setQueryStatus(pQuery, QUERY_RESBUF_FULL);
    }

    qTrace("QInfo:%p total window:%d, closed:%d", GET_QINFO_ADDR(pQuery), pWindowResInfo->size, numOfClosed);
  }

  assert(pWindowResInfo->prevSKey != 0);
}

/*
 * Count the rows of the current data block that belong to the time window ending at
 * `ekey`, starting at row `startPos` and following the scan order.
 *
 * When the window reaches up to or beyond the end of the block in scan direction, all
 * remaining rows belong to it. Otherwise the end is located with searchFn.
 *
 * @param updateLastKey  when true and at least one row qualifies, pQuery->lastKey is set
 *                       to the timestamp of the last qualified row plus the scan step
 * @return number of qualified rows, possibly 0
 */
static int32_t getNumOfRowsInTimeWindow(SQuery *pQuery, SDataBlockInfo *pDataBlockInfo, TSKEY *pPrimaryColumn,
                                        int32_t startPos, TSKEY ekey, __block_search_fn_t searchFn,
                                        bool updateLastKey) {
  assert(startPos >= 0 && startPos < pDataBlockInfo->rows);

  int32_t order = pQuery->order.order;
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(order);
  int32_t num = -1;

  if (QUERY_IS_ASC_QUERY(pQuery)) {
    if (ekey >= pDataBlockInfo->window.ekey) {
      // the window covers the rest of the block
      num = pDataBlockInfo->rows - startPos;
      if (updateLastKey) {
        pQuery->lastKey = pDataBlockInfo->window.ekey + step;
      }
    } else {
      num = getForwardStepsInBlock(pDataBlockInfo->rows, searchFn, ekey, startPos, order, pPrimaryColumn);
      if (num == 0) {  // no qualified data in current block, do not update the lastKey value
        assert(ekey < pPrimaryColumn[startPos]);
      } else if (updateLastKey) {
        pQuery->lastKey = pPrimaryColumn[startPos + (num - 1)] + step;
      }
    }
  } else {  // desc
    if (ekey <= pDataBlockInfo->window.skey) {
      // the window covers everything from startPos down to the first row
      num = startPos + 1;
      if (updateLastKey) {
        pQuery->lastKey = pDataBlockInfo->window.skey + step;
      }
    } else {
      num = getForwardStepsInBlock(pDataBlockInfo->rows, searchFn, ekey, startPos, order, pPrimaryColumn);
      if (num == 0) {  // no qualified data in current block, do not update the lastKey value
        assert(ekey > pPrimaryColumn[startPos]);
      } else if (updateLastKey) {
        pQuery->lastKey = pPrimaryColumn[startPos - (num - 1)] + step;
      }
    }
  }

  assert(num >= 0);
  return num;
}

/*
 * Apply every output function of the query to a run of `forwardStep` rows of the
 * current block.
 *
 * Nothing is done unless this is the master scan or the window is already closed.
 * The start offset handed to the functions is always the lowest row position of the
 * run, whatever the scan order is.
 *
 * @param tsBuf  timestamps of the block; handed to selectivity functions as ptsList
 */
static void doBlockwiseApplyFunctions(SQueryRuntimeEnv *pRuntimeEnv, SWindowStatus *pStatus, STimeWindow *pWin,
                                      int32_t startPos, int32_t forwardStep, TSKEY *tsBuf) {
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;

  if (!IS_MASTER_SCAN(pRuntimeEnv) && !pStatus->closed) {
    return;
  }

  for (int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
    SQLFunctionCtx *pCur = &pCtx[k];
    int32_t         fid = pQuery->pSelectExpr[k].pBase.functionId;

    pCur->nStartQueryTimestamp = pWin->skey;
    pCur->size = forwardStep;
    if (QUERY_IS_ASC_QUERY(pQuery)) {
      pCur->startOffset = startPos;
    } else {
      pCur->startOffset = startPos - (forwardStep - 1);
    }

    if ((aAggs[fid].nStatus & TSDB_FUNCSTATE_SELECTIVITY) != 0) {
      pCur->ptsList = &tsBuf[pCur->startOffset];
    }

    if (functionNeedToExecute(pRuntimeEnv, pCur, fid)) {
      aAggs[fid].xFunction(pCur);
    }
  }
}

/*
 * Apply every output function of the query to the single row at `offset`.
 * Nothing is done unless this is the master scan or the window is already closed.
 */
static void doRowwiseApplyFunctions(SQueryRuntimeEnv *pRuntimeEnv, SWindowStatus *pStatus, STimeWindow *pWin,
                                    int32_t offset) {
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;

  if (!IS_MASTER_SCAN(pRuntimeEnv) && !pStatus->closed) {
    return;
  }

  for (int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
    SQLFunctionCtx *pCur = &pCtx[k];
    pCur->nStartQueryTimestamp = pWin->skey;

    int32_t fid = pQuery->pSelectExpr[k].pBase.functionId;
    if (functionNeedToExecute(pRuntimeEnv, pCur, fid)) {
      aAggs[fid].xFunctionF(pCur, offset);
    }
  }
}

/*
 * Advance *pNextWin to the next time window that covers data of the current block.
 *
 * Windows are stepped with getNextTimeWindow() until one of them contains a row of the
 * block. Windows that cover no row are skipped, which may happen when the time window
 * is very small.
 *
 * @return the row position at which the window starts in the block, or -1 when the
 *         current window already passes the end of the query range or the next window
 *         lies outside the current block
 *
 * NOTE(review): the position returned by searchFn is used as an index without a check
 * for a negative value, although getForwardStepsInBlock() does guard against one.
 */
static int32_t getNextQualifiedWindow(SQueryRuntimeEnv *pRuntimeEnv, STimeWindow *pNextWin,
                                      SWindowResInfo *pWindowResInfo, SDataBlockInfo *pDataBlockInfo,
                                      TSKEY *primaryKeys, __block_search_fn_t searchFn) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (;;) {
    // the current window already passes the end of the query range
    bool beyondQuery = QUERY_IS_ASC_QUERY(pQuery) ? (pNextWin->ekey > pQuery->window.ekey)
                                                  : (pNextWin->skey < pQuery->window.ekey);
    if (beyondQuery) {
      return -1;
    }

    getNextTimeWindow(pQuery, pNextWin);

    bool asc = QUERY_IS_ASC_QUERY(pQuery);

    // next time window is not in current block
    bool beyondBlock = asc ? (pNextWin->skey > pDataBlockInfo->window.ekey)
                           : (pNextWin->ekey < pDataBlockInfo->window.skey);
    if (beyondBlock) {
      return -1;
    }

    // the search starts at the window border, bounded by the start of the query range
    TSKEY startKey = asc ? pNextWin->skey : pNextWin->ekey;
    bool  outOfRange = asc ? (startKey < pQuery->window.skey) : (startKey > pQuery->window.skey);
    if (outOfRange) {
      startKey = pQuery->window.skey;
    }

    int32_t startPos = searchFn((char *)primaryKeys, pDataBlockInfo->rows, startKey, pQuery->order.order);

    // this time window does not cover any data, try the next time window
    bool emptyWindow = asc ? (primaryKeys[startPos] > pNextWin->ekey) : (primaryKeys[startPos] < pNextWin->skey);
    if (!emptyWindow) {
      return startPos;
    }
  }
}

/*
 * Return the end key of the window in scan direction, bounded by the end of the query
 * range: the window's ekey capped at the query's ekey for ascending queries, the
 * window's skey raised to the query's ekey for descending ones.
 */
static TSKEY reviseWindowEkey(SQuery *pQuery, STimeWindow *pWindow) {
  if (QUERY_IS_ASC_QUERY(pQuery)) {
    return (pWindow->ekey > pQuery->window.ekey) ? pQuery->window.ekey : pWindow->ekey;
  }

  return (pWindow->skey < pQuery->window.ekey) ? pQuery->window.ekey : pWindow->skey;
}

char *getDataBlocks(SQueryRuntimeEnv *pRuntimeEnv, SArithmeticSupport *sas, int32_t col, int32_t size,
825
                    SArray *pDataBlock) {
826 827
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;
828

829
  char *dataBlock = NULL;
830

831
  int32_t functionId = pQuery->pSelectExpr[col].pBase.functionId;
832

833 834
  if (functionId == TSDB_FUNC_ARITHM) {
    sas->pExpr = &pQuery->pSelectExpr[col];
835

836 837 838 839 840 841
    // set the start offset to be the lowest start position, no matter asc/desc query order
    if (QUERY_IS_ASC_QUERY(pQuery)) {
      pCtx->startOffset = pQuery->pos;
    } else {
      pCtx->startOffset = pQuery->pos - (size - 1);
    }
842

843 844 845
    for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
      SColumnInfo *pColMsg = &pQuery->colList[i].info;
      assert(0);
846 847
      //      char *       pData = doGetDataBlocks(pQuery, pRuntimeEnv->colDataBuffer, pQuery->colList[i].colIdxInBuf);

848
      sas->elemSize[i] = pColMsg->bytes;
849
      //      sas->data[i] = pData + pCtx->startOffset * sas->elemSize[i];  // start from the offset
850
    }
851

852 853 854
    sas->numOfCols = pQuery->numOfCols;
    sas->offset = 0;
  } else {  // other type of query function
855
    SColIndex *pCol = &pQuery->pSelectExpr[col].pBase.colInfo;
856 857 858 859
    if (TSDB_COL_IS_TAG(pCol->flag)) {
      dataBlock = NULL;
    } else {
      /*
860
       *  the colIndex is acquired from the first meter of all qualified meters in this vnode during query prepare stage,
861 862 863 864 865 866
       *  the remain meter may not have the required column in cache actually.
       *  So, the validation of required column in cache with the corresponding meter schema is reinforced.
       */
      if (pDataBlock == NULL) {
        return NULL;
      }
867

868 869
      int32_t numOfCols = taosArrayGetSize(pDataBlock);
      for (int32_t i = 0; i < numOfCols; ++i) {
H
hjxilinx 已提交
870
        SColumnInfoData *p = taosArrayGet(pDataBlock, i);
871 872 873 874 875 876 877
        if (pCol->colId == p->info.colId) {
          dataBlock = p->pData;
          break;
        }
      }
    }
  }
878

879 880 881 882 883 884 885 886 887 888 889 890 891
  return dataBlock;
}

/**
 *
 * @param pRuntimeEnv
 * @param forwardStep
 * @param primaryKeyCol
 * @param pFields
 * @param isDiskFileBlock
 * @return                  the incremental number of output value, so it maybe 0 for fixed number of query,
 *                          such as count/min/max etc.
 */
892
static void blockwiseApplyAllFunctions(SQueryRuntimeEnv *pRuntimeEnv, SDataStatis *pStatis,
893 894 895 896
                                          SDataBlockInfo *pDataBlockInfo, SWindowResInfo *pWindowResInfo,
                                          __block_search_fn_t searchFn, SArray *pDataBlock) {
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;
  SQuery *        pQuery = pRuntimeEnv->pQuery;
897

H
hjxilinx 已提交
898
  SColumnInfoData *pColInfo = NULL;
899 900
  TSKEY *        primaryKeyCol = NULL;

901 902 903 904
  if (pDataBlock != NULL) {
    pColInfo = taosArrayGet(pDataBlock, 0);
    primaryKeyCol = (TSKEY *)(pColInfo->pData);
  }
905

906
  pQuery->pos = QUERY_IS_ASC_QUERY(pQuery) ? 0 : pDataBlockInfo->rows - 1;
907
  SArithmeticSupport *sasArray = calloc((size_t)pQuery->numOfOutputCols, sizeof(SArithmeticSupport));
908

909 910
  for (int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
    int32_t functionId = pQuery->pSelectExpr[k].pBase.functionId;
911

912
    SDataStatis *tpField = NULL;
913 914
    
    bool hasNull = hasNullValue(pQuery, k, pDataBlockInfo, pStatis, &tpField);
915
    char *dataBlock = getDataBlocks(pRuntimeEnv, &sasArray[k], k, pDataBlockInfo->rows, pDataBlock);
916

917
    setExecParams(pQuery, &pCtx[k], dataBlock, primaryKeyCol, pDataBlockInfo->rows, functionId, tpField,
918 919
                  hasNull, &sasArray[k], pRuntimeEnv->scanFlag);
  }
920

921 922 923 924
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
  if (isIntervalQuery(pQuery)) {
    int32_t offset = GET_COL_DATA_POS(pQuery, 0, step);
    TSKEY   ts = primaryKeyCol[offset];
925

926
    STimeWindow win = getActiveTimeWindow(pWindowResInfo, ts, pQuery);
927
    if (setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, pDataBlockInfo->sid, &win) != TSDB_CODE_SUCCESS) {
H
hjxilinx 已提交
928
      return;
929
    }
930

931 932 933
    TSKEY   ekey = reviseWindowEkey(pQuery, &win);
    int32_t forwardStep =
        getNumOfRowsInTimeWindow(pQuery, pDataBlockInfo, primaryKeyCol, pQuery->pos, ekey, searchFn, true);
934

935 936
    SWindowStatus *pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
    doBlockwiseApplyFunctions(pRuntimeEnv, pStatus, &win, pQuery->pos, forwardStep, primaryKeyCol);
937

938 939
    int32_t     index = pWindowResInfo->curIndex;
    STimeWindow nextWin = win;
940

941 942 943 944 945 946
    while (1) {
      int32_t startPos =
          getNextQualifiedWindow(pRuntimeEnv, &nextWin, pWindowResInfo, pDataBlockInfo, primaryKeyCol, searchFn);
      if (startPos < 0) {
        break;
      }
947

948
      // null data, failed to allocate more memory buffer
949
      if (setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, pDataBlockInfo->sid, &nextWin) != TSDB_CODE_SUCCESS) {
950 951
        break;
      }
952

953 954
      ekey = reviseWindowEkey(pQuery, &nextWin);
      forwardStep = getNumOfRowsInTimeWindow(pQuery, pDataBlockInfo, primaryKeyCol, startPos, ekey, searchFn, true);
955

956 957 958
      pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
      doBlockwiseApplyFunctions(pRuntimeEnv, pStatus, &nextWin, startPos, forwardStep, primaryKeyCol);
    }
959

960 961 962 963 964 965 966 967 968 969 970 971 972 973
    pWindowResInfo->curIndex = index;
  } else {
    /*
     * the sqlfunctionCtx parameters should be set done before all functions are invoked,
     * since the selectivity + tag_prj query needs all parameters been set done.
     * tag_prj function are changed to be TSDB_FUNC_TAG_DUMMY
     */
    for (int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
      int32_t functionId = pQuery->pSelectExpr[k].pBase.functionId;
      if (functionNeedToExecute(pRuntimeEnv, &pCtx[k], functionId)) {
        aAggs[functionId].xFunction(&pCtx[k]);
      }
    }
  }
974
  
975 976 977 978 979 980 981
  tfree(sasArray);
}

/*
 * Select the result slot of the group identified by the raw key value pData and make it the
 * current output buffer of all function contexts.
 *
 * Returns TSDB_CODE_SUCCESS on success, -1 when the key is null, when no window result
 * could be obtained for the key, or when a new result buffer could not be added.
 */
static int32_t setGroupResultOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, char *pData, int16_t type, int16_t bytes) {
  const int32_t groupResultId = 1;

  // null group keys are ignored
  if (isNull(pData, type)) {
    return -1;
  }

  SDiskbasedResultBuf *pBuf = pRuntimeEnv->pResultBuf;
  SWindowResult *      pRes = doSetTimeWindowFromKey(pRuntimeEnv, &pRuntimeEnv->windowResInfo, pData, bytes);
  if (pRes == NULL) {
    return -1;
  }

  // the group has no result page assigned yet: attach a new one
  if (pRes->pos.pageId == -1 &&
      addNewWindowResultBuf(pRes, pBuf, groupResultId, pRuntimeEnv->numOfRowsPerPage) != 0) {
    return -1;
  }

  setWindowResOutputBuf(pRuntimeEnv, pRes);
  initCtxOutputBuf(pRuntimeEnv);

  return TSDB_CODE_SUCCESS;
}

1005
/*
 * Report the type and byte width of the first non-tag group-by column through *type and *bytes.
 *
 * The data pointer lookup is currently disabled, so the function always returns NULL and
 * the `data` argument is not consulted.
 */
static UNUSED_FUNC char *getGroupbyColumnData(SQuery *pQuery, SData **data, int16_t *type, int16_t *bytes) {
  SSqlGroupbyExpr *pGroupbyExpr = pQuery->pGroupbyExpr;

  for (int32_t k = 0; k < pGroupbyExpr->numOfGroupCols; ++k) {
    if (pGroupbyExpr->columnInfo[k].flag == TSDB_COL_TAG) {
      continue;
    }

    const int32_t targetId = pGroupbyExpr->columnInfo[k].colId;

    // linear search for the column carrying the group-by column id
    int32_t i = 0;
    while (i < pQuery->numOfCols && pQuery->colList[i].info.colId != targetId) {
      ++i;
    }

    int16_t slot = (i < pQuery->numOfCols) ? (int16_t)i : -1;
    assert(slot >= 0 && slot < pQuery->numOfCols);

    *type = pQuery->colList[slot].info.type;
    *bytes = pQuery->colList[slot].info.bytes;

    // TODO: groupbyColumnData = doGetDataBlocks(pQuery, data, ...) is disabled for now
    break;
  }

  return NULL;
}

/*
 * Compare the row at `offset` of the current input with the current element of the
 * timestamp buffer.
 *
 * Returns TS_JOIN_TAG_NOT_EQUALS when the tag of the first function context differs from
 * the element's tag, TS_JOIN_TS_NOT_EQUALS when the row has not reached the element's
 * timestamp yet (in scan direction), and TS_JOIN_TS_EQUAL otherwise. A row that is already
 * past the element's timestamp trips an assertion.
 */
static int32_t doTSJoinFilter(SQueryRuntimeEnv *pRuntimeEnv, int32_t offset) {
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;
  STSElem         elem = tsBufGetElem(pRuntimeEnv->pTSBuf);

  // compare tag first
  if (pCtx[0].tag.i64Key != elem.tag) {
    return TS_JOIN_TAG_NOT_EQUALS;
  }

  TSKEY key = *(TSKEY *)(pCtx[0].aInputElemBuf + TSDB_KEYSIZE * offset);

#if defined(_DEBUG_VIEW)
  printf("elem in comp ts file:%" PRId64 ", key:%" PRId64
         ", tag:%d, id:%s, query order:%d, ts order:%d, traverse:%d, index:%d\n",
         elem.ts, key, elem.tag, pRuntimeEnv->pTabObj->meterId, pQuery->order.order, pRuntimeEnv->pTSBuf->tsOrder,
         pRuntimeEnv->pTSBuf->cur.order, pRuntimeEnv->pTSBuf->cur.tsIndex);
#endif

  if (key == elem.ts) {
    return TS_JOIN_TS_EQUAL;
  }

  // "behind" means the row still has to catch up with the buffer element in scan direction
  bool behind = QUERY_IS_ASC_QUERY(pQuery) ? (key < elem.ts) : (key > elem.ts);
  if (behind) {
    return TS_JOIN_TS_NOT_EQUALS;
  }

  // the row overtook the timestamp buffer: must not happen
  assert(false);
  return TS_JOIN_TS_EQUAL;
}

/*
 * Decide whether the function bound to pCtx has to run for the current scan.
 *
 * A function is skipped when its result is already complete or when it is one of the dummy
 * tag/ts functions. During the supplementary scan only first/last (and their _dst
 * variants), tag and ts functions are executed.
 */
static bool functionNeedToExecute(SQueryRuntimeEnv *pRuntimeEnv, SQLFunctionCtx *pCtx, int32_t functionId) {
  const bool isDummy = (functionId == TSDB_FUNC_TAG_DUMMY) || (functionId == TSDB_FUNC_TS_DUMMY);

  if (GET_RES_INFO(pCtx)->complete || isDummy) {
    return false;
  }

  if (!IS_SUPPLEMENT_SCAN(pRuntimeEnv)) {
    return true;
  }

  // supplementary scan: only the following functions need to be executed
  return functionId == TSDB_FUNC_LAST_DST || functionId == TSDB_FUNC_FIRST_DST || functionId == TSDB_FUNC_FIRST ||
         functionId == TSDB_FUNC_LAST || functionId == TSDB_FUNC_TAG || functionId == TSDB_FUNC_TS;
}

1091 1092 1093
/**
 * Apply all output functions of the query to one data block, one row at a time. Within this
 * file the row-wise path is chosen by tableApplyFunctionsOnBlock() when column filters, a
 * timestamp join buffer or a group-by on a normal column are present.
 *
 * NOTE(review): several steps (group-by column lookup, filter column binding, window output
 * buffer selection, forward-step reporting) are still stubbed with assert(0); they look like
 * pending work of an ongoing refactoring -- confirm before relying on this path.
 *
 * @param pRuntimeEnv     runtime environment; supplies the query and the function context array
 * @param pStatis         block statistics passed to hasNullValue() for every output column
 * @param pDataBlockInfo  block description; the row count is read here
 * @param pWindowResInfo  time window results, used by interval queries only
 * @param pDataBlock      array of SColumnInfoData; entry 0 supplies the primary timestamp column
 * @return number of results produced by this call for plain queries; 0 for group-by normal
 *         column queries and interval queries
 */
static int32_t rowwiseApplyAllFunctions(SQueryRuntimeEnv *pRuntimeEnv, SDataStatis *pStatis,
                                        SDataBlockInfo *pDataBlockInfo, SWindowResInfo *pWindowResInfo,
                                        SArray *pDataBlock) {
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;
  SQuery *        pQuery = pRuntimeEnv->pQuery;

  /*
   * Entry 0 of the block is a column descriptor; the timestamps live in its pData buffer.
   * Casting the descriptor itself to TSKEY* would read descriptor bytes as timestamps.
   */
  TSKEY *primaryKeyCol = NULL;
  if (pDataBlock != NULL) {
    SColumnInfoData *pTsColInfo = taosArrayGet(pDataBlock, 0);
    primaryKeyCol = (TSKEY *)(pTsColInfo->pData);
  }

  //  SData **data = pRuntimeEnv->colDataBuffer;

  int64_t prevNumOfRes = 0;
  bool    groupbyStateValue = isGroupbyNormalCol(pQuery->pGroupbyExpr);

  if (!groupbyStateValue) {
    prevNumOfRes = getNumOfResult(pRuntimeEnv);
  }

  SArithmeticSupport *sasArray = calloc((size_t)pQuery->numOfOutputCols, sizeof(SArithmeticSupport));

  int16_t type = 0;
  int16_t bytes = 0;

  char *groupbyColumnData = NULL;
  if (groupbyStateValue) {
    assert(0);
    //    groupbyColumnData = getGroupbyColumnData(pQuery, data, &type, &bytes);
  }

  // bind the input data of this block to every function context
  for (int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
    int32_t functionId = pQuery->pSelectExpr[k].pBase.functionId;

    SDataStatis *pColStatis = NULL;

    bool  hasNull = hasNullValue(pQuery, k, pDataBlockInfo, pStatis, &pColStatis);
    char *dataBlock = getDataBlocks(pRuntimeEnv, &sasArray[k], k, pDataBlockInfo->rows, pDataBlock);

    setExecParams(pQuery, &pCtx[k], dataBlock, primaryKeyCol, pDataBlockInfo->rows, functionId, pColStatis,
                  hasNull, &sasArray[k], pRuntimeEnv->scanFlag);
  }

  // set the input column data
  for (int32_t k = 0; k < pQuery->numOfFilterCols; ++k) {
    //    SSingleColumnFilterInfo *pFilterInfo = &pQuery->pFilterInfo[k];
    assert(0);
    /*
     * NOTE: here the tbname/tags column cannot reach here, since it will never be a filter column,
     * so we do NOT check if is a tag or not
     */
    //    pFilterInfo->pData = doGetDataBlocks(pQuery, data, pFilterInfo->info.colIdxInBuf);
  }

  int32_t numOfRes = 0;
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  // from top to bottom in desc
  // from bottom to top in asc order
  if (pRuntimeEnv->pTSBuf != NULL) {
    // GET_QINFO_ADDR expects the address of the embedded runtime environment
    SQInfo *pQInfo = (SQInfo *)GET_QINFO_ADDR(pRuntimeEnv);
    qTrace("QInfo:%p process data rows, numOfRows:%d, query order:%d, ts comp order:%d", pQInfo, pDataBlockInfo->rows,
           pQuery->order.order, pRuntimeEnv->pTSBuf->cur.order);
  }

  int32_t j = 0;
  TSKEY   lastKey = -1;

  for (j = 0; j < pDataBlockInfo->rows; ++j) {
    int32_t offset = GET_COL_DATA_POS(pQuery, j, step);

    if (pRuntimeEnv->pTSBuf != NULL) {
      int32_t r = doTSJoinFilter(pRuntimeEnv, offset);
      if (r == TS_JOIN_TAG_NOT_EQUALS) {
        break;
      } else if (r == TS_JOIN_TS_NOT_EQUALS) {
        continue;
      } else {
        assert(r == TS_JOIN_TS_EQUAL);
      }
    }

    if (pQuery->numOfFilterCols > 0 && (!vnodeDoFilterData(pQuery, offset))) {
      continue;
    }

    // interval window query
    if (isIntervalQuery(pQuery)) {
      // decide the time window according to the primary timestamp
      int64_t     ts = primaryKeyCol[offset];
      STimeWindow win = getActiveTimeWindow(pWindowResInfo, ts, pQuery);

      assert(0);
      int32_t ret = 0;
      //      int32_t ret = setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, pRuntimeEnv->pTabObj->sid, &win);
      if (ret != TSDB_CODE_SUCCESS) {  // null data, too many state code
        continue;
      }

      // all startOffset are identical
      offset -= pCtx[0].startOffset;

      SWindowStatus *pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
      doRowwiseApplyFunctions(pRuntimeEnv, pStatus, &win, offset);

      lastKey = ts;
      STimeWindow nextWin = win;
      int32_t     index = pWindowResInfo->curIndex;
      assert(0);
      int32_t sid = 0;  // pRuntimeEnv->pTabObj->sid;

      // feed the same row to every following window that still covers its timestamp
      while (1) {
        getNextTimeWindow(pQuery, &nextWin);
        if (pWindowResInfo->startTime > nextWin.skey ||
            (nextWin.skey > pQuery->window.ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
            (nextWin.skey > pQuery->window.skey && !QUERY_IS_ASC_QUERY(pQuery))) {
          break;
        }

        if (ts < nextWin.skey || ts > nextWin.ekey) {
          break;
        }

        // null data, failed to allocate more memory buffer
        if (setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, sid, &nextWin) != TSDB_CODE_SUCCESS) {
          break;
        }

        pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
        doRowwiseApplyFunctions(pRuntimeEnv, pStatus, &nextWin, offset);
      }

      pWindowResInfo->curIndex = index;
    } else {  // other queries
      // decide which group this rows belongs to according to current state value
      if (groupbyStateValue) {
        char *stateVal = groupbyColumnData + bytes * offset;

        int32_t ret = setGroupResultOutputBuf(pRuntimeEnv, stateVal, type, bytes);
        if (ret != TSDB_CODE_SUCCESS) {  // null data, too many state code
          continue;
        }
      }

      // update the lastKey
      lastKey = primaryKeyCol[offset];

      // all startOffset are identical
      offset -= pCtx[0].startOffset;

      for (int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
        int32_t functionId = pQuery->pSelectExpr[k].pBase.functionId;
        if (functionNeedToExecute(pRuntimeEnv, &pCtx[k], functionId)) {
          aAggs[functionId].xFunctionF(&pCtx[k], offset);
        }
      }
    }

    if (pRuntimeEnv->pTSBuf != NULL) {
      // if timestamp filter list is empty, quit current query
      if (!tsBufNextPos(pRuntimeEnv->pTSBuf)) {
        setQueryStatus(pQuery, QUERY_COMPLETED);
        break;
      }
    }

    /*
     * pointsOffset is the maximum available space in result buffer update the actual forward step for query that
     * requires checking buffer during loop
     */
    if ((pQuery->checkBuffer == 1) && (++numOfRes) >= pQuery->pointsOffset) {
      pQuery->lastKey = lastKey + step;
      assert(0);
      //      *forwardStep = j + 1;
      break;
    }
  }

  free(sasArray);

  /*
   * No need to calculate the number of output results for group-by normal columns, interval query
   * because the results of group by normal column is put into intermediate buffer.
   */
  int32_t num = 0;
  if (!groupbyStateValue && !isIntervalQuery(pQuery)) {
    num = getNumOfResult(pRuntimeEnv) - prevNumOfRes;
  }

  return num;
}

1279
/*
 * Limit the number of rows to move forward to the space left in the output buffer.
 *
 * The step is returned untouched when
 *  1. value filters exist (all data in the block is tried, QUERY_RESBUF_FULL is not set),
 *  2. the query is a top/bottom/ts_comp query (the function itself checks the capacity), or
 *  3. a timestamp buffer is attached (secondary query of a join, tsBuf acts as ts filter).
 */
static UNUSED_FUNC int32_t reviseForwardSteps(SQueryRuntimeEnv *pRuntimeEnv, int32_t forwardStep) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  const bool keepAsIs = isTopBottomQuery(pQuery) || isTSCompQuery(pQuery) || pQuery->numOfFilterCols > 0 ||
                        pRuntimeEnv->pTSBuf != NULL;

  // current buffer does not have enough space, the remainder is tried in the next loop
  if (!keepAsIs && pQuery->checkBuffer == 1 && pQuery->pointsOffset <= forwardStep) {
    return pQuery->pointsOffset;
  }

  return forwardStep;
}

static int32_t tableApplyFunctionsOnBlock(SQueryRuntimeEnv *pRuntimeEnv, SDataBlockInfo *pDataBlockInfo,
1303
                                          SDataStatis *pStatis, __block_search_fn_t searchFn, SWindowResInfo *pWindowResInfo, SArray *pDataBlock) {
1304

H
hjxilinx 已提交
1305
  SQuery *pQuery = pRuntimeEnv->pQuery;
1306
  
1307
  if (pQuery->numOfFilterCols > 0 || pRuntimeEnv->pTSBuf != NULL || isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
1308
    /*numOfRes = */rowwiseApplyAllFunctions(pRuntimeEnv, pStatis, pDataBlockInfo, pWindowResInfo, pDataBlock);
1309
  } else {
1310
    blockwiseApplyAllFunctions(pRuntimeEnv, pStatis, pDataBlockInfo, pWindowResInfo, searchFn, pDataBlock);
1311
  }
H
hjxilinx 已提交
1312
  
1313
  TSKEY lastKey = QUERY_IS_ASC_QUERY(pQuery)? pDataBlockInfo->window.ekey : pDataBlockInfo->window.skey;
H
hjxilinx 已提交
1314
  pQuery->lastKey = lastKey + GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
1315

1316
  doCheckQueryCompleted(pRuntimeEnv, lastKey, pWindowResInfo);
1317

1318 1319 1320 1321 1322 1323
  // interval query with limit applied
  if (isIntervalQuery(pQuery) && pQuery->limit.limit > 0 &&
      (pQuery->limit.limit + pQuery->limit.offset) <= numOfClosedTimeWindow(pWindowResInfo) &&
      pRuntimeEnv->scanFlag == MASTER_SCAN) {
    setQueryStatus(pQuery, QUERY_COMPLETED);
  }
1324

1325
  int32_t numOfRes = getNumOfResult(pRuntimeEnv);
1326

1327 1328 1329 1330 1331 1332 1333 1334
  // update the number of output result
  if (numOfRes > 0 && pQuery->checkBuffer == 1) {
    assert(numOfRes >= pQuery->rec.rows);
    pQuery->rec.rows = numOfRes;
    
    if (numOfRes >= pQuery->rec.threshold) {
      setQueryStatus(pQuery, QUERY_RESBUF_FULL);
    }
1335
  }
1336

1337
  return numOfRes;
1338 1339
}

1340
/*
 * Bind the input of one data block to a function context.
 *
 * pCtx       context to initialise
 * inputData  column data the function reads (stored as aInputElemBuf)
 * tsCol      primary timestamp column, attached as ptsList for functions that need it
 * size       number of rows in the block
 * pStatis    optional block statistics; copied into preAggVals when not NULL
 * param      stored in param[1].pz for the arithmetic function, unused otherwise
 */
void setExecParams(SQuery *pQuery, SQLFunctionCtx *pCtx, void *inputData, TSKEY *tsCol, int32_t size,
                   int32_t functionId, SDataStatis *pStatis, bool hasNull, void *param, int32_t scanFlag) {
  pCtx->scanFlag = scanFlag;
  pCtx->aInputElemBuf = inputData;
  pCtx->hasNull = hasNull;
  pCtx->startOffset = 0;
  pCtx->size = size;

  if (pStatis == NULL) {
    pCtx->preAggVals.isSet = false;
  } else {
    pCtx->preAggVals.isSet = true;
    pCtx->preAggVals.size = size;
    pCtx->preAggVals.statis = *pStatis;
  }

  const bool isSelectivity = (aAggs[functionId].nStatus & TSDB_FUNCSTATE_SELECTIVITY) != 0;
  const bool isFirstLastDist = (functionId >= TSDB_FUNC_FIRST_DST) && (functionId <= TSDB_FUNC_LAST_DST);
  const bool isRateFamily = (functionId >= TSDB_FUNC_RATE) && (functionId <= TSDB_FUNC_AVG_IRATE);
  const bool isTsConsumer = functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM ||
                            functionId == TSDB_FUNC_TWA || functionId == TSDB_FUNC_DIFF || isRateFamily;

  if (isSelectivity && tsCol != NULL) {
    pCtx->ptsList = tsCol;
  }

  if (isFirstLastDist) {
    /*
     * last_dist or first_dist function: the first&last timestamp is stored in the
     * intermediate buffer; the value may be null but the timestamp never is
     */
    pCtx->ptsList = tsCol;
  } else if (isTsConsumer) {
    /*
     * top/bottom need the timestamp to tell when the top/bottom values emerge,
     * so does the diff function
     */
    if (functionId == TSDB_FUNC_TWA) {
      STwaInfo *pTWAInfo = GET_RES_INFO(pCtx)->interResultBuf;
      pTWAInfo->SKey = pQuery->window.skey;
      pTWAInfo->EKey = pQuery->window.ekey;
    }

    pCtx->ptsList = tsCol;
  } else if (functionId == TSDB_FUNC_ARITHM) {
    pCtx->param[1].pz = param;
  }
}

/*
 * Set the tag output information for a selectivity + tag query.
 *
 * All dummy tag/ts contexts are collected into a list that is attached to the tagInfo of
 * the selectivity function's context (the last one found), together with the number of
 * collected contexts and the sum of their output widths. Nothing is done for other queries.
 */
static void setCtxTagColumnInfo(SQuery *pQuery, SQLFunctionCtx *pCtx) {
  if (!isSelectivityWithTagsQuery(pQuery)) {
    return;
  }

  SQLFunctionCtx **pTagCtx = calloc(pQuery->numOfOutputCols, POINTER_BYTES);
  if (pTagCtx == NULL) {
    // out of memory: leave the tag information unset instead of writing through NULL
    return;
  }

  int32_t         num = 0;
  int16_t         tagLen = 0;
  SQLFunctionCtx *p = NULL;

  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    SSqlFuncExprMsg *pSqlFuncMsg = &pQuery->pSelectExpr[i].pBase;

    if (pSqlFuncMsg->functionId == TSDB_FUNC_TAG_DUMMY || pSqlFuncMsg->functionId == TSDB_FUNC_TS_DUMMY) {
      tagLen += pCtx[i].outputBytes;
      pTagCtx[num++] = &pCtx[i];
    } else if ((aAggs[pSqlFuncMsg->functionId].nStatus & TSDB_FUNCSTATE_SELECTIVITY) != 0) {
      p = &pCtx[i];
    } else if (pSqlFuncMsg->functionId == TSDB_FUNC_TS || pSqlFuncMsg->functionId == TSDB_FUNC_TAG) {
      // tag function may be the group by tag column
      // ts may be the required primary timestamp column
      continue;
    } else {
      // the column may be the normal column, group by normal_column, the functionId is TSDB_FUNC_PRJ
    }
  }

  if (p == NULL) {
    // no selectivity function to own the list: release it rather than dereference NULL
    free(pTagCtx);
    return;
  }

  p->tagInfo.pTagCtxList = pTagCtx;
  p->tagInfo.numOfTagCols = num;
  p->tagInfo.tagsLen = tagLen;
}

/*
 * Set up the result info of every output column by calling setResultInfoBuf() with the
 * intermediate result size (interResBytes) of the corresponding select expression.
 */
static void setWindowResultInfo(SResultInfo *pResultInfo, SQuery *pQuery, bool isStableQuery) {
  int32_t col = 0;

  while (col < pQuery->numOfOutputCols) {
    SResultInfo *pInfo = pResultInfo + col;
    setResultInfoBuf(pInfo, pQuery->pSelectExpr[col].interResBytes, isStableQuery);
    col += 1;
  }
}

1441
/*
 * Allocate and initialise the per-output-column function contexts and result infos of the
 * runtime environment.
 *
 * pRuntimeEnv  environment to set up; pRuntimeEnv->pQuery must already be assigned
 * pTagsSchema  not used by this function
 * order        stored as param[2] of top/bottom/diff functions
 *
 * Returns TSDB_CODE_SUCCESS, or TSDB_CODE_SERV_OUT_OF_MEMORY when the context or result
 * info array cannot be allocated (both are released in that case).
 */
static int32_t setupQueryRuntimeEnv(SQueryRuntimeEnv *pRuntimeEnv, SColumnModel *pTagsSchema, int16_t order) {
  dTrace("QInfo:%p setup runtime env", GET_QINFO_ADDR(pRuntimeEnv));
  SQuery *pQuery = pRuntimeEnv->pQuery;

  pRuntimeEnv->resultInfo = calloc(pQuery->numOfOutputCols, sizeof(SResultInfo));
  pRuntimeEnv->pCtx = (SQLFunctionCtx *)calloc(pQuery->numOfOutputCols, sizeof(SQLFunctionCtx));

  if (pRuntimeEnv->resultInfo == NULL || pRuntimeEnv->pCtx == NULL) {
    goto _error_clean;
  }

  pRuntimeEnv->offset[0] = 0;
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    SSqlFuncExprMsg *pSqlFuncMsg = &pQuery->pSelectExpr[i].pBase;

    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[i];
    pCtx->inputType = GET_COLUMN_TYPE(pQuery, i);
    pCtx->inputBytes = GET_COLUMN_BYTES(pQuery, i);

    pCtx->ptsOutputBuf = NULL;

    pCtx->outputBytes = pQuery->pSelectExpr[i].resBytes;
    pCtx->outputType = pQuery->pSelectExpr[i].resType;

    pCtx->order = pQuery->order.order;
    pCtx->functionId = pSqlFuncMsg->functionId;

    pCtx->numOfParams = pSqlFuncMsg->numOfParams;
    for (int32_t j = 0; j < pCtx->numOfParams; ++j) {
      int16_t type = pSqlFuncMsg->arg[j].argType;
      int16_t bytes = pSqlFuncMsg->arg[j].argBytes;
      if (type == TSDB_DATA_TYPE_BINARY || type == TSDB_DATA_TYPE_NCHAR) {
        // read the j-th argument; `arg->argValue` would always pick the first one
        tVariantCreateFromBinary(&pCtx->param[j], pSqlFuncMsg->arg[j].argValue.pz, bytes, type);
      } else {
        tVariantCreateFromBinary(&pCtx->param[j], (char *)&pSqlFuncMsg->arg[j].argValue.i64, bytes, type);
      }
    }

    // set the order information for top/bottom query
    int32_t functionId = pCtx->functionId;

    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_DIFF) {
      // these functions require the first output column to be a timestamp function
      int32_t f = pQuery->pSelectExpr[0].pBase.functionId;
      assert(f == TSDB_FUNC_TS || f == TSDB_FUNC_TS_DUMMY);

      pCtx->param[2].i64Key = order;
      pCtx->param[2].nType = TSDB_DATA_TYPE_BIGINT;
      pCtx->param[3].i64Key = functionId;
      pCtx->param[3].nType = TSDB_DATA_TYPE_BIGINT;

      pCtx->param[1].i64Key = pQuery->order.orderColId;
    }

    // byte offset of this output column = offset + width of the previous column
    if (i > 0) {
      pRuntimeEnv->offset[i] = pRuntimeEnv->offset[i - 1] + pRuntimeEnv->pCtx[i - 1].outputBytes;
    }
  }

  // set the intermediate result output buffer
  setWindowResultInfo(pRuntimeEnv->resultInfo, pQuery, pRuntimeEnv->stableQuery);

  // if it is group by normal column, do not set output buffer, the output buffer is pResult
  if (!isGroupbyNormalCol(pQuery->pGroupbyExpr) && !pRuntimeEnv->stableQuery) {
    resetCtxOutputBuf(pRuntimeEnv);
  }

  setCtxTagColumnInfo(pQuery, pRuntimeEnv->pCtx);
  return TSDB_CODE_SUCCESS;

_error_clean:
  tfree(pRuntimeEnv->resultInfo);
  tfree(pRuntimeEnv->pCtx);

  return TSDB_CODE_SERV_OUT_OF_MEMORY;
}

/*
 * Release everything owned by the runtime environment: time window info, function contexts
 * (including their parameters, tag variants, tag context lists and intermediate result
 * buffers), interpolation state and buffers, the result buffer, both query handles and the
 * timestamp buffer. Does nothing when no query is attached.
 */
static void teardownQueryRuntimeEnv(SQueryRuntimeEnv *pRuntimeEnv) {
  if (pRuntimeEnv->pQuery == NULL) {
    return;
  }

  SQuery *pQuery = pRuntimeEnv->pQuery;

  // GET_QINFO_ADDR expects the address of the embedded runtime environment, not the query
  dTrace("QInfo:%p teardown runtime env", GET_QINFO_ADDR(pRuntimeEnv));
  cleanupTimeWindowInfo(&pRuntimeEnv->windowResInfo, pQuery->numOfOutputCols);

  if (pRuntimeEnv->pCtx != NULL) {
    for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
      SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[i];

      for (int32_t j = 0; j < pCtx->numOfParams; ++j) {
        tVariantDestroy(&pCtx->param[j]);
      }

      tVariantDestroy(&pCtx->tag);
      tfree(pCtx->tagInfo.pTagCtxList);
      tfree(pRuntimeEnv->resultInfo[i].interResultBuf);
    }

    tfree(pRuntimeEnv->resultInfo);
    tfree(pRuntimeEnv->pCtx);
  }

  taosDestoryInterpoInfo(&pRuntimeEnv->interpoInfo);

  if (pRuntimeEnv->pInterpoBuf != NULL) {
    for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
      tfree(pRuntimeEnv->pInterpoBuf[i]);
    }

    tfree(pRuntimeEnv->pInterpoBuf);
  }

  destroyResultBuf(pRuntimeEnv->pResultBuf);
  tsdbCleanupQueryHandle(pRuntimeEnv->pQueryHandle);
  tsdbCleanupQueryHandle(pRuntimeEnv->pSecQueryHandle);

  // the destroy call's return value is stored back into the pointer
  pRuntimeEnv->pTSBuf = tsBufDestory(pRuntimeEnv->pTSBuf);
}

1561 1562
/*
 * A query counts as killed once its code field holds TSDB_CODE_QUERY_CANCELLED.
 *
 * TODO: the former check for a table that is about to be dropped was compiled out
 * (#if 0) and is not reproduced here.
 */
static bool isQueryKilled(SQInfo *pQInfo) {
  const bool cancelled = (TSDB_CODE_QUERY_CANCELLED == pQInfo->code);
  return cancelled;
}

1578
/* Mark the query as cancelled by storing TSDB_CODE_QUERY_CANCELLED in its code field. */
static void setQueryKilled(SQInfo *pQInfo) {
  pQInfo->code = TSDB_CODE_QUERY_CANCELLED;
}

1582 1583 1584 1585
/*
 * Tell whether the query is a fixed output query.
 *
 * Interval queries never are. Top/bottom queries and group-by on a normal column always
 * are. Otherwise the query qualifies as soon as one output function, other than the
 * timestamp helpers, is not a multi-output function.
 */
bool isFixedOutputQuery(SQuery *pQuery) {
  if (pQuery->intervalTime != 0) {
    return false;
  }

  // Note: top/bottom query is fixed output query
  if (isTopBottomQuery(pQuery) || isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
    return true;
  }

  for (int32_t idx = 0; idx < pQuery->numOfOutputCols; ++idx) {
    SSqlFuncExprMsg *pMsg = &pQuery->pSelectExpr[idx].pBase;

    // the ts_comp function: a leading projection of the primary timestamp column
    const bool isLeadingTsProjection = (idx == 0) && (pMsg->functionId == TSDB_FUNC_PRJ) &&
                                       (pMsg->numOfParams == 1) &&
                                       (pMsg->colInfo.colIndex == PRIMARYKEY_TIMESTAMP_COL_INDEX);
    const bool isTsFunction = (pMsg->functionId == TSDB_FUNC_TS) || (pMsg->functionId == TSDB_FUNC_TS_DUMMY);

    if (isLeadingTsProjection || isTsFunction) {
      continue;
    }

    if (!IS_MULTIOUTPUT(aAggs[pMsg->functionId].nStatus)) {
      return true;
    }
  }

  return false;
}

/* True when any output column uses the interp or the last_row function. */
bool isPointInterpoQuery(SQuery *pQuery) {
  int32_t idx = 0;

  while (idx < pQuery->numOfOutputCols) {
    const int32_t fid = pQuery->pSelectExpr[idx].pBase.functionId;
    if (fid == TSDB_FUNC_INTERP || fid == TSDB_FUNC_LAST_ROW) {
      return true;
    }

    ++idx;
  }

  return false;
}

// TODO REFACTOR: merge with the client-side function
/* True when any output column uses sum_rate, sum_irate, avg_rate or avg_irate. */
bool isSumAvgRateQuery(SQuery *pQuery) {
  for (int32_t idx = 0; idx < pQuery->numOfOutputCols; ++idx) {
    const int32_t fid = pQuery->pSelectExpr[idx].pBase.functionId;

    // timestamp columns are skipped before the rate functions are checked
    const bool isRate = (fid == TSDB_FUNC_SUM_RATE) || (fid == TSDB_FUNC_SUM_IRATE) ||
                        (fid == TSDB_FUNC_AVG_RATE) || (fid == TSDB_FUNC_AVG_IRATE);

    if (fid != TSDB_FUNC_TS && isRate) {
      return true;
    }
  }

  return false;
}

/* Return true when any output expression of the query uses the last_row function. */
bool isFirstLastRowQuery(SQuery *pQuery) {
  int32_t col = 0;

  while (col < pQuery->numOfOutputCols) {
    if (pQuery->pSelectExpr[col].pBase.functionId == TSDB_FUNC_LAST_ROW) {
      return true;
    }

    ++col;
  }

  return false;
}

/*
 * Return true when the query window is unbounded: [0, INT64_MAX] for an
 * ascending query, or [INT64_MAX, 0] for a descending one.
 */
bool notHasQueryTimeRange(SQuery *pQuery) {
  if (QUERY_IS_ASC_QUERY(pQuery)) {
    return (pQuery->window.skey == 0) && (pQuery->window.ekey == INT64_MAX);
  }

  return (pQuery->window.skey == INT64_MAX) && (pQuery->window.ekey == 0);
}

H
hjxilinx 已提交
1657
/*
 * A reverse scan is required when the query contains a last/last_dst function
 * while scanning in ascending order, or a first/first_dst function while
 * scanning in descending order. ts, ts_dummy and tag expressions are ignored.
 */
static bool needReverseScan(SQuery *pQuery) {
  const bool ascending = QUERY_IS_ASC_QUERY(pQuery);

  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    int32_t fid = pQuery->pSelectExpr[col].pBase.functionId;

    if (fid == TSDB_FUNC_TS || fid == TSDB_FUNC_TS_DUMMY || fid == TSDB_FUNC_TAG) {
      continue;
    }

    bool isLast = (fid == TSDB_FUNC_LAST) || (fid == TSDB_FUNC_LAST_DST);
    bool isFirst = (fid == TSDB_FUNC_FIRST) || (fid == TSDB_FUNC_FIRST_DST);

    if ((isLast && ascending) || (isFirst && !ascending)) {
      return true;
    }
  }

  return false;
}
/////////////////////////////////////////////////////////////////////////////////////////////

1674
/*
 * Compute the time window that contains `key` and the effective query range inside it.
 *
 * win->skey is the interval start timestamp obtained for `key`; win->ekey is
 * win->skey + intervalTime - 1, or INT64_MAX when keyFirst is so large that
 * adding the interval would exceed INT64_MAX.
 * *realSkey / *realEkey receive the window clipped to [keyFirst, keyLast].
 */
void getAlignQueryTimeWindow(SQuery *pQuery, int64_t key, int64_t keyFirst, int64_t keyLast,
                                        int64_t *realSkey, int64_t *realEkey, STimeWindow *win) {
  assert(key >= keyFirst && key <= keyLast && pQuery->slidingTime <= pQuery->intervalTime);

  win->skey = taosGetIntervalStartTimestamp(key, pQuery->slidingTime, pQuery->slidingTimeUnit, pQuery->precision);

  if (keyFirst > (INT64_MAX - pQuery->intervalTime)) {
    /*
     * The range [keyFirst, keyLast] spans less than one interval here (asserted below),
     * so the query range is used unchanged and the window is closed at INT64_MAX.
     */
    assert(keyLast - keyFirst < pQuery->intervalTime);

    win->ekey = INT64_MAX;
    *realSkey = keyFirst;
    *realEkey = keyLast;
    return;
  }

  win->ekey = win->skey + pQuery->intervalTime - 1;

  // clip the aligned window to the requested key range
  *realSkey = (win->skey < keyFirst) ? keyFirst : win->skey;
  *realEkey = (win->ekey < keyLast) ? win->ekey : keyLast;
}

1709
/*
 * Placeholder: the former positioning logic is compiled out, so every call
 * reports success regardless of its arguments.
 */
static UNUSED_FUNC bool doGetQueryPos(TSKEY key, SQInfo *pQInfo, SPointInterpoSupporter *pPointInterpSupporter) {
  (void)key;
  (void)pQInfo;
  (void)pPointInterpSupporter;

  return true;
}

1733
static UNUSED_FUNC bool doSetDataInfo(SQInfo *pQInfo, SPointInterpoSupporter *pPointInterpSupporter, void *pMeterObj,
1734
                          TSKEY nextKey) {
1735 1736
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;
1737

1738 1739 1740 1741 1742 1743
  if (isFirstLastRowQuery(pQuery)) {
    /*
     * if the pQuery->window.skey != pQuery->window.ekey for last_row query,
     * the query range is existed, so set them both the value of nextKey
     */
    if (pQuery->window.skey != pQuery->window.ekey) {
1744 1745 1746
      assert(pQuery->window.skey >= pQuery->window.ekey && !QUERY_IS_ASC_QUERY(pQuery) &&
             nextKey >= pQuery->window.ekey && nextKey <= pQuery->window.skey);

1747 1748 1749
      pQuery->window.skey = nextKey;
      pQuery->window.ekey = nextKey;
    }
1750

1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819
    return getNeighborPoints(pQInfo, pMeterObj, pPointInterpSupporter);
  } else {
    return true;
  }
}

// TODO refactor: the best way to implement last_row is to utilize the iterator
/*
 * Placeholder: the former implementation is compiled out, so the function
 * currently always returns true and leaves its arguments untouched.
 */
bool normalizeUnBoundLastRowQuery(SQInfo *pQInfo, SPointInterpoSupporter *pPointInterpSupporter) {
  (void)pQInfo;
  (void)pPointInterpSupporter;

  return true;
}

/*
 * Set pQuery->checkBuffer.
 *
 * It is 0 for top/bottom queries and for group-by-normal-column queries.
 * Otherwise it is 1 only when the query has at least one expression other
 * than ts/ts_dummy and all such expressions are multi-output functions.
 */
static void setScanLimitationByResultBuffer(SQuery *pQuery) {
  if (isTopBottomQuery(pQuery) || isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
    pQuery->checkBuffer = 0;
    return;
  }

  bool onlyMultiOutput = false;

  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    SSqlFuncExprMsg *pExpr = &pQuery->pSelectExpr[col].pBase;

    if (pExpr->functionId == TSDB_FUNC_TS || pExpr->functionId == TSDB_FUNC_TS_DUMMY) {
      continue;
    }

    if (!IS_MULTIOUTPUT(aAggs[pExpr->functionId].nStatus)) {
      // one single-output function is enough to settle the answer
      onlyMultiOutput = false;
      break;
    }

    onlyMultiOutput = true;
  }

  pQuery->checkBuffer = onlyMultiOutput ? 1 : 0;
}

/*
 * Sanity check of the query parameters (todo: add more checks).
 *
 * Currently only verifies that no two adjacent entries of colList share the
 * same column id; logs an error and returns false when they do.
 */
bool vnodeParametersSafetyCheck(SQuery *pQuery) {
  int32_t i = 0;

  while (i < pQuery->numOfCols - 1) {
    bool duplicated = (pQuery->colList[i].info.colId == pQuery->colList[i + 1].info.colId);

    if (duplicated) {
      // the column list to load is invalid
      dError("QInfo:%p invalid data load column for query", GET_QINFO_ADDR(pQuery));
      return false;
    }

    ++i;
  }

  return true;
}

// todo: ignore the avg/sum/min/max/count/stddev/top/bottom functions, for which
// the scan order does not matter
/*
 * Return true when every output expression, apart from ts/ts_dummy/tag/tag_dummy,
 * is either functId or functIdDst.
 */
static bool onlyOneQueryType(SQuery *pQuery, int32_t functId, int32_t functIdDst) {
  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    int32_t fid = pQuery->pSelectExpr[col].pBase.functionId;

    bool ignorable = (fid == TSDB_FUNC_TS) || (fid == TSDB_FUNC_TS_DUMMY) || (fid == TSDB_FUNC_TAG) ||
                     (fid == TSDB_FUNC_TAG_DUMMY);

    if (!ignorable && fid != functId && fid != functIdDst) {
      return false;
    }
  }

  return true;
}

/* True when first/first_dst are the only functions in the query (helper expressions aside). */
static bool onlyFirstQuery(SQuery *pQuery) {
  return onlyOneQueryType(pQuery, TSDB_FUNC_FIRST, TSDB_FUNC_FIRST_DST);
}

/* True when last/last_dst are the only functions in the query (helper expressions aside). */
static bool onlyLastQuery(SQuery *pQuery) {
  return onlyOneQueryType(pQuery, TSDB_FUNC_LAST, TSDB_FUNC_LAST_DST);
}

/*
 * Force the scan order of pQuery to newOrder (TSDB_ORDER_ASC or TSDB_ORDER_DESC).
 *
 * When this reverses the current direction, the change is traced and the two
 * ends of the query window are exchanged. queryType only labels the trace line.
 */
static void forceScanOrder(SQuery *pQuery, int32_t newOrder, const char *queryType) {
  bool isAsc = QUERY_IS_ASC_QUERY(pQuery);
  bool toAsc = (newOrder == TSDB_ORDER_ASC);

  if (isAsc != toAsc) {
    dTrace("QInfo:%p scan order changed for %s query, old:%d, new:%d, qrange exchanged, old qrange:%" PRId64
           "-%" PRId64 ", new qrange:%" PRId64 "-%" PRId64,
           GET_QINFO_ADDR(pQuery), queryType, pQuery->order.order, newOrder, pQuery->window.skey,
           pQuery->window.ekey, pQuery->window.ekey, pQuery->window.skey);

    SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
  }

  pQuery->order.order = newOrder;
}

/*
 * Adjust the scan order (and the orientation of the query window) to the one
 * the selected functions require:
 *
 * - last_row query: always descending, window set to [max key, min key];
 * - point-interpolation query without interval: always ascending;
 * - query made only of first()/last(): ascending/descending respectively.
 *   For interval queries this is applied only when metricQuery is true.
 *
 * todo: handle order-irrelevant function types mixed with order-critical ones.
 *
 * @param pQuery       query whose order.order and window may be rewritten
 * @param metricQuery  true for a super table (metric) query
 */
static void changeExecuteScanOrder(SQuery *pQuery, bool metricQuery) {
  // descending order scan for last_row query
  if (isFirstLastRowQuery(pQuery)) {
    dTrace("QInfo:%p scan order changed for last_row query, old:%d, new:%d", GET_QINFO_ADDR(pQuery),
           pQuery->order.order, TSDB_ORDER_DESC);

    pQuery->order.order = TSDB_ORDER_DESC;

    int64_t lower = MIN(pQuery->window.skey, pQuery->window.ekey);
    int64_t upper = MAX(pQuery->window.skey, pQuery->window.ekey);

    pQuery->window.skey = upper;
    pQuery->window.ekey = lower;
    return;
  }

  // in case of point-interpolation query, use asc order scan
  if (isPointInterpoQuery(pQuery) && pQuery->intervalTime == 0) {
    forceScanOrder(pQuery, TSDB_ORDER_ASC, "interp");
    return;
  }

  bool intervalQuery = (pQuery->intervalTime != 0);

  // for an interval query the order is only forced when it is a super table query
  if (intervalQuery && !metricQuery) {
    return;
  }

  if (onlyFirstQuery(pQuery)) {
    forceScanOrder(pQuery, TSDB_ORDER_ASC, intervalQuery ? "only-first stable" : "only-first");
  } else if (onlyLastQuery(pQuery)) {
    forceScanOrder(pQuery, TSDB_ORDER_DESC, intervalQuery ? "only-last stable" : "only-last");
  }
}

/*
 * Store one interpolation point into pCtx->param[index] as a binary blob laid
 * out as: [TSKEY ts][value].
 *
 * - binary/nchar values are copied as they are (the buffer is zero filled, so
 *   the copied text stays terminated);
 * - float/double values are stored as a double;
 * - integer, bool and timestamp values are stored as an int64_t.
 *
 * The slot must be empty on entry (param[index].pz == NULL). If the buffer
 * cannot be allocated, an error is logged and the slot is left untouched.
 *
 * @param pCtx   function context that owns the parameter slot
 * @param ts     timestamp of the point
 * @param type   TSDB_DATA_TYPE_* of the value pointed to by data
 * @param index  parameter slot to fill
 * @param data   raw value of the point
 */
static void doSetInterpVal(SQLFunctionCtx *pCtx, TSKEY ts, int16_t type, int32_t index, char *data) {
  assert(pCtx->param[index].pz == NULL);

  int32_t len = 0;
  size_t  t = 0;

  if (type == TSDB_DATA_TYPE_BINARY) {
    t = strlen(data);
    len = t + 1 + TSDB_KEYSIZE;
  } else if (type == TSDB_DATA_TYPE_NCHAR) {
    t = wcslen((const wchar_t *)data);
    len = (t + 1) * TSDB_NCHAR_SIZE + TSDB_KEYSIZE;
  } else {
    len = TSDB_KEYSIZE * 2;
  }

  char *z = calloc(1, len);
  if (z == NULL) {
    dError("failed to allocate %d bytes for interpolation value", len);
    return;
  }

  pCtx->param[index].pz = z;
  pCtx->param[index].nType = TSDB_DATA_TYPE_BINARY;

  *(TSKEY *)z = ts;
  z += TSDB_KEYSIZE;

  /*
   * Narrow integer types are read with their own width and then widened, so
   * that no bytes beyond the source value are touched.
   * NOTE(review): widths assume BOOL/TINYINT = 1 byte, SMALLINT = 2, INT = 4,
   * BIGINT/TIMESTAMP = 8 - confirm against the type definitions.
   */
  switch (type) {
    case TSDB_DATA_TYPE_FLOAT:
      *(double *)z = GET_FLOAT_VAL(data);
      break;
    case TSDB_DATA_TYPE_DOUBLE:
      *(double *)z = GET_DOUBLE_VAL(data);
      break;
    case TSDB_DATA_TYPE_BOOL:
    case TSDB_DATA_TYPE_TINYINT:
      *(int64_t *)z = *(int8_t *)data;
      break;
    case TSDB_DATA_TYPE_SMALLINT:
      *(int64_t *)z = *(int16_t *)data;
      break;
    case TSDB_DATA_TYPE_INT:
      *(int64_t *)z = *(int32_t *)data;
      break;
    case TSDB_DATA_TYPE_BIGINT:
    case TSDB_DATA_TYPE_TIMESTAMP:
      *(int64_t *)z = GET_INT64_VAL(data);
      break;
    case TSDB_DATA_TYPE_BINARY:
      memcpy(z, data, t);
      break;
    case TSDB_DATA_TYPE_NCHAR:
      wcsncpy((wchar_t *)z, (const wchar_t *)data, t);
      break;
    default:
      assert(0);
  }

  pCtx->param[index].nLen = len;
}

/**
 * Fill the function contexts of a point-interpolation query with the data
 * collected in the interpolation supporter. Does nothing for other queries.
 *
 * Parameter slots written on each context:
 *   param[0]: the queried timestamp (only when that timestamp has a value)
 *   param[1]: default value / previous point of the specified timestamp
 *   param[2]: next point of the specified timestamp
 *   param[3]: 1 when the queried timestamp has a value, 2 when the result has to be interpolated
 *
 * @param pQInfo               query handle
 * @param pPointInterpSupport  previous/next points around the queried timestamp
 */
void pointInterpSupporterSetData(SQInfo *pQInfo, SPointInterpoSupporter *pPointInterpSupport) {
  SQueryRuntimeEnv *pEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pEnv->pQuery;

  // not point interpolation query, abort
  if (!isPointInterpoQuery(pQuery)) {
    return;
  }

  int32_t count = 1;
  TSKEY   key = *(TSKEY *)pPointInterpSupport->pNextPoint[0];

  // the queried timestamp has value, return it directly without interpolation
  if (key == pQuery->window.skey) {
    for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
      SQLFunctionCtx *pCtx = &pEnv->pCtx[col];

      tVariantCreateFromBinary(&pCtx->param[3], (char *)&count, sizeof(count), TSDB_DATA_TYPE_INT);

      pCtx->param[0].i64Key = key;
      pCtx->param[0].nType = TSDB_DATA_TYPE_BIGINT;
    }

    return;
  }

  // set the direct previous(next) point for process
  count = 2;

  if (pQuery->interpoType == TSDB_INTERPO_SET_VALUE) {
    for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
      SQLFunctionCtx *pCtx = &pEnv->pCtx[col];

      // only the function of interp needs the corresponding information
      if (pCtx->functionId != TSDB_FUNC_INTERP) {
        continue;
      }

      pCtx->numOfParams = 4;

      SInterpInfo *pInterpInfo = (SInterpInfo *)pCtx->aOutputBuf;
      pInterpInfo->pInterpDetail = calloc(1, sizeof(SInterpInfoDetail));

      SInterpInfoDetail *pDetail = pInterpInfo->pInterpDetail;

      // for primary timestamp column, set the flag
      if (pQuery->pSelectExpr[col].pBase.colInfo.colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) {
        pDetail->primaryCol = 1;
      }

      tVariantCreateFromBinary(&pCtx->param[3], (char *)&count, sizeof(count), TSDB_DATA_TYPE_INT);

      // the user supplied default value goes into param[1], or NULL if it is a null value
      if (isNull((char *)&pQuery->defaultVal[col], pCtx->inputType)) {
        pCtx->param[1].nType = TSDB_DATA_TYPE_NULL;
      } else {
        tVariantCreateFromBinary(&pCtx->param[1], (char *)&pQuery->defaultVal[col], pCtx->inputBytes, pCtx->inputType);
      }

      pDetail->ts = pQuery->window.skey;
      pDetail->type = pQuery->interpoType;
    }

    return;
  }

  TSKEY prevKey = *(TSKEY *)pPointInterpSupport->pPrevPoint[0];
  TSKEY nextKey = *(TSKEY *)pPointInterpSupport->pNextPoint[0];

  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    SQLFunctionCtx *pCtx = &pEnv->pCtx[col];

    // tag column does not need the interp environment
    if (pQuery->pSelectExpr[col].pBase.functionId == TSDB_FUNC_TAG) {
      continue;
    }

    // the lookup of the column index in the buffer is disabled; slot 0 is used
    int32_t colInBuf = 0;

    SInterpInfo *pInterpInfo = (SInterpInfo *)pCtx->aOutputBuf;
    pInterpInfo->pInterpDetail = calloc(1, sizeof(SInterpInfoDetail));

    SInterpInfoDetail *pDetail = pInterpInfo->pInterpDetail;

    // the lookup of the column type is disabled, hence the assert below
    int32_t type = 0;
    assert(0);

    // for primary timestamp column, set the flag
    if (pQuery->pSelectExpr[col].pBase.colInfo.colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) {
      pDetail->primaryCol = 1;
    } else {
      doSetInterpVal(pCtx, prevKey, type, 1, pPointInterpSupport->pPrevPoint[colInBuf]);
      doSetInterpVal(pCtx, nextKey, type, 2, pPointInterpSupport->pNextPoint[colInBuf]);
    }

    tVariantCreateFromBinary(&pCtx->param[3], (char *)&count, sizeof(count), TSDB_DATA_TYPE_INT);

    pDetail->ts = pQuery->window.skey;
    pDetail->type = pQuery->interpoType;
  }
}

/*
 * Allocate the buffers of the interpolation supporter for a
 * point-interpolation query; any other query leaves the supporter untouched.
 *
 * Two row buffers (previous point / next point) large enough for one value of
 * every queried column are allocated, and pPrevPoint[i] / pNextPoint[i] are
 * set to the position of column i inside them. Slot 0 therefore holds the
 * start of each row buffer.
 *
 * On allocation failure an error is logged, everything is released and the
 * supporter is left with NULL pointers and numOfCols == 0.
 */
void pointInterpSupporterInit(SQuery *pQuery, SPointInterpoSupporter *pInterpoSupport) {
  if (!isPointInterpoQuery(pQuery)) {
    return;
  }

  /* get appropriated size for one row data source */
  int32_t len = 0;
  for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
    len += pQuery->colList[i].info.bytes;
  }

  pInterpoSupport->pPrevPoint = malloc(pQuery->numOfCols * POINTER_BYTES);
  pInterpoSupport->pNextPoint = malloc(pQuery->numOfCols * POINTER_BYTES);

  // char pointers: arithmetic on void pointers is not standard C
  char *prev = calloc(1, len);
  char *next = calloc(1, len);

  if (pInterpoSupport->pPrevPoint == NULL || pInterpoSupport->pNextPoint == NULL || prev == NULL || next == NULL) {
    dError("failed to allocate memory for point interpolation supporter");

    free(prev);
    free(next);
    free(pInterpoSupport->pPrevPoint);
    free(pInterpoSupport->pNextPoint);

    pInterpoSupport->pPrevPoint = NULL;
    pInterpoSupport->pNextPoint = NULL;
    pInterpoSupport->numOfCols = 0;
    return;
  }

  pInterpoSupport->numOfCols = pQuery->numOfCols;

  int32_t offset = 0;
  for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
    pInterpoSupport->pPrevPoint[i] = prev + offset;
    pInterpoSupport->pNextPoint[i] = next + offset;

    offset += pQuery->colList[i].info.bytes;
  }
}

/*
 * Release the buffers held by the interpolation supporter and reset its
 * column count. Nothing is done when it holds no columns or no point array.
 */
void pointInterpSupporterDestroy(SPointInterpoSupporter *pPointInterpSupport) {
  bool inUse = (pPointInterpSupport->numOfCols > 0) && (pPointInterpSupport->pPrevPoint != NULL);
  if (!inUse) {
    return;
  }

  // slot 0 of each pointer array is released first, then the arrays themselves
  tfree(pPointInterpSupport->pPrevPoint[0]);
  tfree(pPointInterpSupport->pNextPoint[0]);

  tfree(pPointInterpSupport->pPrevPoint);
  tfree(pPointInterpSupport->pNextPoint);

  pPointInterpSupport->numOfCols = 0;
}

2165
/*
 * Placeholder: the former allocation of the interpolation buffers is compiled
 * out, so the function currently does nothing.
 */
static UNUSED_FUNC void allocMemForInterpo(SQInfo *pQInfo, SQuery *pQuery, void *pMeterObj) {
  (void)pQInfo;
  (void)pQuery;
  (void)pMeterObj;
}

/*
 * Initial number of pages requested for the result buffer:
 *   - group by normal column: 128
 *   - interval (time window) query: number of tables in the group info, at least 16
 *   - anything else: 1
 */
static int32_t getInitialPageNum(SQInfo *pQInfo) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;
  int32_t INITIAL_RESULT_ROWS_VALUE = 16;

  int32_t pages = 1;

  if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
    pages = 128;
  } else if (isIntervalQuery(pQuery)) {
    // time window query: one page per table, with a lower bound
    size_t numOfTables = pQInfo->groupInfo.numOfTables;
    pages = MAX(numOfTables, INITIAL_RESULT_ROWS_VALUE);
  }

  assert(pages > 0);
  return pages;
}

/*
 * Row multiplier of a result row: the i64 argument of the second select
 * expression for a top/bottom query that is not a super table query, 1 otherwise.
 */
static int32_t getRowParamForMultiRowsOutput(SQuery *pQuery, bool isSTableQuery) {
  if (isSTableQuery || !isTopBottomQuery(pQuery)) {
    return 1;
  }

  int32_t multiplier = pQuery->pSelectExpr[1].pBase.arg->argValue.i64;
  return multiplier;
}

/*
 * Number of result rows that fit into one internal page of
 * DEFAULT_INTERN_BUF_SIZE bytes once the tFilePage header is subtracted.
 */
static int32_t getNumOfRowsInResultPage(SQuery *pQuery, bool isSTableQuery) {
  int32_t multiplier = getRowParamForMultiRowsOutput(pQuery, isSTableQuery);
  int32_t rowSize = pQuery->rowSize * multiplier;

  return (DEFAULT_INTERN_BUF_SIZE - sizeof(tFilePage)) / rowSize;
}

/*
 * Locate the output position of one column of a window result inside its result page.
 *
 * The address is: page data + offset[columnIndex] * rows-per-page
 *                           + resBytes of the column * (rowId * row multiplier).
 */
char *getPosInResultPage(SQueryRuntimeEnv *pRuntimeEnv, int32_t columnIndex, SWindowResult *pResult) {
  assert(pResult != NULL && pRuntimeEnv != NULL);

  SQuery *   pQuery = pRuntimeEnv->pQuery;
  tFilePage *pPage = getResultBufferPageById(pRuntimeEnv->pResultBuf, pResult->pos.pageId);

  int32_t rowsPerPage = getNumOfRowsInResultPage(pQuery, pRuntimeEnv->stableQuery);
  int32_t rowInPage = pResult->pos.rowId * getRowParamForMultiRowsOutput(pQuery, pRuntimeEnv->stableQuery);

  // start of the area that holds this column within the page
  char *columnStart = ((char *)pPage->data) + pRuntimeEnv->offset[columnIndex] * rowsPerPage;

  return columnStart + pQuery->pSelectExpr[columnIndex].resBytes * rowInPage;
}

H
hjxilinx 已提交
2229
/*
 * Prepare the runtime environment for a super table query.
 *
 * NOTE: this function is under refactoring. The table lookup and the call
 * that sets up the runtime environment are disabled and replaced by
 * assert(0), so it is not usable in builds with assertions enabled.
 *
 * @param pQInfo  query handle
 * @param pQuery  query description; status, rec, order, window and lastKey are updated
 * @param param   stored as the ts-comp buffer (pTSBuf) of the runtime environment, may be NULL
 * @return TSDB_CODE_SUCCESS, or the error code of createDiskbasedResultBuffer
 */
int32_t UNUSED_FUNC vnodeSTableQueryPrepare(SQInfo *pQInfo, SQuery *pQuery, void *param) {
  bool ascending = QUERY_IS_ASC_QUERY(pQuery);
  bool emptyRange = ascending ? (pQuery->window.skey > pQuery->window.ekey)
                              : (pQuery->window.ekey > pQuery->window.skey);

  // the window is reversed with respect to the scan order: nothing can match
  if (emptyRange) {
    dTrace("QInfo:%p no result in time range %" PRId64 "-%" PRId64 ", order %d", pQInfo, pQuery->window.skey,
           pQuery->window.ekey, pQuery->order.order);

    sem_post(&pQInfo->dataReady);
    return TSDB_CODE_SUCCESS;
  }

  pQuery->status = 0;
  pQuery->rec = (SResultRec){0};

  changeExecuteScanOrder(pQuery, true);
  SQueryRuntimeEnv *pEnv = &pQInfo->runtimeEnv;

  /*
   * the output control mechanism is employed in the main loop,
   * so it is disabled during the data block scan procedure.
   */
  setScanLimitationByResultBuffer(pQuery);

  // save raw query range for applying to each subgroup
  pQuery->lastKey = pQuery->window.skey;

  // the lookup of one queried table is disabled
  assert(0);

  pEnv->pTSBuf = param;
  pEnv->cur.vnodeIndex = -1;

  // set the ts-comp file traverse order
  if (param != NULL) {
    int16_t order = (pQuery->order.order == pEnv->pTSBuf->tsOrder) ? TSDB_ORDER_ASC : TSDB_ORDER_DESC;
    tsBufSetTraverseOrder(pEnv->pTSBuf, order);
  }

  // the setup of the query runtime environment and of the table groups is disabled
  assert(0);

  int32_t numOfPages = getInitialPageNum(pQInfo);
  int32_t code = createDiskbasedResultBuffer(&pEnv->pResultBuf, numOfPages, pQuery->rowSize);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  if (pQuery->intervalTime == 0) {
    int16_t type = TSDB_DATA_TYPE_NULL;

    if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
      // group by columns, not tags
      type = getGroupbyColumnType(pQuery, pQuery->pGroupbyExpr);
    } else {
      // group id
      type = TSDB_DATA_TYPE_INT;
    }

    initWindowResInfo(&pEnv->windowResInfo, pEnv, 512, 4096, type);
  }

  pEnv->numOfRowsPerPage = getNumOfRowsInResultPage(pQuery, true);

  STsdbQueryCond cond = {
    .twindow = (STimeWindow) {.skey = pQuery->window.skey, .ekey = pQuery->window.ekey},
    .order = pQuery->order.order,
    .colList = pQuery->colList,
  };

  // collect the columns to load
  SArray *pCols = taosArrayInit(pQuery->numOfCols, sizeof(pQuery->colList[0]));
  for (int32_t col = 0; col < pQuery->numOfCols; ++col) {
    taosArrayPush(pCols, &pQuery->colList[col]);
  }

  pEnv->pQueryHandle = tsdbQueryTables(NULL, &cond, &pQInfo->groupInfo, pCols);

  // metric query does not invoke interpolation, it will be done at the second-stage merge
  if (!isPointInterpoQuery(pQuery)) {
    pQuery->interpoType = TSDB_INTERPO_NONE;
  }

  TSKEY revisedStime = taosGetIntervalStartTimestamp(pQuery->window.skey, pQuery->intervalTime, pQuery->slidingTimeUnit,
                                                     pQuery->precision);
  taosInitInterpoInfo(&pEnv->interpoInfo, pQuery->order.order, revisedStime, 0, 0);
  pEnv->stableQuery = true;

  return TSDB_CODE_SUCCESS;
}

/**
 * Decrease the refcount for each table involved in this query.
 *
 * NOTE: the implementation is compiled out at the moment; the call has no effect.
 *
 * @param pQInfo  query handle
 */
void vnodeDecMeterRefcnt(SQInfo *pQInfo) {
  (void)pQInfo;
}

/*
 * Hand the time range [stime, etime] to every spread function of the query:
 * param[1] receives stime and param[2] receives etime, both as doubles.
 */
void setTimestampRange(SQueryRuntimeEnv *pRuntimeEnv, int64_t stime, int64_t etime) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    if (pQuery->pSelectExpr[col].pBase.functionId != TSDB_FUNC_SPREAD) {
      continue;
    }

    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];

    pCtx->param[1].dKey = stime;
    pCtx->param[1].nType = TSDB_DATA_TYPE_DOUBLE;

    pCtx->param[2].dKey = etime;
    pCtx->param[2].nType = TSDB_DATA_TYPE_DOUBLE;
  }
}

/*
 * Decide whether a data block has to be loaded.
 *
 * NOTE: the statistics based pre-filter is compiled out at the moment, so
 * every block is reported as required, with or without statistics.
 */
static bool needToLoadDataBlock(SQuery *pQuery, SDataStatis *pDataStatis, SQLFunctionCtx *pCtx,
                                int32_t numOfTotalPoints) {
  (void)pQuery;
  (void)pCtx;
  (void)numOfTotalPoints;

  // no statistics available: the block itself must be checked
  if (pDataStatis == NULL) {
    return true;
  }

  return true;
}

// previous time window may not be of the same size of pQuery->intervalTime
/*
 * Move pTimeWindow one sliding step along the scan direction of the query and
 * recompute its end key from the new start key and pQuery->intervalTime.
 */
static void getNextTimeWindow(SQuery *pQuery, STimeWindow *pTimeWindow) {
  int32_t direction = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  // shift the start key by one sliding step (the sign follows the scan direction)
  pTimeWindow->skey = pTimeWindow->skey + (pQuery->slidingTime * direction);

  // the end key is inclusive, hence the "- 1"
  pTimeWindow->ekey = pTimeWindow->skey + (pQuery->intervalTime - 1);
}

2448
/*
 * Decide how much of the current data block is required by the query and load it.
 *
 * The requirement is BLK_DATA_ALL_NEEDED when the query has filter columns, a
 * timestamp buffer (pTSBuf) or is an interval query; otherwise it is the OR of
 * the requirements reported by dataReqFunc of every output function.
 *
 * @param pRuntimeEnv  runtime environment of the query
 * @param pBlockInfo   information (time range, rows) of the current data block
 * @param pStatis      [out] receives the block statistics when they are retrieved
 * @return the loaded data block, or NULL when no data was loaded
 */
SArray *loadDataBlockOnDemand(SQueryRuntimeEnv *pRuntimeEnv, SDataBlockInfo *pBlockInfo, SDataStatis **pStatis) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  uint32_t r = 0;
  SArray * pDataBlock = NULL;

  if (pQuery->numOfFilterCols > 0) {
    r = BLK_DATA_ALL_NEEDED;
  } else {
    for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
      int32_t functionId = pQuery->pSelectExpr[i].pBase.functionId;
      int32_t colId = pQuery->pSelectExpr[i].pBase.colInfo.colId;

      r |= aAggs[functionId].dataReqFunc(&pRuntimeEnv->pCtx[i], pQuery->window.skey, pQuery->window.ekey, colId);
    }

    // pTSBuf is a pointer: test it against NULL instead of the relational "> 0"
    if (pRuntimeEnv->pTSBuf != NULL || isIntervalQuery(pQuery)) {
      r |= BLK_DATA_ALL_NEEDED;
    }
  }

  if (r == BLK_DATA_NO_NEEDED) {
    // the format string used to contain an extra "slot:%d" conversion without a matching argument
    qTrace("QInfo:%p data block ignored, brange:%" PRId64 "-%" PRId64 ", rows:%d",
           GET_QINFO_ADDR(pRuntimeEnv), pBlockInfo->window.skey, pBlockInfo->window.ekey, pBlockInfo->rows);
  } else if (r == BLK_DATA_FILEDS_NEEDED) {
    if (tsdbRetrieveDataBlockStatisInfo(pRuntimeEnv->pQueryHandle, pStatis) != TSDB_CODE_SUCCESS) {
      // todo: the failure is currently ignored (was: return DISK_DATA_LOAD_FAILED)
    }

    // no statistics available for this block, fall back to loading the data
    if (*pStatis == NULL) {
      pDataBlock = tsdbRetrieveDataBlock(pRuntimeEnv->pQueryHandle, NULL);
    }
  } else {
    assert(r == BLK_DATA_ALL_NEEDED);
    if (tsdbRetrieveDataBlockStatisInfo(pRuntimeEnv->pQueryHandle, pStatis) != TSDB_CODE_SUCCESS) {
      // todo: the failure is currently ignored (was: return DISK_DATA_LOAD_FAILED)
    }

    /*
     * If this block is completely included in the query range, apply the value
     * filter conditions on the block statistics.
     * NOTE(review): the result is only used for tracing at the moment; the block is
     * loaded regardless (the "return DISK_DATA_DISCARDED" shortcut is disabled).
     */
    if (!needToLoadDataBlock(pQuery, *pStatis, pRuntimeEnv->pCtx, pBlockInfo->rows)) {
#if defined(_DEBUG_VIEW)
      dTrace("QInfo:%p fileId:%d, slot:%d, block discarded by per-filter", GET_QINFO_ADDR(pQuery), pQuery->fileId,
             pQuery->slot);
#endif
      //        return DISK_DATA_DISCARDED;
    }

    pDataBlock = tsdbRetrieveDataBlock(pRuntimeEnv->pQueryHandle, NULL);
  }

  return pDataBlock;
}

H
hjxilinx 已提交
2504
/*
 * Binary search for key in the sorted timestamp list pValue.
 *
 * @param pValue  buffer that is read as an array of num TSKEY values
 * @param num     number of keys in pValue
 * @param key     the timestamp to look for
 * @param order   TSDB_ORDER_ASC or TSDB_ORDER_DESC
 * @return for TSDB_ORDER_DESC the position of the last key that is not larger than
 *         key (firstPos - 1 when key is smaller than the first probed key); for
 *         TSDB_ORDER_ASC the position of the first key that is not smaller than key,
 *         or -1 if there is none. -1 is also returned when num <= 0.
 */
int32_t binarySearchForKey(char *pValue, int num, TSKEY key, int order) {
  if (num <= 0) {
    return -1;
  }

  assert(order == TSDB_ORDER_ASC || order == TSDB_ORDER_DESC);

  TSKEY * keys = (TSKEY *)pValue;
  int32_t lo = 0;
  int32_t hi = num - 1;
  int32_t mid = -1;

  if (order == TSDB_ORDER_DESC) {
    // look for the last position whose key is not larger than the given key
    for (;;) {
      if (key >= keys[hi]) {
        return hi;
      }

      if (key == keys[lo]) {
        return lo;
      }

      if (key < keys[lo]) {
        return lo - 1;
      }

      mid = lo + ((hi - lo + 1) >> 1);

      if (key < keys[mid]) {
        hi = mid - 1;
      } else if (key > keys[mid]) {
        lo = mid + 1;
      } else {
        return mid;  // exact match
      }
    }
  }

  // ascending order: look for the first position whose key is not smaller than the given key
  for (;;) {
    if (key <= keys[lo]) {
      return lo;
    }

    if (key == keys[hi]) {
      return hi;
    }

    if (key > keys[hi]) {
      // the answer is the element right after hi, if there is one
      int32_t next = hi + 1;
      return (next >= num) ? -1 : next;
    }

    mid = lo + ((hi - lo + 1) >> 1);

    if (key < keys[mid]) {
      hi = mid - 1;
    } else if (key > keys[mid]) {
      lo = mid + 1;
    } else {
      return mid;  // exact match
    }
  }
}

2567 2568
/*
 * Iterate over the data blocks delivered by the query handle of the current
 * scan (master or secondary handle, depending on scanFlag) and apply the query
 * functions on each of them, until the blocks are exhausted, the result buffer
 * is full, or the query is killed.
 *
 * For queries that are neither interval, group-by-normal-column nor fixed-output
 * queries, the output buffers are enlarged before a block is processed so that
 * they can hold all rows of the block.
 *
 * @return always 0
 */
static int64_t doScanAllDataBlocks(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  dTrace("QInfo:%p query start, qrange:%" PRId64 "-%" PRId64 ", lastkey:%" PRId64 ", order:%d",
         GET_QINFO_ADDR(pRuntimeEnv), pQuery->window.skey, pQuery->window.ekey, pQuery->lastKey, pQuery->order.order);

  TsdbQueryHandleT pQueryHandle = pRuntimeEnv->scanFlag == MASTER_SCAN? pRuntimeEnv->pQueryHandle:pRuntimeEnv->pSecQueryHandle;
  while (tsdbNextDataBlock(pQueryHandle)) {
    if (isQueryKilled(GET_QINFO_ADDR(pRuntimeEnv))) {
      return 0;
    }

    SDataBlockInfo blockInfo = tsdbRetrieveDataBlockInfo(pQueryHandle);

    // todo extract methods
    // the first block of an interval query decides the start of the first time window
    if (isIntervalQuery(pQuery) && pRuntimeEnv->windowResInfo.prevSKey == 0) {
      TSKEY skey1, ekey1;
      STimeWindow w = {0};
      SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;

      if (QUERY_IS_ASC_QUERY(pQuery)) {
        getAlignQueryTimeWindow(pQuery, blockInfo.window.skey, blockInfo.window.skey, pQuery->window.ekey,
                                &skey1, &ekey1, &w);
        pWindowResInfo->startTime = w.skey;
        pWindowResInfo->prevSKey = w.skey;
      } else {
        // the start position of the first time window in the endpoint that spreads beyond the queried last timestamp
        TSKEY start = blockInfo.window.ekey - pQuery->intervalTime;
        getAlignQueryTimeWindow(pQuery, start, pQuery->window.ekey, blockInfo.window.ekey, &skey1, &ekey1, &w);

        pWindowResInfo->startTime = pQuery->window.skey;
        pWindowResInfo->prevSKey = w.skey;
      }
    }

    // in case of prj/diff query, ensure the output buffer is sufficient to accommodate the results of current block
    if (!isIntervalQuery(pQuery) && !isGroupbyNormalCol(pQuery->pGroupbyExpr) && !isFixedOutputQuery(pQuery)) {
      SResultRec* pRec = &pQuery->rec;

      if (pQuery->rec.capacity - pQuery->rec.rows < blockInfo.rows) {
        int32_t remain = pRec->capacity - pRec->rows;
        int32_t newSize = pRec->capacity + (blockInfo.rows - remain);

        for(int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
          int32_t bytes = pQuery->pSelectExpr[i].resBytes;

          // compute the size in size_t to avoid an int32_t overflow for large buffers
          char* tmp = realloc(pQuery->sdata[i], (size_t)bytes * (size_t)newSize + sizeof(SData));
          if (tmp == NULL) {
            /*
             * The capacity must not be raised when a column buffer could not be
             * enlarged, otherwise the following writes run past the old buffer.
             * Columns that were already enlarged stay valid (they are only bigger
             * than required), so the scan is simply aborted here.
             * NOTE(review): the caller cannot tell this apart from a normal return;
             * confirm how an out-of-memory condition should be propagated.
             */
            dError("QInfo:%p failed to enlarge the output buffer to %d rows, abort scan",
                   GET_QINFO_ADDR(pRuntimeEnv), newSize);
            return 0;
          }

          pQuery->sdata[i] = (SData*) tmp;

          // set the pCtx output buffer position
          pRuntimeEnv->pCtx[i].aOutputBuf = pQuery->sdata[i]->data + pRec->rows*bytes;
        }

        pRec->capacity = newSize;
      }
    }

    SDataStatis *pStatis = NULL;
    SArray *pDataBlock = loadDataBlockOnDemand(pRuntimeEnv, &blockInfo, &pStatis);
    int32_t numOfRes = tableApplyFunctionsOnBlock(pRuntimeEnv, &blockInfo, pStatis, binarySearchForKey,
                                                  &pRuntimeEnv->windowResInfo, pDataBlock);

    dTrace("QInfo:%p check data block, brange:%" PRId64 "-%" PRId64 ", rows:%d, res:%d",
               GET_QINFO_ADDR(pRuntimeEnv), blockInfo.window.skey, blockInfo.window.ekey, blockInfo.rows, numOfRes);

    // stop scanning once the result buffer is full
    if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL)) {
      break;
    }
  }

  // if the result buffer is not full, set the query completed flag
  if (!Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL)) {
    setQueryStatus(pQuery, QUERY_COMPLETED);
  }

  if (isIntervalQuery(pQuery) && IS_MASTER_SCAN(pRuntimeEnv)) {
    if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
      int32_t step = QUERY_IS_ASC_QUERY(pQuery) ? QUERY_ASC_FORWARD_STEP : QUERY_DESC_FORWARD_STEP;

      closeAllTimeWindow(&pRuntimeEnv->windowResInfo);
      removeRedundantWindow(&pRuntimeEnv->windowResInfo, pQuery->lastKey - step, step);
      pRuntimeEnv->windowResInfo.curIndex = pRuntimeEnv->windowResInfo.size - 1;
    } else {
      assert(Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL));
    }
  }

  return 0;
}

// copy the last key reached by the query into the per-table query info
static void updatelastkey(SQuery *pQuery, STableQueryInfo *pTableQInfo) {
  pTableQInfo->lastKey = pQuery->lastKey;
}

/*
 * Set a tag value in the SQLFunctionCtx,
 * e.g., put the tag information of a table into the tag parameter of a function.
 */
2667
/*
 * Replace the content of param with the value of tag column tagColId of table id:
 * the old content of param is destroyed, the tag value is fetched through
 * tsdbGetTableTagVal, and param is rebuilt from the returned value/bytes/type.
 */
static void doSetTagValueInParam(void* tsdb, STableId id, int32_t tagColId, tVariant *param) {
  int16_t tagType = 0;
  int16_t tagBytes = 0;
  char*   tagVal = NULL;

  // drop the previous content before param is overwritten
  tVariantDestroy(param);

  tsdbGetTableTagVal(tsdb, id, tagColId, &tagType, &tagBytes, &tagVal);
  tVariantCreateFromBinary(param, tagVal, tagBytes, tagType);
}

2678
/*
 * Load the tag values of table id into the function contexts of the query.
 *
 * A query that consists of a single TSDB_FUNC_TS_COMP expression takes the tag
 * column id from the first argument of that expression. Otherwise every output
 * expression that refers to a tag column gets the value of its own tag column.
 */
void setTagVal(SQueryRuntimeEnv *pRuntimeEnv, STableId id, void* tsdb) {
  SQuery *         pQuery = pRuntimeEnv->pQuery;
  SSqlFuncExprMsg *pFirstExpr = &pQuery->pSelectExpr[0].pBase;

  if (pQuery->numOfOutputCols == 1 && pFirstExpr->functionId == TSDB_FUNC_TS_COMP) {
    assert(pFirstExpr->numOfParams == 1);
    doSetTagValueInParam(tsdb, id, pFirstExpr->arg->argValue.i64, &pRuntimeEnv->pCtx[0].tag);
    return;
  }

  // set tag value, by which the results are aggregated.
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    SColIndex *pColIndex = &pQuery->pSelectExpr[i].pBase.colInfo;

    // only tag columns carry a tag value; todo use tag column index to optimize performance
    if (TSDB_COL_IS_TAG(pColIndex->flag)) {
      doSetTagValueInParam(tsdb, id, pColIndex->colId, &pRuntimeEnv->pCtx[i].tag);
    }
  }

  // set the join tag for first column
  bool isTsOnPrimaryKey =
      (pFirstExpr->functionId == TSDB_FUNC_TS) && (pFirstExpr->colInfo.colIndex == PRIMARYKEY_TIMESTAMP_COL_INDEX);

  if (isTsOnPrimaryKey && pRuntimeEnv->pTSBuf != NULL) {
    assert(pFirstExpr->numOfParams == 1);
    assert(0); // to do fix me: setting the join tag is not implemented for this case yet
  }
}

/*
 * Merge one window result into the merge output of every output column.
 *
 * @param pRuntimeEnv  runtime environment that owns the function contexts
 * @param timestamp    stored as nStartQueryTimestamp in every function context
 * @param pWindowRes   the window result that serves as input
 * @param mergeFlag    true: merge into the current output row;
 *                     false: advance to the next output row and re-initialize the
 *                     function before merging
 */
static void doMerge(SQueryRuntimeEnv *pRuntimeEnv, int64_t timestamp, SWindowResult *pWindowRes, bool mergeFlag) {
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *ctxList = pRuntimeEnv->pCtx;

  // first pass: prepare the input/output position of every function context
  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    SQLFunctionCtx *ctx = &ctxList[col];
    int32_t         funcId = pQuery->pSelectExpr[col].pBase.functionId;

    if (!mergeFlag) {
      // start a new output row
      ctx->aOutputBuf = ctx->aOutputBuf + ctx->outputBytes;
      ctx->currentStage = FIRST_STAGE_MERGE;

      resetResultInfo(ctx->resultInfo);
      aAggs[funcId].init(ctx);
    }

    ctx->hasNull = true;
    ctx->nStartQueryTimestamp = timestamp;
    ctx->aInputElemBuf = getPosInResultPage(pRuntimeEnv, col, pWindowRes);

    // in case of tag column, the tag information should be extracted from input buffer
    if (funcId == TSDB_FUNC_TAG_DUMMY || funcId == TSDB_FUNC_TAG) {
      tVariantDestroy(&ctx->tag);
      tVariantCreateFromBinary(&ctx->tag, ctx->aInputElemBuf, ctx->inputBytes, ctx->inputType);
    }
  }

  // second pass: run the merge function of every column except the dummy tag columns
  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    int32_t funcId = pQuery->pSelectExpr[col].pBase.functionId;

    if (funcId != TSDB_FUNC_TAG_DUMMY) {
      aAggs[funcId].distMergeFunc(&ctxList[col]);
    }
  }
}

2748
static UNUSED_FUNC void printBinaryData(int32_t functionId, char *data, int32_t srcDataType) {
2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828
  if (functionId == TSDB_FUNC_FIRST_DST || functionId == TSDB_FUNC_LAST_DST) {
    switch (srcDataType) {
      case TSDB_DATA_TYPE_BINARY:
        printf("%" PRId64 ",%s\t", *(TSKEY *)data, (data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_BOOL:
        printf("%" PRId64 ",%d\t", *(TSKEY *)data, *(int8_t *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_SMALLINT:
        printf("%" PRId64 ",%d\t", *(TSKEY *)data, *(int16_t *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_BIGINT:
      case TSDB_DATA_TYPE_TIMESTAMP:
        printf("%" PRId64 ",%" PRId64 "\t", *(TSKEY *)data, *(TSKEY *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_INT:
        printf("%" PRId64 ",%d\t", *(TSKEY *)data, *(int32_t *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_FLOAT:
        printf("%" PRId64 ",%f\t", *(TSKEY *)data, *(float *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_DOUBLE:
        printf("%" PRId64 ",%lf\t", *(TSKEY *)data, *(double *)(data + TSDB_KEYSIZE + 1));
        break;
    }
  } else if (functionId == TSDB_FUNC_AVG) {
    printf("%lf,%d\t", *(double *)data, *(int32_t *)(data + sizeof(double)));
  } else if (functionId == TSDB_FUNC_SPREAD) {
    printf("%lf,%lf\t", *(double *)data, *(double *)(data + sizeof(double)));
  } else if (functionId == TSDB_FUNC_TWA) {
    data += 1;
    printf("%lf,%" PRId64 ",%" PRId64 ",%" PRId64 "\t", *(double *)data, *(int64_t *)(data + 8),
           *(int64_t *)(data + 16), *(int64_t *)(data + 24));
  } else if (functionId == TSDB_FUNC_MIN || functionId == TSDB_FUNC_MAX) {
    switch (srcDataType) {
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_BOOL:
        printf("%d\t", *(int8_t *)data);
        break;
      case TSDB_DATA_TYPE_SMALLINT:
        printf("%d\t", *(int16_t *)data);
        break;
      case TSDB_DATA_TYPE_BIGINT:
      case TSDB_DATA_TYPE_TIMESTAMP:
        printf("%" PRId64 "\t", *(int64_t *)data);
        break;
      case TSDB_DATA_TYPE_INT:
        printf("%d\t", *(int *)data);
        break;
      case TSDB_DATA_TYPE_FLOAT:
        printf("%f\t", *(float *)data);
        break;
      case TSDB_DATA_TYPE_DOUBLE:
        printf("%f\t", *(float *)data);
        break;
    }
  } else if (functionId == TSDB_FUNC_SUM) {
    if (srcDataType == TSDB_DATA_TYPE_FLOAT || srcDataType == TSDB_DATA_TYPE_DOUBLE) {
      printf("%lf\t", *(float *)data);
    } else {
      printf("%" PRId64 "\t", *(int64_t *)data);
    }
  } else {
    printf("%s\t", data);
  }
}

/*
 * Debug helper that prints the intermediate results of a super table query.
 * The whole body is compiled out (#if 0), so the function currently does nothing.
 */
void UNUSED_FUNC displayInterResult(SData **pdata, SQuery *pQuery, int32_t numOfRows) {
#if 0
  int32_t numOfCols = pQuery->numOfOutputCols;
  printf("super table query intermediate result, total:%d\n", numOfRows);

  SQInfo *   pQInfo = (SQInfo *)(GET_QINFO_ADDR(pQuery));
  SMeterObj *pMeterObj = pQInfo->pObj;

  for (int32_t row = 0; row < numOfRows; ++row) {
    for (int32_t col = 0; col < numOfCols; ++col) {
      // start of the cell (row, col) in the column-oriented output buffer
      char *cell = pdata[col]->data + pQuery->pSelectExpr[col].resBytes * row;

      switch (pQuery->pSelectExpr[col].resType) {
        case TSDB_DATA_TYPE_BINARY: {
          int32_t colIndex = pQuery->pSelectExpr[col].pBase.colInfo.colIndex;
          int32_t type = 0;

          if (TSDB_COL_IS_TAG(pQuery->pSelectExpr[col].pBase.colInfo.flag)) {
            type = pQuery->pSelectExpr[col].resType;
          } else {
            type = pMeterObj->schema[colIndex].type;
          }

          printBinaryData(pQuery->pSelectExpr[col].pBase.functionId, cell, type);
          break;
        }
        case TSDB_DATA_TYPE_TIMESTAMP:
        case TSDB_DATA_TYPE_BIGINT:
          printf("%" PRId64 "\t", *(int64_t *)cell);
          break;
        case TSDB_DATA_TYPE_INT:
          printf("%d\t", *(int32_t *)cell);
          break;
        case TSDB_DATA_TYPE_FLOAT:
          printf("%f\t", *(float *)cell);
          break;
        case TSDB_DATA_TYPE_DOUBLE:
          printf("%lf\t", *(double *)cell);
          break;
      }
    }
    printf("\n");
  }
#endif
}

typedef struct SCompSupporter {
2862 2863 2864
  STableDataInfo **pTableDataInfo;
  int32_t *        position;
  SQInfo *         pQInfo;
2865 2866 2867 2868 2869
} SCompSupporter;

int32_t tableResultComparFn(const void *pLeft, const void *pRight, void *param) {
  int32_t left = *(int32_t *)pLeft;
  int32_t right = *(int32_t *)pRight;
2870

2871 2872
  SCompSupporter *  supporter = (SCompSupporter *)param;
  SQueryRuntimeEnv *pRuntimeEnv = &supporter->pQInfo->runtimeEnv;
2873

2874 2875
  int32_t leftPos = supporter->position[left];
  int32_t rightPos = supporter->position[right];
2876

2877 2878 2879 2880
  /* left source is exhausted */
  if (leftPos == -1) {
    return 1;
  }
2881

2882 2883 2884 2885
  /* right source is exhausted*/
  if (rightPos == -1) {
    return -1;
  }
2886

2887 2888
  SWindowResInfo *pWindowResInfo1 = &supporter->pTableDataInfo[left]->pTableQInfo->windowResInfo;
  SWindowResult * pWindowRes1 = getWindowResult(pWindowResInfo1, leftPos);
2889

2890 2891
  char *b1 = getPosInResultPage(pRuntimeEnv, PRIMARYKEY_TIMESTAMP_COL_INDEX, pWindowRes1);
  TSKEY leftTimestamp = GET_INT64_VAL(b1);
2892

2893 2894
  SWindowResInfo *pWindowResInfo2 = &supporter->pTableDataInfo[right]->pTableQInfo->windowResInfo;
  SWindowResult * pWindowRes2 = getWindowResult(pWindowResInfo2, rightPos);
2895

2896 2897
  char *b2 = getPosInResultPage(pRuntimeEnv, PRIMARYKEY_TIMESTAMP_COL_INDEX, pWindowRes2);
  TSKEY rightTimestamp = GET_INT64_VAL(b2);
2898

2899 2900 2901
  if (leftTimestamp == rightTimestamp) {
    return 0;
  }
2902

2903 2904 2905
  return leftTimestamp > rightTimestamp ? 1 : -1;
}

2906
/*
 * Merge the results of the tables group by group, starting at
 * pQInfo->subgroupIdx, until one group yields at least one result page or all
 * groups have been processed. pQInfo->subgroupIdx is advanced past every group
 * that was merged.
 *
 * @return TSDB_CODE_SUCCESS, or -1 if merging a group failed
 */
int32_t mergeIntoGroupResult(SQInfo *pQInfo) {
  int64_t st = taosGetTimestampMs();
  int32_t numOfGroups = (int32_t)taosArrayGetSize(pQInfo->groupInfo.pGroupList);

  while (pQInfo->subgroupIdx < numOfGroups) {
    SArray *group = taosArrayGetP(pQInfo->groupInfo.pGroupList, pQInfo->subgroupIdx);

    int32_t ret = mergeIntoGroupResultImpl(pQInfo, group);
    if (ret < 0) {  // not enough disk space to save the data into disk
      return -1;
    }

    pQInfo->subgroupIdx += 1;

    // this group generates at least one result, return results
    if (ret > 0) {
      break;
    }

    assert(pQInfo->numOfGroupResultPages == 0);
    dTrace("QInfo:%p no result in group %d, continue", pQInfo, pQInfo->subgroupIdx - 1);
  }

  // the elapsed time is an int64_t: print it with PRId64 instead of the non-portable %lld
  dTrace("QInfo:%p merge res data into group, index:%d, total group:%d, elapsed time:%" PRId64 "ms",
         pQInfo, pQInfo->subgroupIdx - 1, numOfGroups, taosGetTimestampMs() - st);

  return TSDB_CODE_SUCCESS;
}

/*
 * Copy the next set of merged result pages of the current group into the
 * output buffers (pQuery->sdata) of the query.
 *
 * When all pages of the current group have been consumed
 * (offset == numOfGroupResultPages), the next group is merged first; if that
 * fails the function returns without copying anything.
 */
void copyResToQueryResultBuf(SQInfo *pQInfo, SQuery *pQuery) {
  if (pQInfo->offset == pQInfo->numOfGroupResultPages) {
    pQInfo->numOfGroupResultPages = 0;

    // current results of group has been sent to client, try next group
    if (mergeIntoGroupResult(pQInfo) != TSDB_CODE_SUCCESS) {
      return;  // failed to save data in the disk
    }
  }

  SQueryRuntimeEnv *   pEnv = &pQInfo->runtimeEnv;
  SDiskbasedResultBuf *pBuf = pEnv->pResultBuf;

  int32_t groupResId = getGroupResultId(pQInfo->subgroupIdx - 1);
  SIDList pages = getDataBufPagesIdList(pBuf, pQInfo->offset + groupResId);

  // total number of rows held by the pages of this id
  int32_t numOfRows = 0;
  for (int32_t p = 0; p < pages.size; ++p) {
    tFilePage *pPage = getResultBufferPageById(pBuf, pages.pData[p]);
    numOfRows += pPage->numOfElems;
  }

  // append the rows of every page, column by column, to the output buffers
  int32_t copied = 0;
  for (int32_t p = 0; p < pages.size; ++p) {
    tFilePage *pPage = getResultBufferPageById(pBuf, pages.pData[p]);

    for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
      int32_t bytes = pEnv->pCtx[col].outputBytes;
      char *  pDest = pQuery->sdata[col]->data;

      memcpy(pDest + copied * bytes, pPage->data + pEnv->offset[col] * pPage->numOfElems,
             bytes * pPage->numOfElems);
    }

    copied += pPage->numOfElems;
  }

  assert(pQuery->rec.rows == 0);

  pQuery->rec.rows += numOfRows;
  pQInfo->offset += 1;
}

/*
 * Return the largest number of results produced by any output function of the
 * given window result.
 *
 * ts, tag and tagprj functions are skipped: they can not decide the output
 * number of the query, which is decided by the main output functions.
 */
int64_t getNumOfResultWindowRes(SQueryRuntimeEnv *pRuntimeEnv, SWindowResult *pWindowRes) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  int64_t numOfRes = 0;

  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    int32_t funcId = pQuery->pSelectExpr[col].pBase.functionId;

    bool isAuxiliary = (funcId == TSDB_FUNC_TS) || (funcId == TSDB_FUNC_TAG) || (funcId == TSDB_FUNC_TAGPRJ);
    if (!isAuxiliary) {
      SResultInfo *pInfo = &pWindowRes->resultInfo[col];

      if (pInfo != NULL && pInfo->numOfRes > numOfRes) {
        numOfRes = pInfo->numOfRes;
      }
    }
  }

  return numOfRes;
}

3011
/*
 * Merge the window results of all tables of one group, ordered by the window
 * start timestamp, and flush the merged rows into the disk based result buffer.
 *
 * The tables that have result pages and at least one window result are fed
 * into a loser tree; window results that share a timestamp are merged into one
 * output row. Every resource allocated here is released on all exit paths.
 *
 * @param pQInfo  the query
 * @param pGroup  array of SPair whose second member is the STableDataInfo of a table
 * @return the number of result pages of the group (0 if no table of the group has
 *         results), or -1 on failure (out of memory, flush failed)
 */
int32_t mergeIntoGroupResultImpl(SQInfo *pQInfo, SArray* pGroup) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *pQuery = pRuntimeEnv->pQuery;

  size_t size = taosArrayGetSize(pGroup);

  tFilePage **buffer = (tFilePage **)pQuery->sdata;

  int32_t         code = -1;
  int32_t         numOfTables = 0;
  int64_t         lastTimestamp = -1;
  int64_t         startt = 0;
  SLoserTreeInfo *pTree = NULL;
  SResultInfo *   pResultInfo = NULL;

  int32_t *        posList = calloc(size, sizeof(int32_t));
  STableDataInfo **pTableList = malloc(POINTER_BYTES * size);

  SCompSupporter cs = {pTableList, posList, pQInfo};

  // a NULL result is legal for an empty group (zero-sized allocation), only fail for real requests
  if (size > 0 && (posList == NULL || pTableList == NULL)) {
    dError("QInfo:%p failed to allocate the merge buffers, abort query", pQInfo);
    goto cleanup;
  }

  // todo opt for the case of one table per group
  for (size_t i = 0; i < size; ++i) {
    SPair* p = taosArrayGet(pGroup, i);
    STableDataInfo* pInfo = p->sec;

    SIDList list = getDataBufPagesIdList(pRuntimeEnv->pResultBuf, pInfo->pTableQInfo->tid);
    if (list.size > 0 && pInfo->pTableQInfo->windowResInfo.size > 0) {
      pTableList[numOfTables] = pInfo;
      numOfTables += 1;
    }
  }

  if (numOfTables == 0) {
    assert(pQInfo->numOfGroupResultPages == 0);
    code = 0;
    goto cleanup;
  }

  tLoserTreeCreate(&pTree, numOfTables, &cs, tableResultComparFn);
  if (pTree == NULL) {
    dError("QInfo:%p failed to create the loser tree, abort query", pQInfo);
    goto cleanup;
  }

  pResultInfo = calloc(pQuery->numOfOutputCols, sizeof(SResultInfo));
  if (pResultInfo == NULL) {
    dError("QInfo:%p failed to allocate the merge result info, abort query", pQInfo);
    goto cleanup;
  }

  setWindowResultInfo(pResultInfo, pQuery, pRuntimeEnv->stableQuery);
  resetMergeResultBuf(pQuery, pRuntimeEnv->pCtx, pResultInfo);

  startt = taosGetTimestampMs();

  while (1) {
    // the winner of the loser tree is the source with the smallest current timestamp
    int32_t pos = pTree->pNode[0].index;

    SWindowResInfo *pWindowResInfo = &pTableList[pos]->pTableQInfo->windowResInfo;
    SWindowResult * pWindowRes = getWindowResult(pWindowResInfo, cs.position[pos]);

    char *b = getPosInResultPage(pRuntimeEnv, PRIMARYKEY_TIMESTAMP_COL_INDEX, pWindowRes);
    TSKEY ts = GET_INT64_VAL(b);

    assert(ts == pWindowRes->window.skey);
    int64_t num = getNumOfResultWindowRes(pRuntimeEnv, pWindowRes);

    if (num > 0) {
      if (ts == lastTimestamp) {  // merge with the last one
        doMerge(pRuntimeEnv, ts, pWindowRes, true);
      } else {  // copy data to disk buffer
        if (buffer[0]->numOfElems == pQuery->rec.capacity) {
          if (flushFromResultBuf(pQInfo) != TSDB_CODE_SUCCESS) {
            dError("QInfo:%p failed to flush data into temp file, abort query", pQInfo);
            goto cleanup;
          }

          resetMergeResultBuf(pQuery, pRuntimeEnv->pCtx, pResultInfo);
        }

        doMerge(pRuntimeEnv, ts, pWindowRes, false);
        buffer[0]->numOfElems += 1;
      }

      lastTimestamp = ts;
    }

    // move this source to its next window result, empty window results are simply skipped
    cs.position[pos] += 1;
    if (cs.position[pos] >= pWindowResInfo->size) {
      cs.position[pos] = -1;

      // all input sources are exhausted
      if (--numOfTables == 0) {
        break;
      }
    }

    tLoserTreeAdjust(pTree, pos + pTree->numOfEntries);
  }

  if (buffer[0]->numOfElems != 0) {  // there are data in buffer
    if (flushFromResultBuf(pQInfo) != TSDB_CODE_SUCCESS) {
      dError("QInfo:%p failed to flush data into temp file, abort query", pQInfo);
      goto cleanup;
    }
  }

  int64_t endt = taosGetTimestampMs();

#ifdef _DEBUG_VIEW
  displayInterResult(pQuery->sdata, pQuery, pQuery->sdata[0]->len);
#endif

  // pQInfo is at hand, there is no need to derive it (GET_QINFO_ADDR expects the runtime env anyway)
  dTrace("QInfo:%p result merge completed, elapsed time:%" PRId64 " ms", pQInfo, endt - startt);

  pQInfo->offset = 0;
  code = pQInfo->numOfGroupResultPages;

cleanup:
  if (pResultInfo != NULL) {
    for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
      tfree(pResultInfo[i].interResultBuf);
    }
  }

  tfree(pResultInfo);
  tfree(pTree);
  tfree(pTableList);
  tfree(posList);

  return code;
}

/*
 * Write the rows currently held in the output buffers (pQuery->sdata) into new
 * pages of the disk based result buffer, at most "capacity" rows per page, and
 * count them as one more result page set of the current group.
 *
 * @return TSDB_CODE_SUCCESS, or -1 if a row does not fit into a page or no new
 *         page could be obtained
 */
int32_t flushFromResultBuf(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  SDiskbasedResultBuf *pResultBuf = pRuntimeEnv->pResultBuf;

  // guard the division below and the copy loop, which would never terminate with a capacity of 0
  if (pQuery->rowSize <= 0) {
    dError("QInfo:%p invalid row size:%d, failed to flush results", pQInfo, (int32_t)pQuery->rowSize);
    return -1;
  }

  int32_t capacity = (DEFAULT_INTERN_BUF_SIZE - sizeof(tFilePage)) / pQuery->rowSize;
  if (capacity <= 0) {
    dError("QInfo:%p row size:%d exceeds the page size, failed to flush results", pQInfo, (int32_t)pQuery->rowSize);
    return -1;
  }

  // the base value for group result, since the maximum number of table for each vnode will not exceed 100,000.
  int32_t pageId = -1;

  int32_t remain = pQuery->sdata[0]->num;
  int32_t offset = 0;

  while (remain > 0) {
    int32_t r = (remain > capacity) ? capacity : remain;

    int32_t    id = getGroupResultId(pQInfo->subgroupIdx) + pQInfo->numOfGroupResultPages;
    tFilePage *buf = getNewDataBuf(pResultBuf, id, &pageId);
    if (buf == NULL) {
      dError("QInfo:%p failed to get a new result page, group result id:%d", pQInfo, id);
      return -1;
    }

    // the number of rows in the page is the same for all columns
    buf->numOfElems = r;

    // pagewise copy to dest buffer
    for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
      int32_t bytes = pRuntimeEnv->pCtx[i].outputBytes;

      memcpy(buf->data + pRuntimeEnv->offset[i] * buf->numOfElems, ((char *)pQuery->sdata[i]->data) + offset * bytes,
             buf->numOfElems * bytes);
    }

    offset += r;
    remain -= r;
  }

  pQInfo->numOfGroupResultPages += 1;
  return TSDB_CODE_SUCCESS;
}

/*
 * Rewind the merge output of every column: the output position is set one row
 * before the start of the column buffer (doMerge advances it by one row before
 * a new row is written), the context is bound to its entry in pResultInfo and
 * the row counter of the column buffer is cleared.
 */
void resetMergeResultBuf(SQuery *pQuery, SQLFunctionCtx *pCtx, SResultInfo *pResultInfo) {
  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    SQLFunctionCtx *ctx = &pCtx[col];

    ctx->aOutputBuf = pQuery->sdata[col]->data - ctx->outputBytes;
    ctx->size = 1;
    ctx->startOffset = 0;
    ctx->resultInfo = &pResultInfo[col];

    pQuery->sdata[col]->num = 0;
  }
}

H
hjxilinx 已提交
3192
// record the table index and the group index in the table data info
void setTableDataInfo(STableDataInfo *pTableDataInfo, int32_t tableIndex, int32_t groupId) {
  pTableDataInfo->tableIndex = tableIndex;
  pTableDataInfo->groupIdx = groupId;
}

/*
 * Set the "complete" flag of the function results of every closed window
 * result: first/first_dst under TSDB_ORDER_DESC and last/last_dst under
 * TSDB_ORDER_ASC are marked as not complete, every other function except ts
 * and tag is marked as complete. Windows that are not closed are left untouched.
 */
static void doDisableFunctsForSupplementaryScan(SQuery *pQuery, SWindowResInfo *pWindowResInfo, int32_t order) {
  for (int32_t w = 0; w < pWindowResInfo->size; ++w) {
    SWindowStatus *pStatus = getTimeWindowResStatus(pWindowResInfo, w);
    if (pStatus->closed) {
      SWindowResult *pResult = getWindowResult(pWindowResInfo, w);

      // open/close the specified query for each group result
      for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
        int32_t funcId = pQuery->pSelectExpr[col].pBase.functionId;

        bool isFirst = (funcId == TSDB_FUNC_FIRST || funcId == TSDB_FUNC_FIRST_DST);
        bool isLast = (funcId == TSDB_FUNC_LAST || funcId == TSDB_FUNC_LAST_DST);

        if ((isFirst && order == TSDB_ORDER_DESC) || (isLast && order == TSDB_ORDER_ASC)) {
          pResult->resultInfo[col].complete = false;
        } else if (funcId != TSDB_FUNC_TS && funcId != TSDB_FUNC_TAG) {
          pResult->resultInfo[col].complete = true;
        }
      }
    }
  }
}

/*
 * Prepare a table query for the supplementary scan: the order of every
 * function context and of the query itself is flipped, and the "complete"
 * flags of the function results are set so that only first/last (and their
 * _dst variants) matching the given order stay open.
 */
void disableFunctForTableSuppleScan(SQueryRuntimeEnv *pRuntimeEnv, int32_t order) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // flip the scan order of every function context
  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    pRuntimeEnv->pCtx[col].order = (pRuntimeEnv->pCtx[col].order) ^ 1u;
  }

  SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;

  if (isGroupbyNormalCol(pQuery->pGroupbyExpr) || isIntervalQuery(pQuery)) {
    // group by normal columns and interval query on normal table
    doDisableFunctsForSupplementaryScan(pQuery, pWindowResInfo, order);
  } else {
    // for simple result of table query
    for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
      int32_t      funcId = pQuery->pSelectExpr[col].pBase.functionId;
      SResultInfo *pResInfo = pRuntimeEnv->pCtx[col].resultInfo;

      bool isFirst = (funcId == TSDB_FUNC_FIRST || funcId == TSDB_FUNC_FIRST_DST);
      bool isLast = (funcId == TSDB_FUNC_LAST || funcId == TSDB_FUNC_LAST_DST);

      if ((isFirst && order == TSDB_ORDER_DESC) || (isLast && order == TSDB_ORDER_ASC)) {
        pResInfo->complete = false;
      } else if (funcId != TSDB_FUNC_TS && funcId != TSDB_FUNC_TAG) {
        pResInfo->complete = true;
      }
    }
  }

  pQuery->order.order = pQuery->order.order ^ 1u;
}

H
hjxilinx 已提交
3248
void disableFuncForReverseScan(SQInfo *pQInfo, int32_t order) {
3249 3250
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;
3251

3252 3253 3254
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    pRuntimeEnv->pCtx[i].order = (pRuntimeEnv->pCtx[i].order) ^ 1u;
  }
3255

3256
  if (isIntervalQuery(pQuery)) {
3257 3258 3259 3260 3261 3262
//    for (int32_t i = 0; i < pQInfo->groupInfo.numOfTables; ++i) {
//      STableQueryInfo *pTableQueryInfo = pQInfo->pTableDataInfo[i].pTableQInfo;
//      SWindowResInfo * pWindowResInfo = &pTableQueryInfo->windowResInfo;
//
//      doDisableFunctsForSupplementaryScan(pQuery, pWindowResInfo, order);
//    }
3263 3264 3265 3266
  } else {
    SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;
    doDisableFunctsForSupplementaryScan(pQuery, pWindowResInfo, order);
  }
3267

3268 3269 3270
  pQuery->order.order = (pQuery->order.order) ^ 1u;
}

H
hjxilinx 已提交
3271
void enableFuncForForwardScan(SQueryRuntimeEnv *pRuntimeEnv, int32_t order) {
3272
  SQuery *pQuery = pRuntimeEnv->pQuery;
3273

3274 3275 3276
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    pRuntimeEnv->pCtx[i].order = (pRuntimeEnv->pCtx[i].order) ^ 1u;
  }
3277

3278 3279 3280 3281 3282
  pQuery->order.order = (pQuery->order.order) ^ 1u;
}

/**
 * Allocate the per-column result info array of a window result and record its position.
 *
 * @param pQuery        query whose numOfOutputCols determines the array length
 * @param pResultRow    window result to initialise
 * @param isSTableQuery forwarded to setWindowResultInfo()
 * @param posInfo       position copied into pResultRow->pos
 *
 * NOTE(review): the calloc() result is not checked before being handed to setWindowResultInfo(); confirm how
 * allocation failure should be reported, since this function returns void.
 */
void createQueryResultInfo(SQuery *pQuery, SWindowResult *pResultRow, bool isSTableQuery, SPosInfo *posInfo) {
  int32_t numOfCols = pQuery->numOfOutputCols;

  pResultRow->resultInfo = calloc((size_t)numOfCols, sizeof(SResultInfo));
  pResultRow->pos = *posInfo;

  // set the intermediate result output buffer
  setWindowResultInfo(pResultRow->resultInfo, pQuery, isSTableQuery);
}

/**
 * Point every function context back at the start of the query output buffers and re-initialise it.
 *
 * For each output column the output buffer is reset to pQuery->sdata[i]->data, the result info is reset,
 * the buffer is zero-filled for rec.capacity rows, and finally all functions are re-initialised through
 * initCtxOutputBuf().
 *
 * @param pRuntimeEnv runtime environment of the query
 */
void resetCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[i];
    pCtx->aOutputBuf = pQuery->sdata[i]->data;

    /*
     * set the output buffer information and intermediate buffer
     * not all queries require the interResultBuf, such as COUNT/TAGPRJ/PRJ/TAG etc.
     */
    resetResultInfo(&pRuntimeEnv->resultInfo[i]);
    pCtx->resultInfo = &pRuntimeEnv->resultInfo[i];

    // set the timestamp output buffer for top/bottom/diff query
    int32_t functionId = pQuery->pSelectExpr[i].pBase.functionId;
    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_DIFF) {
      pCtx->ptsOutputBuf = pRuntimeEnv->pCtx[0].aOutputBuf;
    }

    // clear the whole output column
    memset(pQuery->sdata[i]->data, 0, (size_t) pQuery->pSelectExpr[i].resBytes * pQuery->rec.capacity);
  }

  initCtxOutputBuf(pRuntimeEnv);
}

/**
 * Advance the output position of every function context by `output` rows and reset its result info.
 *
 * Must not be used when the query contains a DIFF function (asserted).
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param output      number of rows to move the output buffers forward
 */
void forwardCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, int64_t output) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // reset the execution contexts
  for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
    int32_t functionId = pQuery->pSelectExpr[j].pBase.functionId;
    assert(functionId != TSDB_FUNC_DIFF);

    // set next output position
    if (IS_OUTER_FORWARD(aAggs[functionId].nStatus)) {
      pRuntimeEnv->pCtx[j].aOutputBuf += pRuntimeEnv->pCtx[j].outputBytes * output;
    }

    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM) {
      /*
       * NOTE: for top/bottom query, the value of first column of output (timestamp) are assigned
       * in the procedure of top/bottom routine
       * the output buffer in top/bottom routine is ptsOutputBuf, so we need to forward the output buffer
       *
       * diff function is handled in multi-output function
       */
      pRuntimeEnv->pCtx[j].ptsOutputBuf += TSDB_KEYSIZE * output;
    }

    resetResultInfo(pRuntimeEnv->pCtx[j].resultInfo);
  }
}

/**
 * Reset the stage of every function context to 0 and invoke the init callback of its function.
 *
 * @param pRuntimeEnv runtime environment of the query
 */
void initCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
    int32_t functionId = pQuery->pSelectExpr[j].pBase.functionId;

    pRuntimeEnv->pCtx[j].currentStage = 0;
    aAggs[functionId].init(&pRuntimeEnv->pCtx[j]);
  }
}

/**
 * Apply the remaining OFFSET of the query to the rows currently held in the result buffer.
 *
 * - Nothing happens when there are no rows or no offset left.
 * - When all current rows fall inside the offset, they are discarded: the offset is reduced, the row count is
 *   cleared, the output buffers are reset and the QUERY_RESBUF_FULL flag is removed.
 * - Otherwise only the leading `offset` rows have to be dropped. That path is guarded by assert(0): the
 *   memmove that compacts the buffer is disabled, so the branch is not expected to be reached yet.
 *
 * @param pRuntimeEnv runtime environment of the query
 */
void doSkipResults(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  if (pQuery->rec.rows == 0 || pQuery->limit.offset == 0) {
    return;
  }

  if (pQuery->rec.rows <= pQuery->limit.offset) {
    pQuery->limit.offset -= pQuery->rec.rows;

    pQuery->rec.rows = 0;
    //    pQuery->pointsOffset = pQuery->rec.pointsToRead;  // clear all data in result buffer

    resetCtxOutputBuf(pRuntimeEnv);

    // clear the buffer is full flag if exists
    pQuery->status &= (~QUERY_RESBUF_FULL);
  } else {
    int32_t numOfSkip = (int32_t)pQuery->limit.offset;
    pQuery->rec.rows -= numOfSkip;

    for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
      int32_t functionId = pQuery->pSelectExpr[i].pBase.functionId;
      int32_t bytes = pRuntimeEnv->pCtx[i].outputBytes;

      // partial skip is not supported yet, see the disabled memmove below
      assert(0);
      //      memmove(pQuery->sdata[i]->data, pQuery->sdata[i]->data + bytes * numOfSkip, pQuery->size * bytes);
      pRuntimeEnv->pCtx[i].aOutputBuf += bytes * numOfSkip;

      if (functionId == TSDB_FUNC_DIFF || functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM) {
        pRuntimeEnv->pCtx[i].ptsOutputBuf += TSDB_KEYSIZE * numOfSkip;
      }
    }

    pQuery->limit.offset = 0;
  }
}

typedef struct SQueryStatus {
3393 3394 3395
  int8_t    overStatus;
  TSKEY     lastKey;
  STSCursor cur;
3396 3397 3398 3399 3400
} SQueryStatus;

// todo refactor
/**
 * Save the current query state into pStatus and switch the query to scan the reversed time window.
 *
 * Saved: query status, last key and the ts-buffer cursor. Afterwards the ts-buffer order (if a buffer exists)
 * is flipped and advanced, the status is set to QUERY_NOT_COMPLETED, window skey/ekey are swapped and
 * lastKey is set to the new skey.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pStatus     receives the saved state
 */
static void queryStatusSave(SQueryRuntimeEnv *pRuntimeEnv, SQueryStatus *pStatus) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  pStatus->overStatus = pQuery->status;
  pStatus->lastKey = pQuery->lastKey;

  // NOTE(review): called before the NULL test below; presumably tsBufGetCursor() tolerates a NULL buffer
  pStatus->cur = tsBufGetCursor(pRuntimeEnv->pTSBuf);  // save the cursor

  if (pRuntimeEnv->pTSBuf) {
    pRuntimeEnv->pTSBuf->cur.order ^= 1u;
    tsBufNextPos(pRuntimeEnv->pTSBuf);
  }

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);

  // scan the same range in the opposite direction
  SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
  pQuery->lastKey = pQuery->window.skey;
}

/**
 * Undo queryStatusSave(): swap the window keys back and restore last key, status and ts-buffer cursor.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pStatus     state previously filled by queryStatusSave()
 */
static void queryStatusRestore(SQueryRuntimeEnv *pRuntimeEnv, SQueryStatus *pStatus) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);

  pQuery->lastKey = pStatus->lastKey;
  pQuery->status = pStatus->overStatus;

  tsBufSetCursor(pRuntimeEnv->pTSBuf, &pStatus->cur);
}

/**
 * Run the supplementary (reverse) scan of a single table if the query needs one.
 *
 * The scan flag is switched to SUPPLEMENTARY_SCAN, functions are adjusted for the reverse direction, the
 * query state is saved, the query handle is reset to the current position with the reversed order, all data
 * blocks are scanned, and finally the saved state, the forward order and the MASTER_SCAN flag are restored.
 *
 * @param pRuntimeEnv runtime environment of the query
 */
static void doSingleMeterSupplementScan(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *     pQuery = pRuntimeEnv->pQuery;
  SQueryStatus qStatus = {0};

  if (!needReverseScan(pQuery)) {
    return;
  }

  // GET_QINFO_ADDR() expects the address of the runtimeEnv member, not of the SQuery object
  dTrace("QInfo:%p start to supp scan", GET_QINFO_ADDR(pRuntimeEnv));
  SET_SUPPLEMENT_SCAN_FLAG(pRuntimeEnv);

  // close necessary function execution during supplementary scan
  disableFunctForTableSuppleScan(pRuntimeEnv, pQuery->order.order);
  queryStatusSave(pRuntimeEnv, &qStatus);

  STimeWindow w = {.skey = pQuery->window.skey, .ekey = pQuery->window.ekey};

  // reverse scan from current position
  TsdbPosT current = tsdbDataBlockTell(pRuntimeEnv->pQueryHandle);
  tsdbResetQuery(pRuntimeEnv->pQueryHandle, &w, current, pQuery->order.order);

  doScanAllDataBlocks(pRuntimeEnv);

  queryStatusRestore(pRuntimeEnv, &qStatus);
  enableFuncForForwardScan(pRuntimeEnv, pQuery->order.order);
  SET_MASTER_SCAN_FLAG(pRuntimeEnv);
}

/**
 * Update the status flags of a query.
 *
 * QUERY_NOT_COMPLETED replaces the whole status; any other value is OR-ed into the status after the
 * QUERY_NOT_COMPLETED bits have been cleared, because that state is not compatible with the others.
 *
 * @param pQuery query to update
 * @param status status flag(s) to apply
 */
void setQueryStatus(SQuery *pQuery, int8_t status) {
  if (status != QUERY_NOT_COMPLETED) {
    // drop the "not completed" marker first, then add the requested flag(s)
    pQuery->status = pQuery->status & (~QUERY_NOT_COMPLETED);
    pQuery->status = pQuery->status | status;
    return;
  }

  pQuery->status = status;
}

/**
 * Move every function to its next step and report whether another scan over the data blocks is required.
 *
 * For group-by-normal-column and interval queries the step is taken for each closed window result (after
 * binding the output buffers to that window); otherwise it is taken once for the plain function contexts.
 * TS columns are skipped.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @return true if at least one function reports an incomplete result after xNextStep()
 */
bool needScanDataBlocksAgain(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  bool toContinue = false;
  if (isGroupbyNormalCol(pQuery->pGroupbyExpr) || isIntervalQuery(pQuery)) {
    // for each closed group/window result, advance the functions of each column
    SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;

    for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
      SWindowResult *pResult = getWindowResult(pWindowResInfo, i);
      if (!pResult->status.closed) {
        continue;
      }

      setWindowResOutputBuf(pRuntimeEnv, pResult);

      for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
        int16_t functId = pQuery->pSelectExpr[j].pBase.functionId;
        if (functId == TSDB_FUNC_TS) {
          continue;
        }

        aAggs[functId].xNextStep(&pRuntimeEnv->pCtx[j]);
        SResultInfo *pResInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[j]);

        toContinue |= (!pResInfo->complete);
      }
    }
  } else {
    for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
      int16_t functId = pQuery->pSelectExpr[j].pBase.functionId;
      if (functId == TSDB_FUNC_TS) {
        continue;
      }

      aAggs[functId].xNextStep(&pRuntimeEnv->pCtx[j]);
      SResultInfo *pResInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[j]);

      toContinue |= (!pResInfo->complete);
    }
  }

  return toContinue;
}

H
hjxilinx 已提交
3511
void scanAllDataBlocks(SQueryRuntimeEnv *pRuntimeEnv) {
3512 3513
  SQuery *pQuery = pRuntimeEnv->pQuery;
  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
3514

3515
  // store the start query position
3516 3517
  SQInfo* pQInfo = (SQInfo*) GET_QINFO_ADDR(pRuntimeEnv);
  
3518 3519 3520
  int64_t skey = pQuery->lastKey;
  int32_t status = pQuery->status;
  int32_t activeSlot = pRuntimeEnv->windowResInfo.curIndex;
3521

3522 3523
  SET_MASTER_SCAN_FLAG(pRuntimeEnv);
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
3524

3525 3526
  while (1) {
    doScanAllDataBlocks(pRuntimeEnv);
3527

3528
    if (!needScanDataBlocksAgain(pRuntimeEnv)) {
3529
      
3530 3531 3532 3533
      // restore the status
      if (pRuntimeEnv->scanFlag == REPEAT_SCAN) {
        pQuery->status = status;
      }
3534

3535 3536
      break;
    }
3537

H
hjxilinx 已提交
3538
    // set the correct start position, and load the corresponding block in buffer for next round scan all data blocks.
3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551
//    /*int32_t ret =*/ tsdbDataBlockSeek(pRuntimeEnv->pQueryHandle, pos);
  
    STsdbQueryCond cond = {
        .twindow = {pQuery->window.skey, pQuery->lastKey},
        .order   = pQuery->order.order,
        .colList = pQuery->colList,
    };
  
    SArray *cols = taosArrayInit(pQuery->numOfCols, sizeof(pQuery->colList[0]));
    for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
      taosArrayPush(cols, &pQuery->colList[i]);
    }
  
3552
    if (pRuntimeEnv->pSecQueryHandle != NULL) {
3553
      pRuntimeEnv->pSecQueryHandle = tsdbQueryTables(pQInfo->tsdb, &cond, &pQInfo->groupInfo, cols);
3554 3555
    }
    
3556 3557
    taosArrayDestroy(cols);
  
3558 3559
    status = pQuery->status;
    pRuntimeEnv->windowResInfo.curIndex = activeSlot;
3560

3561 3562
    setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
    pRuntimeEnv->scanFlag = REPEAT_SCAN;
3563

3564
    // check if query is killed or not
3565
    if (isQueryKilled(GET_QINFO_ADDR(pRuntimeEnv))) {
3566 3567 3568
      return;
    }
  }
3569

3570 3571 3572
  // no need to set the end key
  TSKEY lkey = pQuery->lastKey;
  TSKEY ekey = pQuery->window.ekey;
3573

3574 3575
  pQuery->window.skey = skey;
  pQuery->window.ekey = pQuery->lastKey - step;
3576
  /*tsdbpos_t current =*/ tsdbDataBlockTell(pRuntimeEnv->pQueryHandle);
3577

3578
  doSingleMeterSupplementScan(pRuntimeEnv);
3579

H
hjxilinx 已提交
3580
  // update the pQuery->window.skey and pQuery->window.ekey to limit the scan scope of sliding query during reverse scan
3581 3582
  pQuery->lastKey = lkey;
  pQuery->window.ekey = ekey;
3583

3584
//  STimeWindow win = {.skey = pQuery->window.skey, .ekey = pQuery->window.ekey};
H
[td-32]  
hjxilinx 已提交
3585 3586
//  tsdbResetQuery(pRuntimeEnv->pQueryHandle, &win, current, pQuery->order.order);
//  tsdbNextDataBlock(pRuntimeEnv->pQueryHandle);
3587 3588
}

H
hjxilinx 已提交
3589
void finalizeQueryResult(SQueryRuntimeEnv *pRuntimeEnv) {
3590
  SQuery *pQuery = pRuntimeEnv->pQuery;
3591

3592 3593 3594 3595 3596 3597
  if (isGroupbyNormalCol(pQuery->pGroupbyExpr) || isIntervalQuery(pQuery)) {
    // for each group result, call the finalize function for each column
    SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;
    if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
      closeAllTimeWindow(pWindowResInfo);
    }
3598

3599 3600 3601 3602 3603
    for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
      SWindowResult *buf = &pWindowResInfo->pResult[i];
      if (!isWindowResClosed(pWindowResInfo, i)) {
        continue;
      }
3604

3605
      setWindowResOutputBuf(pRuntimeEnv, buf);
3606

3607 3608 3609
      for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
        aAggs[pQuery->pSelectExpr[j].pBase.functionId].xFinalize(&pRuntimeEnv->pCtx[j]);
      }
3610

3611 3612 3613 3614 3615 3616
      /*
       * set the number of output results for group by normal columns, the number of output rows usually is 1 except
       * the top and bottom query
       */
      buf->numOfRows = getNumOfResult(pRuntimeEnv);
    }
3617

3618 3619 3620 3621 3622 3623 3624 3625 3626 3627
  } else {
    for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
      aAggs[pQuery->pSelectExpr[j].pBase.functionId].xFinalize(&pRuntimeEnv->pCtx[j]);
    }
  }
}

/**
 * Check whether the query selects anything besides TS, TAG and TAGPRJ columns.
 *
 * @param pQuery query to inspect
 * @return true if at least one output column uses a function other than TS/TAG/TAGPRJ
 */
static bool hasMainOutput(SQuery *pQuery) {
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    int32_t functionId = pQuery->pSelectExpr[i].pBase.functionId;

    if (functionId != TSDB_FUNC_TS && functionId != TSDB_FUNC_TAG && functionId != TSDB_FUNC_TAGPRJ) {
      return true;
    }
  }

  return false;
}

H
hjxilinx 已提交
3637
STableQueryInfo *createTableQueryInfo(SQueryRuntimeEnv *pRuntimeEnv, int32_t tid, STimeWindow win) {
3638
  STableQueryInfo *pTableQueryInfo = calloc(1, sizeof(STableQueryInfo));
3639

H
hjxilinx 已提交
3640 3641
  pTableQueryInfo->win = win;
  pTableQueryInfo->lastKey = win.skey;
3642

H
hjxilinx 已提交
3643
  pTableQueryInfo->tid = tid;
3644
  pTableQueryInfo->cur.vnodeIndex = -1;
3645

3646 3647 3648 3649
  initWindowResInfo(&pTableQueryInfo->windowResInfo, pRuntimeEnv, 100, 100, TSDB_DATA_TYPE_INT);
  return pTableQueryInfo;
}

3650
UNUSED_FUNC void destroyMeterQueryInfo(STableQueryInfo *pTableQueryInfo, int32_t numOfCols) {
3651 3652 3653
  if (pTableQueryInfo == NULL) {
    return;
  }
3654

3655 3656 3657 3658
  cleanupTimeWindowInfo(&pTableQueryInfo->windowResInfo, numOfCols);
  free(pTableQueryInfo);
}

H
hjxilinx 已提交
3659
void changeMeterQueryInfoForSuppleQuery(SQuery *pQuery, STableQueryInfo *pTableQueryInfo) {
3660 3661 3662
  if (pTableQueryInfo == NULL) {
    return;
  }
3663

3664 3665 3666 3667 3668 3669 3670
  // order has change already!
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
  if (!QUERY_IS_ASC_QUERY(pQuery)) {
    assert(pTableQueryInfo->win.ekey >= pTableQueryInfo->lastKey + step);
  } else {
    assert(pTableQueryInfo->win.ekey <= pTableQueryInfo->lastKey + step);
  }
3671

3672
  pTableQueryInfo->win.ekey = pTableQueryInfo->lastKey + step;
3673

3674 3675
  SWAP(pTableQueryInfo->win.skey, pTableQueryInfo->win.ekey, TSKEY);
  pTableQueryInfo->lastKey = pTableQueryInfo->win.skey;
3676

3677 3678 3679 3680 3681 3682
  pTableQueryInfo->cur.order = pTableQueryInfo->cur.order ^ 1u;
  pTableQueryInfo->cur.vnodeIndex = -1;
}

/**
 * Load the time window and last key of a table into the query.
 *
 * Asserts that lastKey lies on the correct side of window.skey for the current scan direction.
 *
 * @param pRuntimeEnv     runtime environment of the query
 * @param pTableQueryInfo per-table query info providing window and lastKey
 */
void restoreIntervalQueryRange(SQueryRuntimeEnv *pRuntimeEnv, STableQueryInfo *pTableQueryInfo) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  pQuery->window = pTableQueryInfo->win;
  pQuery->lastKey = pTableQueryInfo->lastKey;

  assert(((pQuery->lastKey >= pQuery->window.skey) && QUERY_IS_ASC_QUERY(pQuery)) ||
         ((pQuery->lastKey <= pQuery->window.skey) && !QUERY_IS_ASC_QUERY(pQuery)));
}

/**
 * set output buffer for different group
 * @param pRuntimeEnv
3694
 * @param pDataBlockInfo
3695
 */
3696
void setExecutionContext(SQInfo *pQInfo, STableQueryInfo *pTableQueryInfo, STable* pTable, int32_t groupIdx,
3697
                         TSKEY nextKey) {
3698 3699 3700
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SWindowResInfo *  pWindowResInfo = &pRuntimeEnv->windowResInfo;
  int32_t           GROUPRESULTID = 1;
3701

3702 3703 3704 3705
  SWindowResult *pWindowRes = doSetTimeWindowFromKey(pRuntimeEnv, pWindowResInfo, (char *)&groupIdx, sizeof(groupIdx));
  if (pWindowRes == NULL) {
    return;
  }
3706

3707 3708 3709 3710 3711 3712 3713 3714 3715 3716
  /*
   * not assign result buffer yet, add new result buffer
   * all group belong to one result set, and each group result has different group id so set the id to be one
   */
  if (pWindowRes->pos.pageId == -1) {
    if (addNewWindowResultBuf(pWindowRes, pRuntimeEnv->pResultBuf, GROUPRESULTID, pRuntimeEnv->numOfRowsPerPage) !=
        TSDB_CODE_SUCCESS) {
      return;
    }
  }
3717

3718 3719
  setWindowResOutputBuf(pRuntimeEnv, pWindowRes);
  initCtxOutputBuf(pRuntimeEnv);
3720

3721
  pTableQueryInfo->lastKey = nextKey;
3722
  setAdditionalInfo(pQInfo, pTable, pTableQueryInfo);
3723 3724 3725 3726
}

/**
 * Bind every function context to the output position and result info of the given window result.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pResult     window result whose page position and result info are used
 */
static void setWindowResOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, SWindowResult *pResult) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // Note: pResult->pos[i]->numOfElems == 0, there is only fixed number of results for each group
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[i];
    pCtx->aOutputBuf = getPosInResultPage(pRuntimeEnv, i, pResult);

    // top/bottom/diff write their timestamps into the output buffer of the first column
    int32_t functionId = pQuery->pSelectExpr[i].pBase.functionId;
    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_DIFF) {
      pCtx->ptsOutputBuf = pRuntimeEnv->pCtx[0].aOutputBuf;
    }

    /*
     * set the output buffer information and intermediate buffer
     * not all queries require the interResultBuf, such as COUNT
     */
    pCtx->resultInfo = &pResult->resultInfo[i];

    // set super table query flag
    SResultInfo *pResInfo = GET_RES_INFO(pCtx);
    pResInfo->superTableQ = pRuntimeEnv->stableQuery;
  }
}

3750
int32_t setAdditionalInfo(SQInfo *pQInfo, STable* pTable, STableQueryInfo *pTableQueryInfo) {
3751 3752
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  assert(pTableQueryInfo->lastKey > 0);
3753

3754
  setTagVal(pRuntimeEnv, pTable->tableId, pQInfo->tsdb);
3755

3756 3757 3758 3759
  // both the master and supplement scan needs to set the correct ts comp start position
  if (pRuntimeEnv->pTSBuf != NULL) {
    if (pTableQueryInfo->cur.vnodeIndex == -1) {
      pTableQueryInfo->tag = pRuntimeEnv->pCtx[0].tag.i64Key;
3760

3761
      tsBufGetElemStartPos(pRuntimeEnv->pTSBuf, 0, pTableQueryInfo->tag);
3762

3763 3764 3765 3766 3767 3768
      // keep the cursor info of current meter
      pTableQueryInfo->cur = pRuntimeEnv->pTSBuf->cur;
    } else {
      tsBufSetCursor(pRuntimeEnv->pTSBuf, &pTableQueryInfo->cur);
    }
  }
3769

3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784
  return 0;
}

/*
 * Set the query range of an interval query for one table. There are two cases to handle:
 *
 * 1. Query range is already set (queryRangeSet != 0): only pQuery->lastKey and pTableQueryInfo->lastKey are
 *    moved to the given key.
 * 2. Query range is not set yet: pQuery->window.skey is set to the key. If the resulting range is empty for
 *    the current scan direction nothing else happens. Otherwise the aligned time window is computed, the
 *    window result info of the table is initialised (startTime, and prevSKey if it is still 0), the range is
 *    marked as set and lastKey/win.skey of the table as well as pQuery->lastKey are set to the start key.
 */
void setIntervalQueryRange(STableQueryInfo *pTableQueryInfo, SQInfo *pQInfo, TSKEY key) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  if (pTableQueryInfo->queryRangeSet) {
    pQuery->lastKey = key;
    pTableQueryInfo->lastKey = key;
  } else {
    pQuery->window.skey = key;
    STimeWindow win = {.skey = key, .ekey = pQuery->window.ekey};

    // for too small query range, no data in this interval.
    if ((QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.ekey < pQuery->window.skey)) ||
        (!QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.skey < pQuery->window.ekey))) {
      return;
    }

    /**
     * In handling the both ascending and descending order super table query, we need to find the first qualified
     * timestamp of this table, and then set the first qualified start timestamp.
     * In ascending query, key is the first qualified timestamp. However, in the descending order query, additional
     * operations involve.
     */
    TSKEY           skey1, ekey1;
    STimeWindow     w = {0};
    SWindowResInfo *pWindowResInfo = &pTableQueryInfo->windowResInfo;

    getAlignQueryTimeWindow(pQuery, win.skey, win.skey, win.ekey, &skey1, &ekey1, &w);
    pWindowResInfo->startTime = pQuery->window.skey;  // windowSKey may be 0 in case of 1970 timestamp

    if (pWindowResInfo->prevSKey == 0) {
      // both scan directions start from the aligned window start; the descending case is sanity checked
      if (!QUERY_IS_ASC_QUERY(pQuery)) {
        assert(win.ekey == pQuery->window.skey);
      }

      pWindowResInfo->prevSKey = w.skey;
    }

    pTableQueryInfo->queryRangeSet = 1;
    pTableQueryInfo->lastKey = pQuery->window.skey;
    pTableQueryInfo->win.skey = pQuery->window.skey;

    pQuery->lastKey = pQuery->window.skey;
  }
}

/**
 * Tell whether any selected function carries the TSDB_FUNCSTATE_NEED_TS flag.
 *
 * @param pQuery query to inspect
 * @return true as soon as one output column's function needs the timestamp, false otherwise
 */
bool requireTimestamp(SQuery *pQuery) {
  int32_t col = 0;

  while (col < pQuery->numOfOutputCols) {
    int32_t fid = pQuery->pSelectExpr[col].pBase.functionId;
    if (aAggs[fid].nStatus & TSDB_FUNCSTATE_NEED_TS) {
      return true;
    }

    col++;
  }

  return false;
}

/**
 * Decide whether the primary timestamp column of a data block has to be loaded.
 *
 * @param pQuery         query being executed
 * @param pDataBlockInfo block whose time window is checked
 * @return true if pQuery->lastKey or pQuery->window.ekey lies inside the block window, or if any selected
 *         function requires the timestamp (see requireTimestamp())
 */
bool needPrimaryTimestampCol(SQuery *pQuery, SDataBlockInfo *pDataBlockInfo) {
  /*
   * 1. if skey or ekey locates in this block, we need to load the timestamp column to decide the precise position
   * 2. if there are top/bottom, first_dst/last_dst functions, we need to load timestamp column in any cases;
   */
  STimeWindow *w = &pDataBlockInfo->window;
  bool         loadPrimaryTS = (pQuery->lastKey >= w->skey && pQuery->lastKey <= w->ekey) ||
                       (pQuery->window.ekey >= w->skey && pQuery->window.ekey <= w->ekey) || requireTimestamp(pQuery);

  return loadPrimaryTS;
}

/**
 * Decide whether data blocks may be loaded on demand.
 *
 * @param pQuery        query being executed
 * @param queryRangeSet 1 if the query range of the table has been set
 * @return true if the query has no interval (intervalTime == 0), or if the range is set and the query is an
 *         interval query
 */
bool onDemandLoadDatablock(SQuery *pQuery, int16_t queryRangeSet) {
  if (pQuery->intervalTime == 0) {
    return true;
  }

  if (queryRangeSet != 1) {
    return false;
  }

  return isIntervalQuery(pQuery);
}

/**
 * Number of result subsets of the query.
 *
 * @param pQInfo query handle
 * @return the number of closed time windows for group-by-normal-column and interval queries, otherwise the
 *         size of the table group list
 */
static int32_t getNumOfSubset(SQInfo *pQInfo) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  int32_t totalSubset = 0;
  if (isGroupbyNormalCol(pQuery->pGroupbyExpr) || (isIntervalQuery(pQuery))) {
    totalSubset = numOfClosedTimeWindow(&pQInfo->runtimeEnv.windowResInfo);
  } else {
    totalSubset = taosArrayGetSize(pQInfo->groupInfo.pGroupList);
  }

  return totalSubset;
}

/**
 * Copy rows from the window results into the query output buffers (pQuery->sdata).
 *
 * Copying resumes at pQInfo->subgroupIdx / pQInfo->offset and stops when the output buffer holds
 * rec.capacity rows or all subsets have been visited. Both fields are advanced so that the next call
 * continues where this one stopped.
 *
 * @param pQInfo    query handle
 * @param result    array of window results to copy from
 * @param orderType TSDB_ORDER_ASC to walk the subsets forwards, anything else to walk them backwards
 * @return number of rows copied by this call
 */
static int32_t doCopyToSData(SQInfo *pQInfo, SWindowResult *result, int32_t orderType) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  int32_t numOfResult = 0;
  int32_t startIdx = 0;
  int32_t step = -1;

  // log the query handle itself; GET_QINFO_ADDR() must not be applied to an SQuery pointer
  dTrace("QInfo:%p start to copy data from windowResInfo to query buf", pQInfo);
  int32_t totalSubset = getNumOfSubset(pQInfo);

  if (orderType == TSDB_ORDER_ASC) {
    startIdx = pQInfo->subgroupIdx;
    step = 1;
  } else {  // desc order copy all data
    startIdx = totalSubset - pQInfo->subgroupIdx - 1;
    step = -1;
  }

  for (int32_t i = startIdx; (i < totalSubset) && (i >= 0); i += step) {
    if (result[i].numOfRows == 0) {
      pQInfo->offset = 0;
      pQInfo->subgroupIdx += 1;
      continue;
    }

    assert(result[i].numOfRows >= 0 && pQInfo->offset <= 1);

    int32_t numOfRowsToCopy = result[i].numOfRows - pQInfo->offset;
    int32_t oldOffset = pQInfo->offset;

    /*
     * current output space is not enough to keep all the result data of this group, only copy partial results
     * to SQuery object's result buffer
     */
    if (numOfRowsToCopy > pQuery->rec.capacity - numOfResult) {
      numOfRowsToCopy = pQuery->rec.capacity - numOfResult;
      pQInfo->offset += numOfRowsToCopy;
    } else {
      pQInfo->offset = 0;
      pQInfo->subgroupIdx += 1;
    }

    for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
      int32_t size = pRuntimeEnv->pCtx[j].outputBytes;

      char *out = pQuery->sdata[j]->data + numOfResult * size;
      char *in = getPosInResultPage(pRuntimeEnv, j, &result[i]);
      memcpy(out, in + oldOffset * size, size * numOfRowsToCopy);
    }

    numOfResult += numOfRowsToCopy;
    if (numOfResult == pQuery->rec.capacity) {
      break;
    }
  }

  dTrace("QInfo:%p copy data to query buf completed", pQInfo);

#ifdef _DEBUG_VIEW
  displayInterResult(pQuery->sdata, pQuery, numOfResult);
#endif
  return numOfResult;
}

/**
 * copyFromWindowResToSData support copy data in ascending/descending order
 * For interval query of both super table and table, copy the data in ascending order, since the output results are
 * ordered in SWindowResult already. While handling the group by query for both table and super table,
 * all group result are completed already.
 *
 * The order is taken from the group-by expression when one exists, otherwise ascending order is used. The
 * number of copied rows is added to pQuery->rec.rows, which must not exceed rec.capacity (asserted).
 *
 * @param pQInfo query handle
 * @param result array of window results to copy from
 */
void copyFromWindowResToSData(SQInfo *pQInfo, SWindowResult *result) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  int32_t orderType = (pQuery->pGroupbyExpr != NULL) ? pQuery->pGroupbyExpr->orderType : TSDB_ORDER_ASC;
  int32_t numOfResult = doCopyToSData(pQInfo, result, orderType);

  pQuery->rec.rows += numOfResult;

  assert(pQuery->rec.rows <= pQuery->rec.capacity);
}

/**
 * For non-interval queries, record the number of result rows in the window result of the table's group.
 *
 * The row count is only written while it is still 0. Nothing is done for interval queries
 * (intervalTime != 0) or when the window result of the group cannot be obtained.
 *
 * @param pRuntimeEnv    runtime environment of the query
 * @param pTableDataInfo table data info providing the group index
 */
static void updateWindowResNumOfRes(SQueryRuntimeEnv *pRuntimeEnv, STableDataInfo *pTableDataInfo) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // update the number of result for each, only update the number of rows for the corresponding window result.
  if (pQuery->intervalTime == 0) {
    int32_t g = pTableDataInfo->groupIdx;
    assert(pRuntimeEnv->windowResInfo.size > 0);

    SWindowResult *pWindowRes = doSetTimeWindowFromKey(pRuntimeEnv, &pRuntimeEnv->windowResInfo, (char *)&g, sizeof(g));
    if (pWindowRes == NULL) {  // the lookup may fail, see the same check in setExecutionContext()
      return;
    }

    if (pWindowRes->numOfRows == 0) {
      pWindowRes->numOfRows = getNumOfResult(pRuntimeEnv);
    }
  }
}

H
hjxilinx 已提交
3968
void stableApplyFunctionsOnBlock(SQueryRuntimeEnv* pRuntimeEnv, STableDataInfo *pTableDataInfo, SDataBlockInfo *pDataBlockInfo,
3969
                                  SDataStatis *pStatis, SArray *pDataBlock, __block_search_fn_t searchFn) {
3970 3971 3972
  SQuery *          pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo * pTableQueryInfo = pTableDataInfo->pTableQInfo;
  SWindowResInfo *  pWindowResInfo = &pTableQueryInfo->windowResInfo;
3973

3974
  if (pQuery->numOfFilterCols > 0 || pRuntimeEnv->pTSBuf != NULL) {
3975
    //    numOfRes = rowwiseApplyAllFunctions(pRuntimeEnv, &forwardStep, pFields, pDataBlockInfo, pWindowResInfo);
3976
  } else {
3977
    blockwiseApplyAllFunctions(pRuntimeEnv, pStatis, pDataBlockInfo, pWindowResInfo, searchFn, pDataBlock);
3978
  }
3979

3980 3981 3982 3983 3984 3985
  updateWindowResNumOfRes(pRuntimeEnv, pTableDataInfo);
  updatelastkey(pQuery, pTableQueryInfo);
}

bool vnodeHasRemainResults(void *handle) {
  SQInfo *pQInfo = (SQInfo *)handle;
3986

3987 3988 3989
  if (pQInfo == NULL || pQInfo->runtimeEnv.pQuery->interpoType == TSDB_INTERPO_NONE) {
    return false;
  }
3990

3991 3992
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;
3993

3994
  SInterpolationInfo *pInterpoInfo = &pRuntimeEnv->interpoInfo;
3995
  if (pQuery->limit.limit > 0 && pQuery->rec.rows >= pQuery->limit.limit) {
3996 3997
    return false;
  }
3998

3999 4000 4001 4002 4003 4004 4005
  int32_t remain = taosNumOfRemainPoints(pInterpoInfo);
  if (remain > 0) {
    return true;
  } else {
    if (pRuntimeEnv->pInterpoBuf == NULL) {
      return false;
    }
4006

4007
    // query has completed
H
hjxilinx 已提交
4008
    if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
4009
      /*TSKEY ekey =*/ taosGetRevisedEndKey(pQuery->window.ekey, pQuery->order.order, pQuery->intervalTime,
4010 4011 4012 4013 4014 4015
                                        pQuery->slidingTimeUnit, pQuery->precision);
      //      int32_t numOfTotal = taosGetNumOfResultWithInterpo(pInterpoInfo, (TSKEY
      //      *)pRuntimeEnv->pInterpoBuf[0]->data,
      //                                                         remain, pQuery->intervalTime, ekey,
      //                                                         pQuery->pointsToRead);
      //      return numOfTotal > 0;
4016 4017 4018
      assert(0);
      return false;
    }
4019

4020 4021 4022 4023
    return false;
  }
}

4024
/*
 * Placeholder for the result interpolation step.
 *
 * The real implementation is compiled out (see the "#if 0" region below), so
 * none of the arguments is touched and the function always reports that zero
 * rows were produced.
 *
 * @return always 0 while the implementation is disabled
 */
static UNUSED_FUNC int32_t resultInterpolate(SQInfo *pQInfo, tFilePage **data, tFilePage **pDataSrc, int32_t numOfRows,
                                 int32_t outputRows) {
  int32_t numOfGenerated = 0;  // nothing is generated while the body is disabled

#if 0
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *pQuery = &pRuntimeEnv->pQuery;
  
  assert(pRuntimeEnv->pCtx[0].outputBytes == TSDB_KEYSIZE);
  
  // build support structure for performing interpolation
  SSchema *pSchema = calloc(1, sizeof(SSchema) * pQuery->numOfOutputCols);
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    pSchema[i].bytes = pRuntimeEnv->pCtx[i].outputBytes;
    pSchema[i].type = pQuery->pSelectExpr[i].resType;
  }
  
//  SColumnModel *pModel = createColumnModel(pSchema, pQuery->numOfOutputCols, pQuery->pointsToRead);
  
  char *  srcData[TSDB_MAX_COLUMNS] = {0};
  int32_t functions[TSDB_MAX_COLUMNS] = {0};
  
  for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
    srcData[i] = pDataSrc[i]->data;
    functions[i] = pQuery->pSelectExpr[i].pBase.functionId;
  }
  
  assert(0);
//  int32_t numOfRes = taosDoInterpoResult(&pRuntimeEnv->interpoInfo, pQuery->interpoType, data, numOfRows, outputRows,
//                                         pQuery->intervalTime, (int64_t *)pDataSrc[0]->data, pModel, srcData,
//                                         pQuery->defaultVal, functions, pRuntimeEnv->pTabObj->pointsPerFileBlock);
  
  destroyColumnModel(pModel);
  free(pSchema);
#endif

  return numOfGenerated;
}

/*
 * Serialize the query result into a flat message buffer.
 *
 * The output columns are written one after another: for every output column,
 * numOfRows values of resBytes bytes each are copied from pQuery->sdata[col]
 * into data. Once everything has been copied, a query that was already in the
 * QUERY_COMPLETED state is switched to QUERY_OVER.
 *
 * @param pQInfo     query whose result columns are copied
 * @param numOfRows  number of rows to copy from each column
 * @param data       destination buffer; the caller provides the storage
 */
static void doCopyQueryResultToMsg(SQInfo *pQInfo, int32_t numOfRows, char *data) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  char *  pDst = data;
  int32_t col = 0;

  while (col < pQuery->numOfOutputCols) {
    int32_t bytes = pQuery->pSelectExpr[col].resBytes;
    int32_t colLen = bytes * numOfRows;

    memmove(pDst, pQuery->sdata[col]->data, colLen);
    pDst += colLen;

    ++col;
  }

  // every row has been handed over, so a completed query is now over
  if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
    setQueryStatus(pQuery, QUERY_OVER);
  }
}

/*
 * Placeholder for the interpolation driver.
 *
 * The loop that repeatedly interpolates and applies the limit/offset is compiled
 * out (see the "#if 0" region), so the arguments are left untouched and the
 * function always reports zero rows.
 *
 * @return always 0 while the implementation is disabled
 */
int32_t vnodeQueryResultInterpolate(SQInfo *pQInfo, tFilePage **pDst, tFilePage **pDataSrc, int32_t numOfRows,
                                    int32_t *numOfInterpo) {
  int32_t numOfFinalRes = 0;  // nothing is generated while the body is disabled

//  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
//  SQuery *          pQuery = pRuntimeEnv->pQuery;
#if 0
  while (1) {
    numOfRows = taosNumOfRemainPoints(&pRuntimeEnv->interpoInfo);
    
    TSKEY   ekey = taosGetRevisedEndKey(pQuery->window.skey, pQuery->order.order, pQuery->intervalTime,
                                        pQuery->slidingTimeUnit, pQuery->precision);
    int32_t numOfFinalRows = taosGetNumOfResultWithInterpo(&pRuntimeEnv->interpoInfo, (TSKEY *)pDataSrc[0]->data,
                                                           numOfRows, pQuery->intervalTime, ekey, pQuery->pointsToRead);
    
    int32_t ret = resultInterpolate(pQInfo, pDst, pDataSrc, numOfRows, numOfFinalRows);
    assert(ret == numOfFinalRows);
    
    /* reached the start position of according to offset value, return immediately */
    if (pQuery->limit.offset == 0) {
      return ret;
    }
    
    if (pQuery->limit.offset < ret) {
      ret -= pQuery->limit.offset;
      // todo !!!!there exactly number of interpo is not valid.
      // todo refactor move to the beginning of buffer
      for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
        memmove(pDst[i]->data, pDst[i]->data + pQuery->pSelectExpr[i].resBytes * pQuery->limit.offset,
                ret * pQuery->pSelectExpr[i].resBytes);
      }
      pQuery->limit.offset = 0;
      return ret;
    } else {
      pQuery->limit.offset -= ret;
      ret = 0;
    }
    
    if (!vnodeHasRemainResults(pQInfo)) {
      return ret;
    }
  }
#endif

  return numOfFinalRes;
}

/*
 * Placeholder for dumping the cost summary of a query to the trace log.
 *
 * The whole body is compiled out (see the "#if 0" region), so calling this
 * function currently has no effect.
 */
void vnodePrintQueryStatistics(SQInfo *pQInfo) {
  (void)pQInfo;  // unused while the implementation is disabled

#if 0
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;

  SQuery *pQuery = pRuntimeEnv->pQuery;

  SQueryCostSummary *pSummary = &pRuntimeEnv->summary;
  if (pRuntimeEnv->pResultBuf == NULL) {
    pSummary->tmpBufferInDisk = 0;
  } else {
    pSummary->tmpBufferInDisk = getResBufSize(pRuntimeEnv->pResultBuf);
  }
  
  dTrace("QInfo:%p statis: comp blocks:%d, size:%d Bytes, elapsed time:%.2f ms", pQInfo, pSummary->readCompInfo,
         pSummary->totalCompInfoSize, pSummary->loadCompInfoUs / 1000.0);
  
  dTrace("QInfo:%p statis: field info: %d, size:%d Bytes, avg size:%.2f Bytes, elapsed time:%.2f ms", pQInfo,
         pSummary->readField, pSummary->totalFieldSize, (double)pSummary->totalFieldSize / pSummary->readField,
         pSummary->loadFieldUs / 1000.0);
  
  dTrace(
      "QInfo:%p statis: file blocks:%d, size:%d Bytes, elapsed time:%.2f ms, skipped:%d, in-memory gen null:%d Bytes",
      pQInfo, pSummary->readDiskBlocks, pSummary->totalBlockSize, pSummary->loadBlocksUs / 1000.0,
      pSummary->skippedFileBlocks, pSummary->totalGenData);
  
  dTrace("QInfo:%p statis: cache blocks:%d", pQInfo, pSummary->blocksInCache, 0);
  dTrace("QInfo:%p statis: temp file:%d Bytes", pQInfo, pSummary->tmpBufferInDisk);
  
  dTrace("QInfo:%p statis: file:%d, table:%d", pQInfo, pSummary->numOfFiles, pSummary->numOfTables);
  dTrace("QInfo:%p statis: seek ops:%d", pQInfo, pSummary->numOfSeek);
  
  double total = pSummary->fileTimeUs + pSummary->cacheTimeUs;
  double io = pSummary->loadCompInfoUs + pSummary->loadBlocksUs + pSummary->loadFieldUs;
  
  // todo add the intermediate result save cost!!
  double computing = total - io;
  
  dTrace(
      "QInfo:%p statis: total elapsed time:%.2f ms, file:%.2f ms(%.2f%), cache:%.2f ms(%.2f%). io:%.2f ms(%.2f%),"
      "comput:%.2fms(%.2f%)",
      pQInfo, total / 1000.0, pSummary->fileTimeUs / 1000.0, pSummary->fileTimeUs * 100 / total,
      pSummary->cacheTimeUs / 1000.0, pSummary->cacheTimeUs * 100 / total, io / 1000.0, io * 100 / total,
      computing / 1000.0, computing * 100 / total);
#endif
}

4166
/*
 * Prepare a query for execution.
 *
 * Steps, in order:
 *   1. adjust the scan limitation / scan order and seed pQuery->lastKey,
 *   2. open a tsdb query handle covering pQInfo->groupInfo and the query columns,
 *   3. wire the runtime environment (query, TS buffer, cursor, super-table flag),
 *   4. create the runtime env and, when required, the disk-based result buffer
 *      and the window result info,
 *   5. prepare the point-interpolation support data and the interpolation info.
 *
 * @param pQInfo         query to initialize
 * @param param          optional TS buffer, stored in runtimeEnv.pTSBuf; may be NULL
 * @param tsdb           tsdb handle, stored in pQInfo->tsdb
 * @param isSTableQuery  true for a super table query
 * @return TSDB_CODE_SUCCESS, or the error code reported by setupQueryRuntimeEnv /
 *         createDiskbasedResultBuffer
 */
int32_t doInitQInfo(SQInfo *pQInfo, void *param, void* tsdb, bool isSTableQuery) {
  SQueryRuntimeEnv *pEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pEnv->pQuery;

  setScanLimitationByResultBuffer(pQuery);
  changeExecuteScanOrder(pQuery, false);

  // dataInCache requires lastKey value
  pQuery->lastKey = pQuery->window.skey;

  STsdbQueryCond cond = {
    .twindow = pQuery->window,
    .order   = pQuery->order.order,
    .colList = pQuery->colList,
  };

  // the column list is only needed while the query handle is being created
  SArray *pColList = taosArrayInit(pQuery->numOfCols, sizeof(pQuery->colList[0]));
  for (int32_t col = 0; col < pQuery->numOfCols; ++col) {
    taosArrayPush(pColList, &pQuery->colList[col]);
  }

  pEnv->pQueryHandle = tsdbQueryTables(tsdb, &cond, &pQInfo->groupInfo, pColList);
  taosArrayDestroy(pColList);

  pQInfo->tsdb = tsdb;

  pEnv->pQuery = pQuery;
  pEnv->pTSBuf = param;
  pEnv->cur.vnodeIndex = -1;
  pEnv->stableQuery = isSTableQuery;

  if (param != NULL) {
    // traverse the TS buffer ascending when its order matches the query order
    int16_t order = (pQuery->order.order == pEnv->pTSBuf->tsOrder) ? TSDB_ORDER_ASC : TSDB_ORDER_DESC;
    tsBufSetTraverseOrder(pEnv->pTSBuf, order);
  }

  // create runtime environment
  int32_t code = setupQueryRuntimeEnv(pEnv, NULL, pQuery->order.order);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  pEnv->numOfRowsPerPage = getNumOfRowsInResultPage(pQuery, isSTableQuery);

  if (isSTableQuery) {
    int32_t numOfPages = getInitialPageNum(pQInfo);

    code = createDiskbasedResultBuffer(&pEnv->pResultBuf, numOfPages, pQuery->rowSize);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (pQuery->intervalTime == 0) {
      // group by a normal column uses that column's type, otherwise the key is the group id
      int16_t keyType = isGroupbyNormalCol(pQuery->pGroupbyExpr)
                            ? getGroupbyColumnType(pQuery, pQuery->pGroupbyExpr)
                            : TSDB_DATA_TYPE_INT;

      initWindowResInfo(&pEnv->windowResInfo, pEnv, 512, 4096, keyType);
    }
  } else if (isGroupbyNormalCol(pQuery->pGroupbyExpr) || isIntervalQuery(pQuery)) {
    int32_t numOfPages = getInitialPageNum(pQInfo);

    code = createDiskbasedResultBuffer(&pEnv->pResultBuf, numOfPages, pQuery->rowSize);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    // group by a normal column uses that column's type, interval results are keyed by timestamp
    int16_t keyType = isGroupbyNormalCol(pQuery->pGroupbyExpr)
                          ? getGroupbyColumnType(pQuery, pQuery->pGroupbyExpr)
                          : TSDB_DATA_TYPE_TIMESTAMP;

    initWindowResInfo(&pEnv->windowResInfo, pEnv, numOfPages, 4096, keyType);
  }

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);

  SPointInterpoSupporter interpSupporter = {0};
  pointInterpSupporterInit(pQuery, &interpSupporter);

  /*
   * In case of a last_row query without a query range, the query range is
   * normalized first. When the normalization reports failure, the waiter on
   * dataReady is released and the initialization ends successfully.
   */
  if (isFirstLastRowQuery(pQuery) && notHasQueryTimeRange(pQuery) &&
      !normalizeUnBoundLastRowQuery(pQInfo, &interpSupporter)) {
    sem_post(&pQInfo->dataReady);
    pointInterpSupporterDestroy(&interpSupporter);
    return TSDB_CODE_SUCCESS;
  }

  /*
   * here we set the value for before and after the specified time into the
   * parameter for interpolation query
   */
  pointInterpSupporterSetData(pQInfo, &interpSupporter);
  pointInterpSupporterDestroy(&interpSupporter);

  // todo move to other location
  //  if (!forwardQueryStartPosIfNeeded(pQInfo, pQInfo, dataInDisk, dataInCache)) {
  //    return TSDB_CODE_SUCCESS;
  //  }

  int64_t startTs = taosGetIntervalStartTimestamp(pQuery->window.skey, pQuery->intervalTime, pQuery->slidingTimeUnit,
                                                  pQuery->precision);
  taosInitInterpoInfo(&pEnv->interpoInfo, pQuery->order.order, startTs, 0, 0);
  //  allocMemForInterpo(pQInfo, pQuery, pMeterObj);

  if (!isPointInterpoQuery(pQuery)) {
    //    assert(pQuery->pos >= 0 && pQuery->slot >= 0);
  }

  // pQuery->window.skey may have been changed above, so refresh the lastKey value
  pQuery->lastKey = pQuery->window.skey;
  return TSDB_CODE_SUCCESS;
}

4290
/*
 * Check whether the group-by expression contains a column whose flag equals
 * TSDB_COL_TAG.
 *
 * NOTE(review): the comparison is an exact equality on the flag, whereas the
 * TSDB_COL_IS_TAG() macro at the top of this file tests the bit mask. Confirm
 * which of the two is intended before relying on this helper.
 *
 * @param pGroupbyExpr  group-by expression, may be NULL
 * @param pSidset       not used
 * @return true if at least one group-by column is flagged TSDB_COL_TAG
 */
static UNUSED_FUNC bool isGroupbyEachTable(SSqlGroupbyExpr *pGroupbyExpr, STableGroupInfo *pSidset) {
  bool hasTagCol = false;

  // a missing expression has no columns; an empty one never enters the loop
  if (pGroupbyExpr != NULL) {
    for (int32_t col = 0; !hasTagCol && col < pGroupbyExpr->numOfGroupCols; ++col) {
      //      assert(pSidset->numOfTables == pSidset->numOfSubSet);
      hasTagCol = (pGroupbyExpr->columnInfo[col].flag == TSDB_COL_TAG);
    }
  }

  return hasTagCol;
}

4306
/*
 * Check whether nextKey has not yet passed the end of the query window in the
 * scan direction.
 *
 * @return for an ascending query: nextKey <= window.ekey;
 *         for a descending query: nextKey >= window.ekey
 */
static UNUSED_FUNC bool doCheckWithPrevQueryRange(SQuery *pQuery, TSKEY nextKey) {
  TSKEY endKey = pQuery->window.ekey;

  if (QUERY_IS_ASC_QUERY(pQuery)) {
    return nextKey <= endKey;
  }

  return nextKey >= endKey;
}

4315
/*
 * Clear the "complete" flag of the result info attached to every output
 * column context, so that the functions can run again for the next table.
 * Contexts without a result info are skipped.
 */
static void enableExecutionForNextTable(SQueryRuntimeEnv *pRuntimeEnv) {
  int32_t numOfOutput = pRuntimeEnv->pQuery->numOfOutputCols;

  for (int32_t col = 0; col < numOfOutput; ++col) {
    SResultInfo *pInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[col]);
    if (pInfo == NULL) {
      continue;
    }

    pInfo->complete = false;
  }
}

H
hjxilinx 已提交
4326
/*
 * Scan every data block delivered by the query handle and apply the query
 * functions to it.
 *
 * For each block the owning table is looked up in pQInfo->groupInfo by
 * comparing the table id with blockInfo.sid, the per-table query context is
 * installed, and stableApplyFunctionsOnBlock() is invoked on the loaded block.
 * The scan stops early when the query is killed, or when setAdditionalInfo()
 * fails for an interval query (the error is stored in pQInfo->code).
 *
 * @param pQInfo  query being executed
 * @return elapsed time of the scan, in milliseconds
 */
static int64_t queryOnDataBlocks(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  int64_t st = taosGetTimestampMs();

  TsdbQueryHandleT *pQueryHandle = pRuntimeEnv->pQueryHandle;
  while (tsdbNextDataBlock(pQueryHandle)) {
    if (isQueryKilled(pQInfo)) {
      break;
    }

    SDataBlockInfo  blockInfo = tsdbRetrieveDataBlockInfo(pQueryHandle);
    STableDataInfo *pTableDataInfo = NULL;
    STable *        pTable = NULL;

    /*
     * todo opt performance using hash table
     *
     * Locate the table that owns this block. The search stops at the first match:
     * the previous version left only the inner loop and kept scanning the
     * remaining groups. This assumes a table id appears in exactly one group.
     */
    size_t numOfGroup = taosArrayGetSize(pQInfo->groupInfo.pGroupList);
    for (size_t i = 0; i < numOfGroup && pTableDataInfo == NULL; ++i) {
      SArray *group = taosArrayGetP(pQInfo->groupInfo.pGroupList, i);

      size_t num = taosArrayGetSize(group);
      for (size_t j = 0; j < num; ++j) {
        SPair *         p = taosArrayGet(group, j);
        STableDataInfo *pInfo = p->sec;

        if (pInfo->pTableQInfo->tid == blockInfo.sid) {
          pTableDataInfo = p->sec;
          pTable = p->first;
          break;
        }
      }
    }

    assert(pTableDataInfo != NULL && pTableDataInfo->pTableQInfo != NULL);
    STableQueryInfo *pTableQueryInfo = pTableDataInfo->pTableQInfo;

    restoreIntervalQueryRange(pRuntimeEnv, pTableQueryInfo);

    SDataStatis *pStatis = NULL;
    SArray *     pDataBlock = loadDataBlockOnDemand(pRuntimeEnv, &blockInfo, &pStatis);

    TSKEY nextKey = blockInfo.window.ekey;
    if (!isIntervalQuery(pQuery)) {
      setExecutionContext(pQInfo, pTableQueryInfo, pTable, pTableDataInfo->groupIdx, nextKey);
    } else {  // interval query
      setIntervalQueryRange(pTableQueryInfo, pQInfo, nextKey);

      int32_t ret = setAdditionalInfo(pQInfo, pTable, pTableQueryInfo);
      if (ret != TSDB_CODE_SUCCESS) {
        // record the failure and abort the scan; the caller inspects pQInfo->code
        pQInfo->code = ret;
        return taosGetTimestampMs() - st;
      }
    }

    stableApplyFunctionsOnBlock(pRuntimeEnv, pTableDataInfo, &blockInfo, pStatis, pDataBlock, binarySearchForKey);
  }

  int64_t et = taosGetTimestampMs();
  return et - st;
}

4388 4389 4390 4391
/*
 * Prepare the runtime environment for querying a single table of the first
 * table group.
 *
 * The tag values of the table are installed, a new tsdb query handle limited to
 * this table and to the range [lastKey, win.ekey] of its query info is opened,
 * the TS buffer cursor (if any) is positioned, and the output buffers of all
 * function contexts are initialized.
 *
 * @param pQInfo  query being executed
 * @param index   position of the table inside the first group of pQInfo->groupInfo
 * @return false when a TS buffer is attached, no cursor has been saved yet
 *         (cur.vnodeIndex == -1) and no TS data exists for the current tag value;
 *         true otherwise
 */
static bool multiTableMultioutputHelper(SQInfo *pQInfo, int32_t index) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *pQuery = pRuntimeEnv->pQuery;

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
  SArray* group = taosArrayGetP(pQInfo->groupInfo.pGroupList, 0);
  SPair* p = taosArrayGet(group, index);

  STable* pTable = p->first;
  STableDataInfo* pInfo = p->sec;

  setTagVal(pRuntimeEnv, pTable->tableId, pQInfo->tsdb);

  // the format carries a "tid:%d" conversion, so the table id must be passed as well
  dTrace("QInfo:%p query on (%d): uid:%" PRIu64 ", tid:%d, qrange:%" PRId64 "-%" PRId64, pQInfo, index,
         pTable->tableId.uid, pTable->tableId.tid, pInfo->pTableQInfo->lastKey, pInfo->pTableQInfo->win.ekey);

  STsdbQueryCond cond = {
      .twindow = {pInfo->pTableQInfo->lastKey, pInfo->pTableQInfo->win.ekey},
      .order   = pQuery->order.order,
      .colList = pQuery->colList,
  };

  SArray *cols = taosArrayInit(pQuery->numOfCols, sizeof(pQuery->colList[0]));
  for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
    taosArrayPush(cols, &pQuery->colList[i]);
  }

  // build a temporary group list that includes only the current table
  SArray* g1 = taosArrayInit(1, POINTER_BYTES);
  STableGroupInfo gp = {.numOfTables = 1, .pGroupList = g1};

  SArray* tx = taosArrayInit(1, sizeof(SPair));
  taosArrayPush(tx, p);

  taosArrayPush(g1, &tx);

  /*
   * NOTE(review): the handle stored previously in pQueryHandle is overwritten here
   * without being released, and g1/tx are never destroyed. Whether tsdbQueryTables()
   * keeps a reference to the group list is not visible here -- confirm and release
   * them if it does not.
   */
  pRuntimeEnv->pQueryHandle = tsdbQueryTables(pQInfo->tsdb, &cond, &gp, cols);

  // the column list is not needed once the handle exists (same as in doInitQInfo)
  taosArrayDestroy(cols);

//  vnodeUpdateQueryColumnIndex(pQuery, pRuntimeEnv->pMeterObj);
//  vnodeUpdateFilterColumnIndex(pQuery);

  if (pRuntimeEnv->pTSBuf != NULL) {
    if (pRuntimeEnv->cur.vnodeIndex == -1) {
      int64_t tag = pRuntimeEnv->pCtx[0].tag.i64Key;
      STSElem elem = tsBufGetElemStartPos(pRuntimeEnv->pTSBuf, 0, tag);

      // failed to find data with the specified tag value
      if (elem.vnode < 0) {
        return false;
      }
    } else {
      // resume from the cursor saved by the previous round
      tsBufSetCursor(pRuntimeEnv->pTSBuf, &pRuntimeEnv->cur);
    }
  }

  initCtxOutputBuf(pRuntimeEnv);
  return true;
}

4446
/*
 * Run the point interpolation / last_row query on one table of the first group
 * and accumulate its result.
 *
 * NOTE: the function still contains an unconditional assert(0) because the
 * query range normalization has not been restored yet, so it must not be
 * called in builds where assertions are enabled.
 *
 * @param pQInfo  query being executed
 * @param index   position of the table inside the group
 * @param start   not used
 * @return number of result rows produced for this table (0 or 1); 0 is also
 *         returned when the table could not be prepared for the query
 */
static UNUSED_FUNC int64_t doCheckMetersInGroup(SQInfo *pQInfo, int32_t index, int32_t start) {
  SQueryRuntimeEnv *pEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pEnv->pQuery;

  bool prepared = multiTableMultioutputHelper(pQInfo, index);
  if (!prepared) {
    return 0;
  }

  SPointInterpoSupporter supporter = {0};
  pointInterpSupporterInit(pQuery, &supporter);
  assert(0);

  //  if (!normalizedFirstQueryRange(dataInDisk, dataInCache, pSupporter, &pointInterpSupporter, NULL)) {
  //    pointInterpSupporterDestroy(&pointInterpSupporter);
  //    return 0;
  //  }

  /*
   * here we set the value for before and after the specified time into the
   * parameter for interpolation query
   */
  pointInterpSupporterSetData(pQInfo, &supporter);
  pointInterpSupporterDestroy(&supporter);

  scanAllDataBlocks(pEnv);

  // first/last_row query, do not invoke the finalize for super table query
  finalizeQueryResult(pEnv);

  int64_t numOfRes = getNumOfResult(pEnv);
  assert(numOfRes == 1 || numOfRes == 0);

  if (numOfRes <= 0) {
    return numOfRes;
  }

  // accumulate the point interpolation result
  pQuery->rec.rows += numOfRes;
  forwardCtxOutputBuf(pEnv, numOfRes);

  return numOfRes;
}

/**
 * super table query handler
 * 1. super table projection query, group-by on normal columns query, ts-comp query
 * 2. point interpolation query, last row query
 *
 * @param pQInfo
 */
4494
static void sequentialTableProcess(SQInfo *pQInfo) {
4495
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
4496
  SQuery* pQuery = pRuntimeEnv->pQuery;
4497 4498
  setQueryStatus(pQuery, QUERY_COMPLETED);
  
4499
  size_t numOfGroups = taosArrayGetSize(pQInfo->groupInfo.pGroupList);
4500 4501 4502 4503
  
  if (isPointInterpoQuery(pQuery)) {
    resetCtxOutputBuf(pRuntimeEnv);
    assert(pQuery->limit.offset == 0 && pQuery->limit.limit != 0);
4504 4505 4506 4507 4508 4509 4510

#if 0
    while (pQInfo->subgroupIdx < numOfGroups) {

      SArray* group = taosArrayGetP(pQInfo->groupInfo.pGroupList, pQInfo->subgroupIdx);
      size_t numOfTable = taosArrayGetSize(group);

4511
      if (isFirstLastRowQuery(pQuery)) {
H
hjxilinx 已提交
4512
        dTrace("QInfo:%p last_row query on vid:%d, numOfGroups:%d, current group:%d", pQInfo, vid, pTableIdList->numOfSubSet,
4513
               pQInfo->subgroupIdx);
4514 4515 4516 4517 4518
        
        TSKEY   key = -1;
        int32_t index = -1;
        
        // choose the last key for one group
4519
        pQInfo->tableIndex = 0;
4520
        
4521
        for (int32_t k = 0; k < numOfTable; ++k, pQInfo->tableIndex++) {
4522
          if (isQueryKilled(pQInfo)) {
4523 4524 4525 4526
            return;
          }
        }
        
4527 4528
        pQuery->window.skey = key;
        pQuery->window.ekey = key;
4529
        
4530 4531
//        int64_t num = doCheckMetersInGroup(pQInfo, index, start);
//        assert(num >= 0);
4532
      } else {
H
hjxilinx 已提交
4533
        dTrace("QInfo:%p interp query on vid:%d, numOfGroups:%d, current group:%d", pQInfo, vid, pTableIdList->numOfSubSet,
4534
               pQInfo->subgroupIdx);
4535 4536
        
        for (int32_t k = start; k <= end; ++k) {
4537
          if (isQueryKilled(pQInfo)) {
4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554
            setQueryStatus(pQuery, QUERY_NO_DATA_TO_CHECK);
            return;
          }
          
          pQuery->skey = pSupporter->rawSKey;
          pQuery->ekey = pSupporter->rawEKey;
          
          int64_t num = doCheckMetersInGroup(pQInfo, k, start);
          if (num == 1) {
            break;
          }
        }
      }
      
      pSupporter->subgroupIdx++;
      
      // output buffer is full, return to client
H
hjxilinx 已提交
4555
      if (pQuery->size >= pQuery->pointsToRead) {
4556 4557 4558
        break;
      }
    }
4559 4560
#endif
  
4561
  } else {
4562
    createTableDataInfo(pQInfo);
4563 4564
    
    /*
4565
     * 1. super table projection query, 2. group-by on normal columns query, 3. ts-comp query
4566 4567 4568
     * if the subgroup index is larger than 0, results generated by group by tbname,k is existed.
     * we need to return it to client in the first place.
     */
4569
    if (pQInfo->subgroupIdx > 0) {
4570
      copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
4571
      pQuery->rec.total += pQuery->rec.rows;
4572
      
4573
      if (pQuery->rec.rows > 0) {
4574 4575 4576 4577
        return;
      }
    }
    
4578 4579
    // all data have returned already
    if (pQInfo->tableIndex >= pQInfo->groupInfo.numOfTables) {
4580 4581 4582 4583 4584 4585
      return;
    }
    
    resetCtxOutputBuf(pRuntimeEnv);
    resetTimeWindowInfo(pRuntimeEnv, &pRuntimeEnv->windowResInfo);
    
4586 4587 4588 4589 4590
    SArray* group = taosArrayGetP(pQInfo->groupInfo.pGroupList, 0);
    assert(taosArrayGetSize(group) == pQInfo->groupInfo.numOfTables && 1 == taosArrayGetSize(pQInfo->groupInfo.pGroupList));
    
    while (pQInfo->tableIndex < pQInfo->groupInfo.numOfTables) {
      int32_t k = pQInfo->tableIndex;
4591
      
4592
      if (isQueryKilled(pQInfo)) {
4593 4594 4595
        return;
      }
      
4596 4597 4598 4599
      SPair *p = taosArrayGet(group, k);
      STableDataInfo* pInfo = p->sec;
      
      TSKEY skey = pInfo->pTableQInfo->lastKey;
4600
      if (skey > 0) {
4601
        pQuery->window.skey = skey;
4602 4603
      }
      
4604 4605
      if (!multiTableMultioutputHelper(pQInfo, k)) {
        pQInfo->tableIndex++;
4606 4607 4608
        continue;
      }
      
4609
//      SPointInterpoSupporter pointInterpSupporter = {0};
4610 4611 4612
      
      // TODO handle the limit problem
      if (pQuery->numOfFilterCols == 0 && pQuery->limit.offset > 0) {
4613
//        forwardQueryStartPosition(pRuntimeEnv);
4614
        
4615 4616
        if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
          pQInfo->tableIndex++;
4617 4618 4619 4620
          continue;
        }
      }
      
H
hjxilinx 已提交
4621
      scanAllDataBlocks(pRuntimeEnv);
4622
      
4623
      pQuery->rec.rows = getNumOfResult(pRuntimeEnv);
4624 4625 4626 4627
      doSkipResults(pRuntimeEnv);
      
      // the limitation of output result is reached, set the query completed
      if (doRevisedResultsByLimit(pQInfo)) {
4628
        pQInfo->tableIndex = pQInfo->groupInfo.numOfTables;
4629 4630 4631 4632 4633 4634
        break;
      }
      
      // enable execution for next table, when handling the projection query
      enableExecutionForNextTable(pRuntimeEnv);
      
4635
      if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
4636 4637 4638 4639 4640 4641
        /*
         * query range is identical in terms of all meters involved in query,
         * so we need to restore them at the *beginning* of query on each meter,
         * not the consecutive query on meter on which is aborted due to buffer limitation
         * to ensure that, we can reset the query range once query on a meter is completed.
         */
4642 4643
        pQInfo->tableIndex++;
        pInfo->pTableQInfo->lastKey = pQuery->lastKey;
4644 4645
        
        // if the buffer is full or group by each table, we need to jump out of the loop
4646 4647
        if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL) /*||
            isGroupbyEachTable(pQuery->pGroupbyExpr, pSupporter->pSidSet)*/) {
4648 4649 4650 4651
          break;
        }
        
      } else {  // forward query range
4652
        pQuery->window.skey = pQuery->lastKey;
4653 4654
        
        // all data in the result buffer are skipped due to the offset, continue to retrieve data from current meter
4655 4656
        if (pQuery->rec.rows == 0) {
          assert(!Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL));
4657 4658
          continue;
        } else {
4659 4660 4661 4662
//          pQInfo->pTableQuerySupporter->pMeterSidExtInfo[k]->key = pQuery->lastKey;
//          // buffer is full, wait for the next round to retrieve data from current meter
//          assert(Q_STATUS_EQUAL(pQuery->over, QUERY_RESBUF_FULL));
//          break;
4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679
        }
      }
    }
  }
  
  /*
   * 1. super table projection query, group-by on normal columns query, ts-comp query
   * 2. point interpolation query, last row query
   *
   * group-by on normal columns query and last_row query do NOT invoke the finalizer here,
   * since the finalize stage will be done at the client side.
   *
   * projection query, point interpolation query do not need the finalizer.
   *
   * Only the ts-comp query requires the finalizer function to be executed here.
   */
  if (isTSCompQuery(pQuery)) {
H
hjxilinx 已提交
4680
    finalizeQueryResult(pRuntimeEnv);
4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700
  }
  
  if (pRuntimeEnv->pTSBuf != NULL) {
    pRuntimeEnv->cur = pRuntimeEnv->pTSBuf->cur;
  }
  
  // todo refactor
  if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
    SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;
    
    for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
      SWindowStatus *pStatus = &pWindowResInfo->pResult[i].status;
      pStatus->closed = true;  // enable return all results for group by normal columns
      
      SWindowResult *pResult = &pWindowResInfo->pResult[i];
      for (int32_t j = 0; j < pQuery->numOfOutputCols; ++j) {
        pResult->numOfRows = MAX(pResult->numOfRows, pResult->resultInfo[j].numOfRes);
      }
    }
    
4701 4702
    pQInfo->subgroupIdx = 0;
    pQuery->rec.rows = 0;
4703 4704 4705
    copyFromWindowResToSData(pQInfo, pWindowResInfo->pResult);
  }
  
4706
  pQuery->rec.total += pQuery->rec.rows;
4707
  
4708 4709 4710
  dTrace( "QInfo %p, numOfTables:%d, index:%d, numOfGroups:%d, %d points returned, total:%d totalReturn:%d,"
      " offset:%" PRId64, pQInfo, pQInfo->groupInfo.numOfTables, pQInfo->tableIndex, numOfGroups,
      pQuery->rec.rows, pQuery->rec.total, pQuery->limit.offset);
4711 4712
}

H
hjxilinx 已提交
4713 4714
/*
 * Attach a freshly allocated STableDataInfo to every table (SPair::sec) of
 * every group in pQInfo->groupInfo.
 *
 * Tables are numbered consecutively across all groups. The function returns as
 * soon as it meets a table that already carries a data info, which is taken to
 * mean that the structures were created by an earlier call.
 */
static void createTableDataInfo(SQInfo* pQInfo) {
  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;

  // todo make sure the reference count of every involved table is increased, to guarantee that the tables stay valid
  size_t numOfGroups = taosArrayGetSize(pQInfo->groupInfo.pGroupList);

  int32_t tableIdx = 0;  // running table index across all groups
  for (int32_t groupIdx = 0; groupIdx < numOfGroups; ++groupIdx) {  // load all meter meta info
    SArray *pGroup = *(SArray**) taosArrayGet(pQInfo->groupInfo.pGroupList, groupIdx);

    size_t numOfTables = taosArrayGetSize(pGroup);
    for (int32_t t = 0; t < numOfTables; ++t, ++tableIdx) {
      SPair* pPair = (SPair*) taosArrayGet(pGroup, t);

      // STableDataInfo has been created for each table
      if (pPair->sec != NULL) {  // todo refactor
        return;
      }

      STableDataInfo* pDataInfo = calloc(1, sizeof(STableDataInfo));
      setTableDataInfo(pDataInfo, tableIdx, groupIdx);

      STable* pTable = (STable*) pPair->first;
      pDataInfo->pTableQInfo = createTableQueryInfo(&pQInfo->runtimeEnv, pTable->tableId.tid, pQuery->window);

      pPair->sec = pDataInfo;
    }
  }
}

H
hjxilinx 已提交
4744
/*
 * Hook that is invoked before the reverse scan starts.
 *
 * The per-table adjustment is disabled at the moment (see the commented-out
 * code), so the function does nothing.
 */
static void prepareQueryInfoForReverseScan(SQInfo *pQInfo) {
  (void)pQInfo;  // unused until the per-table adjustment is restored

//  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;
//
//  for (int32_t i = 0; i < pQInfo->groupInfo.numOfTables; ++i) {
//    STableQueryInfo *pTableQueryInfo = pQInfo->pTableDataInfo[i].pTableQInfo;
//    changeMeterQueryInfoForSuppleQuery(pQuery, pTableQueryInfo);
//  }
}

H
hjxilinx 已提交
4753 4754 4755 4756
/*
 * Switch the runtime environment to the supplementary (reverse) scan:
 * set the scan flag, disable the functions that must not run in the reverse
 * scan, flip the traverse order of the TS buffer (if any) and swap the start
 * and end key of the query window.
 */
static void doSaveContext(SQInfo* pQInfo) {
  SQueryRuntimeEnv* pEnv = &pQInfo->runtimeEnv;
  SQuery*           pQuery = pEnv->pQuery;

  SET_SUPPLEMENT_SCAN_FLAG(pEnv);
  disableFuncForReverseScan(pQInfo, pQuery->order.order);

  if (pEnv->pTSBuf != NULL) {
    // toggle the lowest bit of the cursor order
    pEnv->pTSBuf->cur.order ^= 1u;
  }

  SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
  prepareQueryInfoForReverseScan(pQInfo);
}

/*
 * Undo doSaveContext() after the reverse scan: swap the query window keys
 * back, flip the traverse order of the TS buffer (if any) again, re-enable the
 * functions for the forward scan and restore the master scan flag.
 */
static void doRestoreContext(SQInfo* pQInfo) {
  SQueryRuntimeEnv* pEnv = &pQInfo->runtimeEnv;
  SQuery*           pQuery = pEnv->pQuery;

  SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);

  if (pEnv->pTSBuf != NULL) {
    // toggle the lowest bit of the cursor order
    pEnv->pTSBuf->cur.order ^= 1;
  }

  enableFuncForForwardScan(pEnv, pQuery->order.order);
  SET_MASTER_SCAN_FLAG(pEnv);
}

H
hjxilinx 已提交
4782 4783 4784 4785
/*
 * Close every time window once the forward scan has finished.
 *
 * For an interval query the window result info of each table in each group is
 * closed; otherwise only the window result info of the runtime environment
 * (the group results) is closed.
 */
static void doCloseAllTimeWindowAfterScan(SQInfo* pQInfo) {
  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;

  if (!isIntervalQuery(pQuery)) {  // close results for group result
    closeAllTimeWindow(&pQInfo->runtimeEnv.windowResInfo);
    return;
  }

  size_t numOfGroups = taosArrayGetSize(pQInfo->groupInfo.pGroupList);
  for (int32_t g = 0; g < numOfGroups; ++g) {
    SArray* pGroup = taosArrayGetP(pQInfo->groupInfo.pGroupList, g);

    size_t numOfTables = taosArrayGetSize(pGroup);
    for (int32_t t = 0; t < numOfTables; ++t) {
      SPair*          pPair = taosArrayGet(pGroup, t);
      STableDataInfo* pDataInfo = pPair->sec;

      closeAllTimeWindow(&pDataInfo->pTableQInfo->windowResInfo);
    }
  }
}

/*
 * Super table query handler for queries whose results are merged across tables.
 *
 * First invocation (subgroupIdx == 0): create the per-table support structures,
 * scan all qualified data blocks, close the time windows, run the reverse scan
 * when required, and finally copy the (merged) results into the output buffer.
 * The execution is aborted when an error code is set or the query is killed.
 *
 * Later invocations (subgroupIdx > 0): the scan has been done already, so only
 * the next part of the results is copied into the output buffer.
 *
 * @param pQInfo  query being executed
 */
static void multiTableQueryProcess(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  if (pQInfo->subgroupIdx > 0) {
    /*
     * if the subgroupIdx > 0, the query process must be completed yet, we only need to
     * copy the data into output buffer
     */
    if (isIntervalQuery(pQuery)) {
      copyResToQueryResultBuf(pQInfo, pQuery);

#ifdef _DEBUG_VIEW
      displayInterResult(pQuery->sdata, pQuery, pQuery->sdata[0]->len);
#endif
    } else {
      copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
    }

    /*
     * Accumulate the rows of this round into the running total. The previous code added
     * rec.rows to itself, which doubled the number of rows reported for this round.
     * NOTE(review): this assumes the caller does not also add rec.rows to rec.total
     * for this path -- confirm.
     */
    pQuery->rec.total += pQuery->rec.rows;

    if (pQuery->rec.rows == 0) {
      //      vnodePrintQueryStatistics(pSupporter);
    }

    dTrace("QInfo:%p current:%" PRId64 ", total:%" PRId64, pQInfo, (int64_t)pQuery->rec.rows,
           (int64_t)pQuery->rec.total);
    return;
  }

  dTrace("QInfo:%p query start, qrange:%" PRId64 "-%" PRId64 ", order:%d, forward scan start", pQInfo, pQuery->window.skey,
         pQuery->window.ekey, pQuery->order.order);

  // create the query support structures
  createTableDataInfo(pQInfo);

  // do check all qualified data blocks
  int64_t el = queryOnDataBlocks(pQInfo);
  dTrace("QInfo:%p forward scan completed, elapsed time: %" PRId64 "ms, reversed scan start, order:%d", pQInfo, el,
         (int32_t)(pQuery->order.order ^ 1u));

  // query error occurred or query is killed, abort current execution
  if (pQInfo->code != TSDB_CODE_SUCCESS || isQueryKilled(pQInfo)) {
    dTrace("QInfo:%p query killed or error occurred, code:%d, abort", pQInfo, pQInfo->code);
    return;
  }

  // close all time window results
  doCloseAllTimeWindowAfterScan(pQInfo);

  if (needReverseScan(pQuery)) {
    doSaveContext(pQInfo);

    el = queryOnDataBlocks(pQInfo);
    dTrace("QInfo:%p reversed scan completed, elapsed time: %" PRId64 "ms", pQInfo, el);

    doRestoreContext(pQInfo);
  } else {
    dTrace("QInfo:%p no need to do reversed scan, query completed", pQInfo);
  }

  setQueryStatus(pQuery, QUERY_COMPLETED);

  // the reverse scan may have failed or the query may have been killed in the meantime
  if (pQInfo->code != TSDB_CODE_SUCCESS || isQueryKilled(pQInfo)) {
    dTrace("QInfo:%p query killed or error occurred, code:%d, abort", pQInfo, pQInfo->code);
    return;
  }

  if (isIntervalQuery(pQuery) || isSumAvgRateQuery(pQuery)) {
//    assert(pSupporter->subgroupIdx == 0 && pSupporter->numOfGroupResultPages == 0);

    if (mergeIntoGroupResult(pQInfo) == TSDB_CODE_SUCCESS) {
      copyResToQueryResultBuf(pQInfo, pQuery);

#ifdef _DEBUG_VIEW
      displayInterResult(pQuery->sdata, pQuery, pQuery->sdata[0]->len);
#endif
    }
  } else {  // not a interval query
    copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
  }

  // handle the limitation of output buffer
  dTrace("QInfo:%p points returned:%" PRId64 ", total:%" PRId64, pQInfo, (int64_t)pQuery->rec.rows,
         (int64_t)(pQuery->rec.total + pQuery->rec.rows));
}

/*
 * in each query, this function will be called only once, no retry for further result.
 *
 * select count(*)/top(field,k)/avg(field name) from table_name [where ts>now-1a];
 * select count(*) from table_name group by status_column;
 */
H
hjxilinx 已提交
4898
static void tableFixedOutputProcess(SQInfo *pQInfo) {
4899 4900
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;
4901

H
hjxilinx 已提交
4902
  scanAllDataBlocks(pRuntimeEnv);
H
hjxilinx 已提交
4903
  finalizeQueryResult(pRuntimeEnv);
4904

4905
  if (isQueryKilled(pQInfo)) {
4906 4907
    return;
  }
4908

4909
  // since the numOfOutputElems must be identical for all sql functions that are allowed to be executed simutanelously.
4910
  pQuery->rec.rows = getNumOfResult(pRuntimeEnv);
4911

4912 4913 4914 4915
  // must be top/bottom query if offset > 0
  if (pQuery->limit.offset > 0) {
    assert(isTopBottomQuery(pQuery));
  }
4916

4917 4918 4919 4920
  doSkipResults(pRuntimeEnv);
  doRevisedResultsByLimit(pQInfo);
}

4921
/*
 * Process a single-table query that may produce an arbitrary number of rows
 * per scan. The scan is repeated until at least one row is produced or the
 * query status becomes QUERY_COMPLETED; the limit is applied afterwards.
 *
 * Returns early, without applying the limit, when the query has been killed.
 */
static void tableMultiOutputProcess(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // for ts_comp query, re-initialized is not allowed
  if (!isTSCompQuery(pQuery)) {
    resetCtxOutputBuf(pRuntimeEnv);
  }

  while (1) {
    scanAllDataBlocks(pRuntimeEnv);
    finalizeQueryResult(pRuntimeEnv);

    if (isQueryKilled(pQInfo)) {
      return;
    }

    pQuery->rec.rows = getNumOfResult(pRuntimeEnv);
    if (pQuery->limit.offset > 0 && pQuery->numOfFilterCols > 0 && pQuery->rec.rows > 0) {
      doSkipResults(pRuntimeEnv);
    }

    /*
     * 1. if no rows were produced, limit.offset may still be >= 0 and more data must be checked
     * 2. if rows were produced, or the query is completed, stop scanning
     */
    if (pQuery->rec.rows > 0 || Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
      break;
    }

    // The previous format string requested vnode/session/table-id values that
    // were never passed; the conversions now match the arguments exactly.
    dTrace("QInfo:%p skip current result, offset:%" PRId64 ", next qrange:%" PRId64 "-%" PRId64,
           pQInfo, (int64_t)pQuery->limit.offset, (int64_t)pQuery->lastKey, (int64_t)pQuery->window.ekey);

    resetCtxOutputBuf(pRuntimeEnv);
  }

  doRevisedResultsByLimit(pQInfo);
  if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL)) {
    dTrace("QInfo:%p query paused due to output limitation, next qrange:%" PRId64 "-%" PRId64,
        pQInfo, pQuery->lastKey, pQuery->window.ekey);
  }

  if (!isTSCompQuery(pQuery)) {
    assert(pQuery->rec.rows <= pQuery->rec.capacity);
  }
}

H
hjxilinx 已提交
4971
/*
 * Scan data blocks repeatedly until the query status reaches
 * QUERY_COMPLETED or QUERY_RESBUF_FULL, finalizing results after each scan.
 *
 * When no interpolation is requested and either column filters or a
 * timestamp buffer are present, closed time windows are discarded up front
 * to consume the pending limit.offset.
 *
 * Returns immediately, without finalizing, once the query has been killed.
 */
static void tableIntervalProcessImpl(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  do {
    scanAllDataBlocks(pRuntimeEnv);

    if (isQueryKilled(GET_QINFO_ADDR(pRuntimeEnv))) {
      return;
    }

    assert(!Q_STATUS_EQUAL(pQuery->status, QUERY_NOT_COMPLETED));
    finalizeQueryResult(pRuntimeEnv);

    // here we can ignore the records in case of no interpolation
    // todo handle offset, in case of top/bottom interval query
    bool needsSkip = (pQuery->numOfFilterCols > 0) || (pRuntimeEnv->pTSBuf != NULL);
    if (needsSkip && pQuery->limit.offset > 0 && pQuery->interpoType == TSDB_INTERPO_NONE) {
      int32_t numOfClosed = numOfClosedTimeWindow(&pRuntimeEnv->windowResInfo);

      // never drop more windows than the remaining offset asks for
      int32_t dropped = MIN(numOfClosed, pQuery->limit.offset);
      clearFirstNTimeWindow(pRuntimeEnv, dropped);
      pQuery->limit.offset -= dropped;
    }
  } while (!Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED|QUERY_RESBUF_FULL));
}

5002
// handle time interval query on table
H
hjxilinx 已提交
5003
static void tableIntervalProcess(SQInfo *pQInfo) {
5004 5005 5006
  SQueryRuntimeEnv *pRuntimeEnv = &(pQInfo->runtimeEnv);
  SQuery *          pQuery = pRuntimeEnv->pQuery;

5007
  int32_t numOfInterpo = 0;
5008

5009
  while (1) {
H
hjxilinx 已提交
5010
    tableIntervalProcessImpl(pRuntimeEnv);
5011

H
hjxilinx 已提交
5012
    if (isIntervalQuery(pQuery)) {
5013
      pQInfo->subgroupIdx = 0;  // always start from 0
5014
      pQuery->rec.rows = 0;
5015
      copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
5016

5017 5018
      clearFirstNTimeWindow(pRuntimeEnv, pQInfo->subgroupIdx);
    }
5019

5020 5021 5022 5023 5024
    // the offset is handled at prepare stage if no interpolation involved
    if (pQuery->interpoType == TSDB_INTERPO_NONE) {
      doRevisedResultsByLimit(pQInfo);
      break;
    } else {
5025
      taosInterpoSetStartInfo(&pRuntimeEnv->interpoInfo, pQuery->rec.rows, pQuery->interpoType);
5026
      SData **pInterpoBuf = pRuntimeEnv->pInterpoBuf;
5027

5028
      for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
5029
        memcpy(pInterpoBuf[i]->data, pQuery->sdata[i]->data, pQuery->rec.rows * pQuery->pSelectExpr[i].resBytes);
5030
      }
5031

5032
      numOfInterpo = 0;
5033 5034
      pQuery->rec.rows = vnodeQueryResultInterpolate(
          pQInfo, (tFilePage **)pQuery->sdata, (tFilePage **)pInterpoBuf, pQuery->rec.rows, &numOfInterpo);
5035

5036 5037
      dTrace("QInfo: %p interpo completed, final:%d", pQInfo, pQuery->rec.rows);
      if (pQuery->rec.rows > 0 || Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
5038 5039 5040
        doRevisedResultsByLimit(pQInfo);
        break;
      }
5041

5042
      // no result generated yet, continue retrieve data
5043
      pQuery->rec.rows = 0;
5044 5045
    }
  }
5046

5047 5048 5049
  // all data scanned, the group by normal column can return
  if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {  // todo refactor with merge interval time result
    pQInfo->subgroupIdx = 0;
5050
    pQuery->rec.rows = 0;
5051 5052 5053
    copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);
    clearFirstNTimeWindow(pRuntimeEnv, pQInfo->subgroupIdx);
  }
5054

5055 5056 5057
  pQInfo->pointsInterpo += numOfInterpo;
}

5058
/*
 * Entry point for executing one retrieval round of a single-table query.
 *
 * Three cases are handled, in order:
 *  1. rows left over from a previous interpolation are emitted;
 *  2. the scan already completed: remaining window/group results are copied
 *     out, or the query is reported as finished;
 *  3. otherwise the query is dispatched to the interval, fixed-output or
 *     multi-output processor.
 *
 * pQInfo->dataReady is posted exactly once on every path.
 */
static void tableQueryImpl(SQInfo* pQInfo) {
  SQueryRuntimeEnv* pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery* pQuery = pRuntimeEnv->pQuery;

  if (vnodeHasRemainResults(pQInfo)) {
    /*
     * There are remain results that are not returned due to result interpolation
     * So, we do keep in this procedure instead of launching retrieve procedure for next results.
     */
    int32_t numOfInterpo = 0;
    int32_t remain = taosNumOfRemainPoints(&pRuntimeEnv->interpoInfo);
    pQuery->rec.rows = vnodeQueryResultInterpolate(pQInfo, (tFilePage **)pQuery->sdata,
                                                   (tFilePage **)pRuntimeEnv->pInterpoBuf, remain, &numOfInterpo);

    doRevisedResultsByLimit(pQInfo);

    pQInfo->pointsInterpo += numOfInterpo;
    dTrace("QInfo:%p current:%" PRId64 " returned, total:%" PRId64, pQInfo, (int64_t)pQuery->rec.rows,
           (int64_t)pQuery->rec.total);
    sem_post(&pQInfo->dataReady);
    return;
  }

  // here we have scan all qualified data in both data file and cache
  if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
    // continue to get push data from the group result
    if (isGroupbyNormalCol(pQuery->pGroupbyExpr) ||
          ((isIntervalQuery(pQuery) && pQuery->rec.total < pQuery->limit.limit))) {

      // todo limit the output for interval query?
      pQuery->rec.rows = 0;
      pQInfo->subgroupIdx = 0;  // always start from 0

      if (pRuntimeEnv->windowResInfo.size > 0) {
        // rec.rows is used as-is after this call; the former
        // "rec.rows += rec.rows" doubled the reported row count.
        copyFromWindowResToSData(pQInfo, pRuntimeEnv->windowResInfo.pResult);

        clearFirstNTimeWindow(pRuntimeEnv, pQInfo->subgroupIdx);

        if (pQuery->rec.rows > 0) {
          dTrace("QInfo:%p %" PRId64 " rows returned from group results, total:%" PRId64, pQInfo,
                 (int64_t)pQuery->rec.rows, (int64_t)pQuery->rec.total);
          sem_post(&pQInfo->dataReady);
          return;
        }
      }
    }

    dTrace("QInfo:%p query over, %" PRId64 " rows are returned", pQInfo, (int64_t)pQuery->rec.total);
    //    vnodePrintQueryStatistics(pSupporter);
    sem_post(&pQInfo->dataReady);
    return;
  }

  // number of points returned during this query
  pQuery->rec.rows = 0;
  int64_t st = taosGetTimestampUs();

  // group by normal column, sliding window query, interval query are handled by interval query processor
  if (isIntervalQuery(pQuery) || isGroupbyNormalCol(pQuery->pGroupbyExpr)) {  // interval (down sampling operation)
    tableIntervalProcess(pQInfo);
  } else if (isFixedOutputQuery(pQuery)) {
    tableFixedOutputProcess(pQInfo);
  } else {  // diff/add/multiply/subtract/division
    assert(pQuery->checkBuffer == 1);
    tableMultiOutputProcess(pQInfo);
  }

  // record the total elapsed time
  pQInfo->elapsedTime += (taosGetTimestampUs() - st);
  assert(pQInfo->groupInfo.numOfTables == 1);

  /* check if query is killed or not */
  if (isQueryKilled(pQInfo)) {
    dTrace("QInfo:%p query is killed", pQInfo);
  }

  sem_post(&pQInfo->dataReady);
}

5140
/*
 * Entry point for executing one retrieval round of a super-table query.
 *
 * Interval queries, and fixed-output queries that are neither point
 * interpolation nor group-by-normal-column, go through
 * multiTableQueryProcess(); everything else is processed table by table via
 * sequentialTableProcess(). The elapsed time is accumulated in
 * pQInfo->elapsedTime and pQInfo->dataReady is posted when done.
 */
static void stableQueryImpl(SQInfo* pQInfo) {
  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;
  pQuery->rec.rows = 0;

  int64_t st = taosGetTimestampUs();

  if (isIntervalQuery(pQuery) ||
      (isFixedOutputQuery(pQuery) && (!isPointInterpoQuery(pQuery)) && !isGroupbyNormalCol(pQuery->pGroupbyExpr))) {
    multiTableQueryProcess(pQInfo);
  } else {
    assert((pQuery->checkBuffer == 1 && pQuery->intervalTime == 0) || isPointInterpoQuery(pQuery) ||
        isGroupbyNormalCol(pQuery->pGroupbyExpr));

    sequentialTableProcess(pQInfo);
  }

  // record the total elapsed time
  pQInfo->elapsedTime += (taosGetTimestampUs() - st);

  if (pQuery->rec.rows == 0) {
    // casts keep the conversions valid regardless of the counters' widths
    dTrace("QInfo:%p over, %" PRId64 " tables queried, %" PRId64 " points are returned", pQInfo,
           (int64_t)pQInfo->groupInfo.numOfTables, (int64_t)pQuery->rec.total);
//    vnodePrintQueryStatistics(pSupporter);
  }

  sem_post(&pQInfo->dataReady);
}

H
hjxilinx 已提交
5168
/*
 * Find the position in pQueryMsg->colList of the column referenced by
 * pExprMsg (matched by column id).
 *
 * Returns the zero-based index of the first match. When no column matches,
 * the returned value is not below pQueryMsg->numOfCols, which callers use as
 * the "not found" indication.
 */
static int32_t getColumnIndexInSource(SQueryTableMsg *pQueryMsg, SSqlFuncExprMsg *pExprMsg) {
  int32_t idx = 0;

  for (; idx < pQueryMsg->numOfCols; ++idx) {
    if (pQueryMsg->colList[idx].colId == pExprMsg->colInfo.colId) {
      break;
    }
  }

  return idx;
}

H
hjxilinx 已提交
5182 5183 5184
/*
 * Check that the column referenced by pExprMsg is present in the column list
 * of pQueryMsg. Returns true when it is found.
 */
bool vnodeValidateExprColumnInfo(SQueryTableMsg *pQueryMsg, SSqlFuncExprMsg *pExprMsg) {
  return getColumnIndexInSource(pQueryMsg, pExprMsg) < pQueryMsg->numOfCols;
}

H
hjxilinx 已提交
5187
/*
 * Sanity-check the numeric header fields of a query message that has
 * already been converted to host byte order.
 *
 * The checks run in a fixed order and only the first failure is logged.
 * Returns 0 when every field is within range, -1 otherwise.
 */
static int32_t validateQueryMsg(SQueryTableMsg *pQueryMsg) {
  int32_t status = -1;  // cleared only when every check passes

  if (pQueryMsg->intervalTime < 0) {
    dError("qmsg:%p illegal value of aggTimeInterval %" PRId64 "", pQueryMsg, pQueryMsg->intervalTime);
  } else if (pQueryMsg->numOfCols <= 0 || pQueryMsg->numOfCols > TSDB_MAX_COLUMNS) {
    dError("qmsg:%p illegal value of numOfCols %d", pQueryMsg, pQueryMsg->numOfCols);
  } else if (pQueryMsg->numOfTables <= 0) {
    dError("qmsg:%p illegal value of numOfTables %d", pQueryMsg, pQueryMsg->numOfTables);
  } else if (pQueryMsg->numOfGroupCols < 0) {
    dError("qmsg:%p illegal value of numOfGroupbyCols %d", pQueryMsg, pQueryMsg->numOfGroupCols);
  } else if (pQueryMsg->numOfOutputCols <= 0 || pQueryMsg->numOfOutputCols > TSDB_MAX_COLUMNS) {
    dError("qmsg:%p illegal value of output columns %d", pQueryMsg, pQueryMsg->numOfOutputCols);
  } else {
    status = 0;
  }

  return status;
}

H
hjxilinx 已提交
5216 5217
/*
 * Build the list of queried tables from the STableIdInfo records stored
 * back-to-back at pMsg.
 *
 * Each record is converted to host byte order in place and appended to a
 * newly created array, returned through pTableIdList, as an STableId.
 *
 * Every element is now built explicitly as an STableId. Previously only the
 * first record was; the remaining records were pushed as raw STableIdInfo
 * pointers into an array whose element size is sizeof(STableId).
 *
 * @param pQueryMsg     query message; numOfTables must be > 0
 * @param pMsg          start of the first STableIdInfo record
 * @param pTableIdList  receives the newly created array of STableId
 * @return pointer to the first byte after the last record
 */
static char* createTableIdList(SQueryTableMsg* pQueryMsg, char* pMsg, SArray** pTableIdList) {
  assert(pQueryMsg->numOfTables > 0);

  *pTableIdList = taosArrayInit(pQueryMsg->numOfTables, sizeof(STableId));

  for (int32_t j = 0; j < pQueryMsg->numOfTables; ++j) {
    STableIdInfo *pTableIdInfo = (STableIdInfo *)pMsg;

    pTableIdInfo->sid = htonl(pTableIdInfo->sid);
    pTableIdInfo->uid = htobe64(pTableIdInfo->uid);
    pTableIdInfo->key = htobe64(pTableIdInfo->key);

    STableId id = {.uid = pTableIdInfo->uid, .tid = pTableIdInfo->sid};
    taosArrayPush(*pTableIdList, &id);

    pMsg += sizeof(STableIdInfo);
  }

  return pMsg;
}
5244

5245
/**
H
hjxilinx 已提交
5246
 * pQueryMsg->head has been converted before this function is called.
5247
 *
H
hjxilinx 已提交
5248
 * @param pQueryMsg
5249 5250 5251 5252
 * @param pTableIdList
 * @param pExpr
 * @return
 */
H
hjxilinx 已提交
5253
static int32_t convertQueryMsg(SQueryTableMsg *pQueryMsg, SArray **pTableIdList, SSqlFuncExprMsg ***pExpr,
5254
      char** tagCond, SColIndex** groupbyCols) {
H
hjxilinx 已提交
5255 5256 5257 5258 5259 5260 5261 5262
  pQueryMsg->numOfTables   = htonl(pQueryMsg->numOfTables);

  pQueryMsg->window.skey   = htobe64(pQueryMsg->window.skey);
  pQueryMsg->window.ekey   = htobe64(pQueryMsg->window.ekey);
  pQueryMsg->intervalTime  = htobe64(pQueryMsg->intervalTime);
  pQueryMsg->slidingTime   = htobe64(pQueryMsg->slidingTime);
  pQueryMsg->limit         = htobe64(pQueryMsg->limit);
  pQueryMsg->offset        = htobe64(pQueryMsg->offset);
H
hjxilinx 已提交
5263
  
H
hjxilinx 已提交
5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275
  pQueryMsg->order         = htons(pQueryMsg->order);
  pQueryMsg->orderColId    = htons(pQueryMsg->orderColId);
  pQueryMsg->queryType     = htons(pQueryMsg->queryType);

  pQueryMsg->numOfCols     = htons(pQueryMsg->numOfCols);
  pQueryMsg->numOfOutputCols = htons(pQueryMsg->numOfOutputCols);
  pQueryMsg->numOfGroupCols = htons(pQueryMsg->numOfGroupCols);
  pQueryMsg->tagCondLen    = htons(pQueryMsg->tagCondLen);
  pQueryMsg->tsOffset      = htonl(pQueryMsg->tsOffset);
  pQueryMsg->tsLen         = htonl(pQueryMsg->tsLen);
  pQueryMsg->tsNumOfBlocks = htonl(pQueryMsg->tsNumOfBlocks);
  pQueryMsg->tsOrder       = htonl(pQueryMsg->tsOrder);
5276

5277
  // query msg safety check
H
hjxilinx 已提交
5278
  if (validateQueryMsg(pQueryMsg) != 0) {
5279 5280 5281
    return TSDB_CODE_INVALID_QUERY_MSG;
  }

H
hjxilinx 已提交
5282
  char *pMsg = (char *)(pQueryMsg->colList) + sizeof(SColumnInfo) * pQueryMsg->numOfCols;
5283

H
hjxilinx 已提交
5284 5285
  for (int32_t col = 0; col < pQueryMsg->numOfCols; ++col) {
    SColumnInfo* pColInfo = &pQueryMsg->colList[col];
H
hjxilinx 已提交
5286 5287 5288 5289 5290
    
    pColInfo->colId = htons(pColInfo->colId);
    pColInfo->type  = htons(pColInfo->type);
    pColInfo->bytes = htons(pColInfo->bytes);
    pColInfo->numOfFilters = htons(pColInfo->numOfFilters);
5291

H
hjxilinx 已提交
5292
    assert(pColInfo->type >= TSDB_DATA_TYPE_BOOL && pColInfo->type <= TSDB_DATA_TYPE_NCHAR);
5293

H
hjxilinx 已提交
5294
    int32_t numOfFilters = pColInfo->numOfFilters;
5295
    if (numOfFilters > 0) {
H
hjxilinx 已提交
5296
      pColInfo->filters = calloc(numOfFilters, sizeof(SColumnFilterInfo));
5297 5298 5299 5300
    }

    for (int32_t f = 0; f < numOfFilters; ++f) {
      SColumnFilterInfo *pFilterInfo = (SColumnFilterInfo *)pMsg;
H
hjxilinx 已提交
5301
      SColumnFilterInfo *pDestFilterInfo = &pColInfo->filters[f];
5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324

      pDestFilterInfo->filterOnBinary = htons(pFilterInfo->filterOnBinary);

      pMsg += sizeof(SColumnFilterInfo);

      if (pDestFilterInfo->filterOnBinary) {
        pDestFilterInfo->len = htobe64(pFilterInfo->len);

        pDestFilterInfo->pz = (int64_t)calloc(1, pDestFilterInfo->len + 1);
        memcpy((void *)pDestFilterInfo->pz, pMsg, pDestFilterInfo->len + 1);
        pMsg += (pDestFilterInfo->len + 1);
      } else {
        pDestFilterInfo->lowerBndi = htobe64(pFilterInfo->lowerBndi);
        pDestFilterInfo->upperBndi = htobe64(pFilterInfo->upperBndi);
      }

      pDestFilterInfo->lowerRelOptr = htons(pFilterInfo->lowerRelOptr);
      pDestFilterInfo->upperRelOptr = htons(pFilterInfo->upperRelOptr);
    }
  }

  bool hasArithmeticFunction = false;

H
hjxilinx 已提交
5325
  *pExpr = calloc(pQueryMsg->numOfOutputCols, POINTER_BYTES);
5326 5327
  SSqlFuncExprMsg *pExprMsg = (SSqlFuncExprMsg *)pMsg;

H
hjxilinx 已提交
5328
  for (int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
5329
    (*pExpr)[i] = pExprMsg;
5330

5331
    pExprMsg->colInfo.colIndex = htons(pExprMsg->colInfo.colIndex);
5332 5333 5334 5335
    pExprMsg->colInfo.colId  = htons(pExprMsg->colInfo.colId);
    pExprMsg->colInfo.flag   = htons(pExprMsg->colInfo.flag);
    pExprMsg->functionId     = htons(pExprMsg->functionId);
    pExprMsg->numOfParams    = htons(pExprMsg->numOfParams);
5336 5337 5338 5339

    pMsg += sizeof(SSqlFuncExprMsg);

    for (int32_t j = 0; j < pExprMsg->numOfParams; ++j) {
5340
      pExprMsg->arg[j].argType  = htons(pExprMsg->arg[j].argType);
5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358
      pExprMsg->arg[j].argBytes = htons(pExprMsg->arg[j].argBytes);

      if (pExprMsg->arg[j].argType == TSDB_DATA_TYPE_BINARY) {
        pExprMsg->arg[j].argValue.pz = pMsg;
        pMsg += pExprMsg->arg[j].argBytes + 1;  // one more for the string terminated char.
      } else {
        pExprMsg->arg[j].argValue.i64 = htobe64(pExprMsg->arg[j].argValue.i64);
      }
    }

    if (pExprMsg->functionId == TSDB_FUNC_ARITHM) {
      hasArithmeticFunction = true;
    } else if (pExprMsg->functionId == TSDB_FUNC_TAG || pExprMsg->functionId == TSDB_FUNC_TAGPRJ ||
               pExprMsg->functionId == TSDB_FUNC_TAG_DUMMY) {
      if (pExprMsg->colInfo.flag != TSDB_COL_TAG) {  // ignore the column  index check for arithmetic expression.
        return TSDB_CODE_INVALID_QUERY_MSG;
      }
    } else {
H
hjxilinx 已提交
5359
      if (!vnodeValidateExprColumnInfo(pQueryMsg, pExprMsg)) {
5360 5361 5362 5363 5364 5365 5366
        return TSDB_CODE_INVALID_QUERY_MSG;
      }
    }

    pExprMsg = (SSqlFuncExprMsg *)pMsg;
  }

H
hjxilinx 已提交
5367
  pQueryMsg->colNameLen = htonl(pQueryMsg->colNameLen);
5368
  if (hasArithmeticFunction) {  // column name array
H
hjxilinx 已提交
5369 5370 5371
    assert(pQueryMsg->colNameLen > 0);
    pQueryMsg->colNameList = (int64_t)pMsg;
    pMsg += pQueryMsg->colNameLen;
5372
  }
H
hjxilinx 已提交
5373
  
H
hjxilinx 已提交
5374
  pMsg = createTableIdList(pQueryMsg, pMsg, pTableIdList);
5375

H
hjxilinx 已提交
5376
  if (pQueryMsg->numOfGroupCols > 0) {  // group by tag columns
5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392
    *groupbyCols = malloc(pQueryMsg->numOfGroupCols*sizeof(SColIndex));
    
    for(int32_t i = 0; i < pQueryMsg->numOfGroupCols; ++i) {
      (*groupbyCols)[i].colId = *(int16_t*) pMsg;
      pMsg += sizeof((*groupbyCols)[i].colId);
      
      (*groupbyCols)[i].colIndex = *(int16_t*) pMsg;
      pMsg += sizeof((*groupbyCols)[i].colIndex);

      (*groupbyCols)[i].flag = *(int16_t*) pMsg;
      pMsg += sizeof((*groupbyCols)[i].flag);

      memcpy((*groupbyCols)[i].name, pMsg, tListLen(groupbyCols[i]->name));
      pMsg += tListLen((*groupbyCols)[i].name);
    }
    
H
hjxilinx 已提交
5393 5394
    pQueryMsg->orderByIdx = htons(pQueryMsg->orderByIdx);
    pQueryMsg->orderType = htons(pQueryMsg->orderType);
5395 5396
  }

H
hjxilinx 已提交
5397 5398 5399
  pQueryMsg->interpoType = htons(pQueryMsg->interpoType);
  if (pQueryMsg->interpoType != TSDB_INTERPO_NONE) {
    pQueryMsg->defaultVal = (uint64_t)(pMsg);
5400 5401

    int64_t *v = (int64_t *)pMsg;
H
hjxilinx 已提交
5402
    for (int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
5403 5404
      v[i] = htobe64(v[i]);
    }
5405
    
H
hjxilinx 已提交
5406
    pMsg += sizeof(int64_t) * pQueryMsg->numOfOutputCols;
5407 5408 5409
  }
  
  // the tag query condition expression string is located at the end of query msg
H
hjxilinx 已提交
5410
  if (pQueryMsg->tagCondLen > 0) {
5411 5412
    *tagCond = calloc(1, pQueryMsg->tagCondLen);
    memcpy(*tagCond, pMsg, pQueryMsg->tagCondLen);
H
hjxilinx 已提交
5413
  }
5414 5415 5416
  
  dTrace("qmsg:%p query on %d table(s), qrange:%" PRId64 "-%" PRId64 ", numOfGroupbyTagCols:%d, ts order:%d, "
         "outputCols:%d, numOfCols:%d, interval:%d" PRId64 ", fillType:%d, comptsLen:%d, limit:%" PRId64 ", offset:%" PRId64,
H
hjxilinx 已提交
5417
         pQueryMsg, pQueryMsg->numOfTables, pQueryMsg->window.skey, pQueryMsg->window.ekey,
5418
         pQueryMsg->numOfGroupCols, pQueryMsg->order, pQueryMsg->numOfOutputCols,
H
hjxilinx 已提交
5419 5420
         pQueryMsg->numOfCols, pQueryMsg->intervalTime, pQueryMsg->interpoType, pQueryMsg->tsLen,
         pQueryMsg->limit, pQueryMsg->offset);
5421 5422 5423 5424 5425

  return 0;
}

/*
 * Build the arithmetic expression tree for an output column.
 *
 * The implementation is currently compiled out (#if 0), so this function
 * does nothing with its arguments and always returns TSDB_CODE_SUCCESS.
 */
static int32_t buildAirthmeticExprFromMsg(SSqlFunctionExpr *pExpr, SQueryTableMsg *pQueryMsg) {
  // both parameters are only referenced by the disabled code below
  (void)pExpr;
  (void)pQueryMsg;

//  SSqlBinaryExprInfo *pBinaryExprInfo = &pExpr->binExprInfo;
//  SColumnInfo *       pColMsg = pQueryMsg->colList;
#if 0
  tExprNode* pBinExpr = NULL;
  SSchema*        pSchema = toSchema(pQueryMsg, pColMsg, pQueryMsg->numOfCols);
  
  dTrace("qmsg:%p create binary expr from string:%s", pQueryMsg, pExpr->pBase.arg[0].argValue.pz);
  tSQLBinaryExprFromString(&pBinExpr, pSchema, pQueryMsg->numOfCols, pExpr->pBase.arg[0].argValue.pz,
                           pExpr->pBase.arg[0].argBytes);
  
  if (pBinExpr == NULL) {
    dError("qmsg:%p failed to create arithmetic expression string from:%s", pQueryMsg, pExpr->pBase.arg[0].argValue.pz);
    return TSDB_CODE_APP_ERROR;
  }
  
  pBinaryExprInfo->pBinExpr = pBinExpr;
  
  int32_t num = 0;
  int16_t ids[TSDB_MAX_COLUMNS] = {0};
  
  tSQLBinaryExprTrv(pBinExpr, &num, ids);
  qsort(ids, num, sizeof(int16_t), id_compar);
  
  int32_t i = 0, j = 0;
  
  while (i < num && j < num) {
    if (ids[i] == ids[j]) {
      j++;
    } else {
      ids[++i] = ids[j++];
    }
  }
  assert(i <= num);
  
  // there may be duplicated referenced columns.
  num = i + 1;
  pBinaryExprInfo->pReqColumns = malloc(sizeof(SColIndex) * num);
  
  for (int32_t k = 0; k < num; ++k) {
    SColIndex* pColIndex = &pBinaryExprInfo->pReqColumns[k];
    pColIndex->colId = ids[k];
  }
  
  pBinaryExprInfo->numOfCols = num;
  free(pSchema);
#endif

  return TSDB_CODE_SUCCESS;
}

5476
/*
 * Create the array of output expressions described by a query message.
 *
 * First pass: copy every expression record, determine its input type and
 * width (double for arithmetic expressions, the source column's otherwise)
 * and compute the result type and sizes. The result widths of tag/timestamp
 * dummy columns are summed along the way.
 *
 * Second pass: top/bottom expressions get their result sizes recomputed with
 * that summed tag length.
 *
 * @param pQueryMsg     converted query message
 * @param pSqlFuncExpr  receives the new array on success, NULL on failure
 * @param pExprMsg      array of expression record pointers; the array itself
 *                      is released here on success only
 * @return TSDB_CODE_SUCCESS, TSDB_CODE_SERV_OUT_OF_MEMORY,
 *         TSDB_CODE_INVALID_QUERY_MSG, or the code returned by
 *         buildAirthmeticExprFromMsg()
 */
static int32_t createSqlFunctionExprFromMsg(SQueryTableMsg *pQueryMsg, SSqlFunctionExpr **pSqlFuncExpr, SSqlFuncExprMsg** pExprMsg) {
  *pSqlFuncExpr = NULL;

  SSqlFunctionExpr *pExprs = (SSqlFunctionExpr *)calloc(1, sizeof(SSqlFunctionExpr) * pQueryMsg->numOfOutputCols);
  if (pExprs == NULL) {
    return TSDB_CODE_SERV_OUT_OF_MEMORY;
  }

  bool    isSuperTable = QUERY_IS_STABLE_QUERY(pQueryMsg->queryType);
  int16_t tagLen = 0;

  for (int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
    SSqlFunctionExpr *pItem = &pExprs[i];

    pItem->pBase = *pExprMsg[i];
    pItem->resBytes = 0;

    int16_t type = 0;
    int16_t bytes = 0;

    if (pItem->pBase.functionId != TSDB_FUNC_ARITHM) {
      // plain column: input type and width come from the source column list
      int32_t j = getColumnIndexInSource(pQueryMsg, &pItem->pBase);
      assert(j < pQueryMsg->numOfCols);

      SColumnInfo *pCol = &pQueryMsg->colList[j];
      type = pCol->type;
      bytes = pCol->bytes;
    } else {
      // arithmetic expression: always evaluated as double
      int32_t code = buildAirthmeticExprFromMsg(pItem, pQueryMsg);
      if (code != TSDB_CODE_SUCCESS) {
        tfree(pExprs);
        return code;
      }

      type = TSDB_DATA_TYPE_DOUBLE;
      bytes = tDataTypeDesc[type].nSize;
    }

    int32_t param = pItem->pBase.arg[0].argValue.i64;
    int32_t ret = getResultDataInfo(type, bytes, pItem->pBase.functionId, param, &pItem->resType, &pItem->resBytes,
                                    &pItem->interResBytes, 0, isSuperTable);
    if (ret != TSDB_CODE_SUCCESS) {
      tfree(pExprs);
      return TSDB_CODE_INVALID_QUERY_MSG;
    }

    int16_t functionId = pItem->pBase.functionId;
    if (functionId == TSDB_FUNC_TAG_DUMMY || functionId == TSDB_FUNC_TS_DUMMY) {
      tagLen += pItem->resBytes;
    }

    assert(isValidDataType(pItem->resType, pItem->resBytes));
  }

  // get the correct result size for top/bottom query, according to the number of tags columns in selection clause
  // TODO refactor
  for (int32_t i = 0; i < pQueryMsg->numOfOutputCols; ++i) {
    SSqlFunctionExpr *pItem = &pExprs[i];
    pItem->pBase = *pExprMsg[i];

    int16_t functId = pItem->pBase.functionId;
    if (functId != TSDB_FUNC_TOP && functId != TSDB_FUNC_BOTTOM) {
      continue;
    }

    int32_t j = getColumnIndexInSource(pQueryMsg, &pItem->pBase);
    assert(j < pQueryMsg->numOfCols);

    SColumnInfo *pCol = &pQueryMsg->colList[j];
    int16_t      type = pCol->type;
    int16_t      bytes = pCol->bytes;

    int32_t ret =
        getResultDataInfo(type, bytes, pItem->pBase.functionId, pItem->pBase.arg[0].argValue.i64,
                          &pItem->resType, &pItem->resBytes, &pItem->interResBytes, tagLen, isSuperTable);
    assert(ret == TSDB_CODE_SUCCESS);
  }

  tfree(pExprMsg);
  *pSqlFuncExpr = pExprs;

  return TSDB_CODE_SUCCESS;
}

5555
/*
 * Create the group-by descriptor for a query.
 *
 * @param pQueryMsg  converted query message
 * @param pColIndex  group-by column array; stored in the descriptor as-is,
 *                   not copied
 * @param code       set to TSDB_CODE_SERV_OUT_OF_MEMORY when the allocation
 *                   fails; left untouched otherwise
 * @return the new descriptor, or NULL when the query has no group-by
 *         columns or the allocation failed
 */
static SSqlGroupbyExpr *createGroupbyExprFromMsg(SQueryTableMsg *pQueryMsg, SColIndex* pColIndex, int32_t *code) {
  SSqlGroupbyExpr *pResult = NULL;

  if (pQueryMsg->numOfGroupCols != 0) {
    // using group by tag columns
    pResult = (SSqlGroupbyExpr *)calloc(1, sizeof(SSqlGroupbyExpr));

    if (pResult != NULL) {
      pResult->numOfGroupCols = pQueryMsg->numOfGroupCols;
      pResult->orderType = pQueryMsg->orderType;
      pResult->orderIndex = pQueryMsg->orderByIdx;
      pResult->columnInfo = pColIndex;
    } else {
      *code = TSDB_CODE_SERV_OUT_OF_MEMORY;
    }
  }

  return pResult;
}

H
hjxilinx 已提交
5575
/*
 * Build pQuery->pFilterInfo: one SSingleColumnFilterInfo per column that
 * carries at least one filter, each with its array of filter elements and
 * the callback selected from the lower/upper relational operators.
 *
 * pQuery->numOfFilterCols is incremented for every filtered column found.
 *
 * NOTE(review): both filter function table lookups are currently stubbed to
 * NULL, so any query that actually has a filter ends in
 * TSDB_CODE_INVALID_QUERY_MSG.
 *
 * @param pQInfo  only used as an identifier in log messages
 * @param pQuery  query whose column list is inspected and whose filter info
 *                is populated
 * @return TSDB_CODE_SUCCESS, TSDB_CODE_INVALID_QUERY_MSG or
 *         TSDB_CODE_SERV_OUT_OF_MEMORY; memory allocated before a failure is
 *         not released here
 */
static int32_t vnodeCreateFilterInfo(void *pQInfo, SQuery *pQuery) {
  for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
    if (pQuery->colList[i].info.numOfFilters > 0) {
      pQuery->numOfFilterCols++;
    }
  }

  if (pQuery->numOfFilterCols == 0) {
    return TSDB_CODE_SUCCESS;
  }

  pQuery->pFilterInfo = calloc(1, sizeof(SSingleColumnFilterInfo) * pQuery->numOfFilterCols);
  if (pQuery->pFilterInfo == NULL) {
    return TSDB_CODE_SERV_OUT_OF_MEMORY;
  }

  for (int32_t i = 0, j = 0; i < pQuery->numOfCols; ++i) {
    if (pQuery->colList[i].info.numOfFilters > 0) {
      SSingleColumnFilterInfo *pFilterInfo = &pQuery->pFilterInfo[j];

      memcpy(&pFilterInfo->info, &pQuery->colList[i], sizeof(SColumnInfoData));
      // the raw filter list of the copied column info is cleared in this copy
      pFilterInfo->info.info.filters = NULL;

      pFilterInfo->numOfFilters = pQuery->colList[i].info.numOfFilters;
      pFilterInfo->pFilters = calloc(pFilterInfo->numOfFilters, sizeof(SColumnFilterElem));
      if (pFilterInfo->pFilters == NULL) {
        return TSDB_CODE_SERV_OUT_OF_MEMORY;
      }

      for (int32_t f = 0; f < pFilterInfo->numOfFilters; ++f) {
        SColumnFilterElem *pSingleColFilter = &pFilterInfo->pFilters[f];
        pSingleColFilter->filterInfo = pQuery->colList[i].info.filters[f];

        int32_t lower = pSingleColFilter->filterInfo.lowerRelOptr;
        int32_t upper = pSingleColFilter->filterInfo.upperRelOptr;

        if (lower == TSDB_RELATION_INVALID && upper == TSDB_RELATION_INVALID) {
          dError("QInfo:%p invalid filter info", pQInfo);
          return TSDB_CODE_INVALID_QUERY_MSG;
        }

        int16_t type = pQuery->colList[i].info.type;
        int16_t bytes = pQuery->colList[i].info.bytes;

        __filter_func_t *rangeFilterArray = NULL;  // vnodeGetRangeFilterFuncArray(type);
        __filter_func_t *filterArray = NULL;       // vnodeGetValueFilterFuncArray(type);

        if (rangeFilterArray == NULL && filterArray == NULL) {
          dError("QInfo:%p failed to get filter function, invalid data type:%d", pQInfo, type);
          return TSDB_CODE_INVALID_QUERY_MSG;
        }

        if ((lower == TSDB_RELATION_GREATER_EQUAL || lower == TSDB_RELATION_GREATER) &&
            (upper == TSDB_RELATION_LESS_EQUAL || upper == TSDB_RELATION_LESS)) {
          // two-sided range: the slot depends on whether each bound is inclusive
          if (lower == TSDB_RELATION_GREATER_EQUAL) {
            if (upper == TSDB_RELATION_LESS_EQUAL) {
              pSingleColFilter->fp = rangeFilterArray[4];
            } else {
              pSingleColFilter->fp = rangeFilterArray[2];
            }
          } else {
            if (upper == TSDB_RELATION_LESS_EQUAL) {
              pSingleColFilter->fp = rangeFilterArray[3];
            } else {
              pSingleColFilter->fp = rangeFilterArray[1];
            }
          }
        } else {  // set callback filter function
          if (lower != TSDB_RELATION_INVALID) {
            pSingleColFilter->fp = filterArray[lower];

            if (upper != TSDB_RELATION_INVALID) {
              // the stray extra argument without a conversion was dropped
              dError("pQInfo:%p failed to get filter function, invalid filter condition", pQInfo);
              return TSDB_CODE_INVALID_QUERY_MSG;
            }
          } else {
            pSingleColFilter->fp = filterArray[upper];
          }
        }
        assert(pSingleColFilter->fp != NULL);
        pSingleColFilter->bytes = bytes;
      }

      j++;
    }
  }

  return TSDB_CODE_SUCCESS;
}

5659 5660 5661 5662 5663
/**
 * Resolve, for every output expression, the position of its source column in pQuery->colList.
 *
 * Arithmetic expressions and tag columns are skipped. For the remaining expressions the
 * column id is searched in pQuery->colList and, when found, colInfo.colIndex is overwritten
 * with the matching position. An expression whose column id is not present is left untouched.
 *
 * @param pQuery query object; must be non-NULL and have pSelectExpr set
 */
static void doUpdateExprColumnIndex(SQuery* pQuery) {
  // check the pointer itself before dereferencing it
  assert(pQuery != NULL && pQuery->pSelectExpr != NULL);

  for (int32_t k = 0; k < pQuery->numOfOutputCols; ++k) {
    SSqlFuncExprMsg* pSqlExprMsg = &pQuery->pSelectExpr[k].pBase;
    if (pSqlExprMsg->functionId == TSDB_FUNC_ARITHM || pSqlExprMsg->colInfo.flag == TSDB_COL_TAG) {
      continue;
    }

    SColIndex* pColIndexEx = &pSqlExprMsg->colInfo;
    for (int32_t f = 0; f < pQuery->numOfCols; ++f) {
      if (pColIndexEx->colId == pQuery->colList[f].info.colId) {
        pColIndexEx->colIndex = f;
        break;
      }
    }
  }
}

5692
static SQInfo *createQInfoImpl(SQueryTableMsg *pQueryMsg, SSqlGroupbyExpr *pGroupbyExpr, SSqlFunctionExpr *pExprs,
5693
                               STableGroupInfo *groupInfo) {
5694 5695
  SQInfo *pQInfo = (SQInfo *)calloc(1, sizeof(SQInfo));
  if (pQInfo == NULL) {
5696
    return NULL;
5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727
  }

  SQuery *pQuery = calloc(1, sizeof(SQuery));
  pQInfo->runtimeEnv.pQuery = pQuery;

  int16_t numOfCols = pQueryMsg->numOfCols;
  int16_t numOfOutputCols = pQueryMsg->numOfOutputCols;

  pQuery->numOfCols = numOfCols;
  pQuery->numOfOutputCols = numOfOutputCols;

  pQuery->limit.limit = pQueryMsg->limit;
  pQuery->limit.offset = pQueryMsg->offset;

  pQuery->order.order = pQueryMsg->order;
  pQuery->order.orderColId = pQueryMsg->orderColId;

  pQuery->pSelectExpr = pExprs;
  pQuery->pGroupbyExpr = pGroupbyExpr;

  pQuery->intervalTime = pQueryMsg->intervalTime;

  pQuery->slidingTime = pQueryMsg->slidingTime;
  pQuery->slidingTimeUnit = pQueryMsg->slidingTimeUnit;

  pQuery->interpoType = pQueryMsg->interpoType;

  pQuery->colList = calloc(1, sizeof(SSingleColumnFilterInfo) * numOfCols);
  if (pQuery->colList == NULL) {
    goto _clean_memory;
  }
5728

H
hjxilinx 已提交
5729 5730
  for (int16_t i = 0; i < numOfCols; ++i) {
    pQuery->colList[i].info = pQueryMsg->colList[i];
5731 5732 5733
    
    SColumnInfo *pColInfo = &pQuery->colList[i].info;
    pColInfo->filters = NULL;
5734 5735 5736 5737 5738 5739 5740 5741 5742
    //    if (colList[i].numOfFilters > 0) {
    //      pColInfo->filters = calloc(1, colList[i].numOfFilters * sizeof(SColumnFilterInfo));
    //
    //      for (int32_t j = 0; j < colList[i].numOfFilters; ++j) {
    //        tscColumnFilterInfoCopy(&pColInfo->filters[j], &colList[i].filters[j]);
    //      }
    //    } else {
    //      pQuery->colList[i].data.filters = NULL;
    //    }
H
hjxilinx 已提交
5743
  }
5744 5745 5746 5747 5748 5749

  // calculate the result row size
  for (int16_t col = 0; col < numOfOutputCols; ++col) {
    assert(pExprs[col].resBytes > 0);
    pQuery->rowSize += pExprs[col].resBytes;
  }
5750 5751
  
  doUpdateExprColumnIndex(pQuery);
5752 5753 5754 5755 5756 5757 5758

  int32_t ret = vnodeCreateFilterInfo(pQInfo, pQuery);
  if (ret != TSDB_CODE_SUCCESS) {
    goto _clean_memory;
  }

  // prepare the result buffer
5759
  pQuery->sdata = (SData **)calloc(pQuery->numOfOutputCols, POINTER_BYTES);
5760 5761 5762 5763
  if (pQuery->sdata == NULL) {
    goto _clean_memory;
  }

H
hjxilinx 已提交
5764
  // set the output buffer capacity
H
hjxilinx 已提交
5765
  pQuery->rec.capacity = 4096;
5766
  pQuery->rec.threshold = 4000;
H
hjxilinx 已提交
5767
  
5768 5769 5770 5771
  for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
    assert(pExprs[col].interResBytes >= pExprs[col].resBytes);

    // allocate additional memory for interResults that are usually larger then final results
H
hjxilinx 已提交
5772
    size_t size = (pQuery->rec.capacity + 1) * pExprs[col].resBytes + pExprs[col].interResBytes + sizeof(SData);
5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789
    pQuery->sdata[col] = (SData *)calloc(1, size);
    if (pQuery->sdata[col] == NULL) {
      goto _clean_memory;
    }
  }

  if (pQuery->interpoType != TSDB_INTERPO_NONE) {
    pQuery->defaultVal = malloc(sizeof(int64_t) * pQuery->numOfOutputCols);
    if (pQuery->defaultVal == NULL) {
      goto _clean_memory;
    }

    // the first column is the timestamp
    memcpy(pQuery->defaultVal, (char *)pQueryMsg->defaultVal, pQuery->numOfOutputCols * sizeof(int64_t));
  }

  // to make sure third party won't overwrite this structure
5790
  pQInfo->signature = pQInfo;
5791
  pQInfo->groupInfo = *groupInfo;
5792

5793
  pQuery->pos = -1;
H
hjxilinx 已提交
5794
  
5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805
  pQuery->window.skey = pQueryMsg->window.skey;
  pQuery->window.ekey = pQueryMsg->window.ekey;
  pQuery->lastKey     = pQuery->window.skey;
  
  if (sem_init(&pQInfo->dataReady, 0, 0) != 0) {
    dError("QInfo:%p init dataReady sem failed, reason:%s", pQInfo, strerror(errno));
    goto _clean_memory;
  }
  
  vnodeParametersSafetyCheck(pQuery);
  
H
hjxilinx 已提交
5806
  dTrace("qmsg:%p QInfo:%p created", pQueryMsg, pQInfo);
5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829
  return pQInfo;

_clean_memory:
  tfree(pQuery->defaultVal);

  if (pQuery->sdata != NULL) {
    for (int16_t col = 0; col < pQuery->numOfOutputCols; ++col) {
      tfree(pQuery->sdata[col]);
    }
  }

  tfree(pQuery->sdata);
  tfree(pQuery->pFilterInfo);
  tfree(pQuery->colList);

  tfree(pExprs);
  tfree(pGroupbyExpr);

  tfree(pQInfo);

  return NULL;
}

H
hjxilinx 已提交
5830
static bool isValidQInfo(void *param) {
H
hjxilinx 已提交
5831 5832 5833 5834
  SQInfo *pQInfo = (SQInfo *)param;
  if (pQInfo == NULL) {
    return false;
  }
5835

H
hjxilinx 已提交
5836 5837 5838 5839
  /*
   * pQInfo->signature may be changed by another thread, so we assign value of signature
   * into local variable, then compare by using local variable
   */
5840
  uint64_t sig = (uint64_t) pQInfo->signature;
H
hjxilinx 已提交
5841 5842 5843
  return (sig == (uint64_t)pQInfo);
}

H
hjxilinx 已提交
5844
static void freeQInfo(SQInfo *pQInfo);
5845
static int32_t initQInfo(SQueryTableMsg *pQueryMsg, void* tsdb, SQInfo *pQInfo, bool isSTable) {
H
hjxilinx 已提交
5846
  int32_t code = TSDB_CODE_SUCCESS;
5847
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;
H
hjxilinx 已提交
5848 5849 5850 5851 5852
  
  STSBuf *pTSBuf = NULL;
  if (pQueryMsg->tsLen > 0) {  // open new file to save the result
    char *tsBlock = (char *)pQueryMsg + pQueryMsg->tsOffset;
    pTSBuf = tsBufCreateFromCompBlocks(tsBlock, pQueryMsg->tsNumOfBlocks, pQueryMsg->tsLen, pQueryMsg->tsOrder);
5853
    
H
hjxilinx 已提交
5854 5855 5856
    tsBufResetPos(pTSBuf);
    tsBufNextPos(pTSBuf);
  }
5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867
  
  // only the successful complete requries the sem_post/over = 1 operations.
  if ((QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.skey > pQuery->window.ekey)) ||
      (!QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.ekey > pQuery->window.skey))) {
    dTrace("QInfo:%p no result in time range %" PRId64 "-%" PRId64 ", order %d", pQInfo, pQuery->window.skey,
           pQuery->window.ekey, pQuery->order.order);
    
    sem_post(&pQInfo->dataReady);
    setQueryStatus(pQuery, QUERY_COMPLETED);
    return TSDB_CODE_SUCCESS;
  }
H
hjxilinx 已提交
5868 5869

  // filter the qualified
5870
  if ((code = doInitQInfo(pQInfo, pTSBuf, tsdb, isSTable)) != TSDB_CODE_SUCCESS) {
H
hjxilinx 已提交
5871 5872 5873 5874 5875 5876 5877 5878 5879
    goto _error;
  }

  // dTrace("QInfo:%p set query flag and prepare runtime environment completed, ref:%d, wait for schedule", pQInfo,
  //       pQInfo->refCount);
  return code;

_error:
  // table query ref will be decrease during error handling
5880
  freeQInfo(pQInfo);
H
hjxilinx 已提交
5881 5882 5883 5884 5885 5886 5887 5888
  return code;
}

/**
 * Release a SQInfo and everything reachable from it.
 *
 * Does nothing when pQInfo does not pass isValidQInfo(). Otherwise the query is flagged
 * as killed, the output buffers, semaphore, runtime environment, filters, expressions and
 * group list are released, and finally the SQInfo itself is zeroed (which also clears its
 * signature) and freed.
 *
 * @param pQInfo query info to destroy, may be NULL or already invalid
 */
static void freeQInfo(SQInfo *pQInfo) {
  if (!isValidQInfo(pQInfo)) {
    return;
  }

  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;
  setQueryKilled(pQInfo);

  dTrace("QInfo:%p start to free QInfo", pQInfo);

  // the per-column buffers can only be walked when the pointer array itself exists
  if (pQuery->sdata != NULL) {
    for (int32_t col = 0; col < pQuery->numOfOutputCols; ++col) {
      tfree(pQuery->sdata[col]);
    }
  }

  sem_destroy(&(pQInfo->dataReady));
  teardownQueryRuntimeEnv(&pQInfo->runtimeEnv);

  if (pQuery->pFilterInfo != NULL) {
    for (int32_t i = 0; i < pQuery->numOfFilterCols; ++i) {
      SSingleColumnFilterInfo *pColFilter = &pQuery->pFilterInfo[i];
      if (pColFilter->numOfFilters > 0) {
        tfree(pColFilter->pFilters);
      }
    }
  }

  tfree(pQuery->pFilterInfo);
  tfree(pQuery->colList);
  tfree(pQuery->sdata);

  if (pQuery->pSelectExpr != NULL) {
    for (int32_t i = 0; i < pQuery->numOfOutputCols; ++i) {
      SSqlBinaryExprInfo *pBinExprInfo = &pQuery->pSelectExpr[i].binExprInfo;

      if (pBinExprInfo->numOfCols > 0) {
        tfree(pBinExprInfo->pReqColumns);
        tExprTreeDestroy(&pBinExprInfo->pBinExpr, NULL);
      }
    }

    tfree(pQuery->pSelectExpr);
  }

  tfree(pQuery->defaultVal);
  tfree(pQuery->pGroupbyExpr);
  tfree(pQuery);

  taosArrayDestroy(pQInfo->groupInfo.pGroupList);

  dTrace("QInfo:%p QInfo is freed", pQInfo);

  // destroy signature, in order to avoid the query process pass the object safety check
  memset(pQInfo, 0, sizeof(SQInfo));
  tfree(pQInfo);
}

H
hjxilinx 已提交
5947 5948
/**
 * Compute the payload size, in bytes, of the current result set.
 *
 * For a regular query the size is rowSize * (*numOfRows). For a ts-comp query with at
 * least one row the result lives in a file whose path is stored in sdata[0]->data; the
 * file size is returned and also written back to *numOfRows.
 * TODO handle the case that the file is too large to send back one time
 *
 * @param pQInfo    query info
 * @param numOfRows in: number of rows; out: file size for ts-comp queries
 * @return payload size in bytes, or 0 if the ts-comp file cannot be stat'ed
 */
static size_t getResultSize(SQInfo *pQInfo, int64_t *numOfRows) {
  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;

  // ordinary result: fixed-width rows
  if (!isTSCompQuery(pQuery) || (*numOfRows) <= 0) {
    return pQuery->rowSize * (*numOfRows);
  }

  struct stat fileInfo;
  if (stat(pQuery->sdata[0]->data, &fileInfo) != 0) {
    dError("QInfo:%p failed to get file info, path:%s, reason:%s", pQInfo, pQuery->sdata[0]->data, strerror(errno));
    return 0;
  }

  *numOfRows = fileInfo.st_size;
  return fileInfo.st_size;
}
5968

H
hjxilinx 已提交
5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991
/**
 * Copy the current result set into the response buffer and update the running total.
 *
 * For a ts-comp query the result is stored in a temporary file whose path is held in
 * sdata[0]->data: the whole file is read into data and the file is removed afterwards.
 * For every other query the rows are copied by doCopyQueryResultToMsg().
 *
 * I/O failures are logged; the function still returns TSDB_CODE_SUCCESS, as before.
 * TODO if interpolation exists, the result may be dumped to the client in several rounds
 *
 * @param pQInfo query info
 * @param data   destination buffer supplied by the caller
 * @return TSDB_CODE_SUCCESS
 */
static int32_t doDumpQueryResult(SQInfo *pQInfo, char *data) {
  // the remained number of retrieved rows, not the interpolated result
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  // load data from file to msg buffer
  if (isTSCompQuery(pQuery)) {
    int32_t fd = open(pQuery->sdata[0]->data, O_RDONLY, 0666);

    // make sure file exist
    if (FD_VALID(fd)) {
      // lseek reports failure with a negative value, so keep the signed type until it is checked
      off_t fileSize = lseek(fd, 0, SEEK_END);

      if (fileSize < 0 || lseek(fd, 0, SEEK_SET) < 0) {
        dError("QInfo:%p failed to get size of ts-comp file, path:%s, reason:%s", pQInfo, pQuery->sdata[0]->data,
               strerror(errno));
      } else {
        size_t s = (size_t)fileSize;
        dTrace("QInfo:%p ts comp data return, file:%s, size:%zu", pQInfo, pQuery->sdata[0]->data, s);

        // read() may deliver fewer bytes than requested, keep reading until the file is consumed
        size_t done = 0;
        while (done < s) {
          ssize_t n = read(fd, data + done, s - done);
          if (n < 0 && errno == EINTR) {
            continue;
          }

          if (n <= 0) {
            dError("QInfo:%p failed to read ts-comp file, path:%s, reason:%s", pQInfo, pQuery->sdata[0]->data,
                   (n < 0) ? strerror(errno) : "unexpected end of file");
            break;
          }

          done += (size_t)n;
        }
      }

      close(fd);

      unlink(pQuery->sdata[0]->data);
    } else {
      dError("QInfo:%p failed to open tmp file to send ts-comp data to client, path:%s, reason:%s", pQInfo,
             pQuery->sdata[0]->data, strerror(errno));
    }
  } else {
    doCopyQueryResultToMsg(pQInfo, pQuery->rec.rows, data);
  }

  pQuery->rec.total += pQuery->rec.rows;

  // the row counters are wider than int, print them with a matching 64-bit conversion
  dTrace("QInfo:%p current:%" PRId64 ", total:%" PRId64, pQInfo, (int64_t)pQuery->rec.rows,
         (int64_t)pQuery->rec.total);

  return TSDB_CODE_SUCCESS;
}

H
hjxilinx 已提交
6003
/**
 * Build a query handle from a query message.
 *
 * The message is converted, the output and group-by expressions are created, the table
 * group is resolved (through tag filtering for a super-table query, otherwise for the single
 * listed table) and finally the SQInfo is created and initialized.
 *
 * @param tsdb      storage handle
 * @param pQueryMsg query message, must not be NULL
 * @param pQInfo    out: the new query handle; set to NULL when no handle is produced
 *                  (on failure, or when a super-table query matches no table)
 * @return TSDB_CODE_SUCCESS or the error code of the step that failed
 */
int32_t qCreateQueryInfo(void* tsdb, SQueryTableMsg *pQueryMsg, qinfo_t *pQInfo) {
  assert(pQueryMsg != NULL);

  int32_t code = TSDB_CODE_SUCCESS;

  char* tagCond = NULL;
  SArray *pTableIdList = NULL;
  SSqlFuncExprMsg** pExprMsg = NULL;
  SColIndex* pGroupColIndex = NULL;

  // declared up front so that every jump to _query_over sees initialized values
  SSqlFunctionExpr *pExprs = NULL;
  SSqlGroupbyExpr *pGroupbyExpr = NULL;
  bool isSTableQuery = false;

  // createQInfoImpl copies the group info by value, so automatic storage is sufficient
  STableGroupInfo groupInfo = {0};

  *pQInfo = NULL;

  if ((code = convertQueryMsg(pQueryMsg, &pTableIdList, &pExprMsg, &tagCond, &pGroupColIndex)) != TSDB_CODE_SUCCESS) {
    return code;
  }

  if (pQueryMsg->numOfTables <= 0) {
    dError("Invalid number of tables to query, numOfTables:%d", pQueryMsg->numOfTables);
    code = TSDB_CODE_INVALID_QUERY_MSG;
    goto _query_over;
  }

  // todo check vnode status
  if (pTableIdList == NULL || taosArrayGetSize(pTableIdList) == 0) {
    dError("qmsg:%p, SQueryTableMsg wrong format", pQueryMsg);
    code = TSDB_CODE_INVALID_QUERY_MSG;
    goto _query_over;
  }

  if ((code = createSqlFunctionExprFromMsg(pQueryMsg, &pExprs, pExprMsg)) != TSDB_CODE_SUCCESS) {
    // NOTE(review): what the callee leaves in pExprs on failure is not visible here, so it is
    // not released by this function - confirm against createSqlFunctionExprFromMsg
    pExprs = NULL;
    goto _query_over;
  }

  pGroupbyExpr = createGroupbyExprFromMsg(pQueryMsg, pGroupColIndex, &code);
  if ((pGroupbyExpr == NULL && pQueryMsg->numOfGroupCols != 0) || code != TSDB_CODE_SUCCESS) {
    if (code == TSDB_CODE_SUCCESS) {
      // a missing group-by expression must not be reported as success
      code = TSDB_CODE_SERV_OUT_OF_MEMORY;
    }
    goto _query_over;
  }

  if ((pQueryMsg->queryType & TSDB_QUERY_TYPE_STABLE_QUERY) != 0) {
    isSTableQuery = true;

    STableId* id = taosArrayGet(pTableIdList, 0);
    id->uid = -1;  //todo fix me

    /*int32_t ret =*/ tsdbQueryTags(tsdb, id->uid, tagCond, pQueryMsg->tagCondLen, &groupInfo, pGroupColIndex, pQueryMsg->numOfGroupCols);
    if (groupInfo.numOfTables == 0) { // no qualified tables no need to do query
      code = TSDB_CODE_SUCCESS;
      goto _query_over;
    }
  } else {
    assert(taosArrayGetSize(pTableIdList) == 1);

    STableId* id = taosArrayGet(pTableIdList, 0);
    if ((code = tsdbGetOneTableGroup(tsdb, id->uid, &groupInfo)) != TSDB_CODE_SUCCESS) {
      goto _query_over;
    }
  }

  (*pQInfo) = createQInfoImpl(pQueryMsg, pGroupbyExpr, pExprs, &groupInfo);

  // From here on the expressions belong to the SQInfo: createQInfoImpl releases them on its
  // cleanup path and freeQInfo releases them when the handle is destroyed.
  pExprs = NULL;
  pGroupbyExpr = NULL;

  if ((*pQInfo) == NULL) {
    code = TSDB_CODE_SERV_OUT_OF_MEMORY;
    goto _query_over;  // never hand a NULL QInfo to initQInfo
  }

  code = initQInfo(pQueryMsg, tsdb, *pQInfo, isSTableQuery);
  if (code != TSDB_CODE_SUCCESS) {
    // initQInfo has already destroyed the QInfo through freeQInfo
    *pQInfo = NULL;
  }

_query_over:
  // only non-NULL when the failure happened before the hand-over to createQInfoImpl
  tfree(pExprs);
  tfree(pGroupbyExpr);

  // the id list is only consulted inside this function, release it on every path
  taosArrayDestroy(pTableIdList);

  return code;
}

H
hjxilinx 已提交
6093
/**
 * Public entry point to dispose of a query handle.
 *
 * Logs the completion of the query and forwards the handle to freeQInfo().
 *
 * @param pQInfo query handle to destroy
 */
void qDestroyQueryInfo(qinfo_t pQInfo) {
  SQInfo *pInfo = (SQInfo *)pQInfo;

  dTrace("QInfo:%p query completed", pInfo);
  freeQInfo(pInfo);
}

H
hjxilinx 已提交
6098 6099 6100
/**
 * Execute one round of the query behind a handle.
 *
 * Returns immediately when the handle is NULL, is not self-signed, or the query has been
 * killed. Otherwise dispatches to stableQueryImpl() or tableQueryImpl() depending on
 * runtimeEnv.stableQuery.
 *
 * @param qinfo query handle
 */
void qTableQuery(qinfo_t qinfo) {
  SQInfo* pQInfo = (SQInfo*) qinfo;

  bool alive = (pQInfo != NULL) && (pQInfo->signature == pQInfo);
  if (!alive) {
    dTrace("%p freed abort query", pQInfo);
    return;
  }

  if (isQueryKilled(pQInfo)) {
    dTrace("QInfo:%p it is already killed, abort", pQInfo);
    return;
  }

  dTrace("QInfo:%p query task is launched", pQInfo);

  if (!pQInfo->runtimeEnv.stableQuery) {
    tableQueryImpl(pQInfo);
  } else {
    stableQueryImpl(pQInfo);
  }
}

H
hjxilinx 已提交
6122 6123 6124
/**
 * Wait until the query behind a handle has data ready and report its status code.
 *
 * @param qinfo query handle
 * @return TSDB_CODE_INVALID_QHANDLE for an invalid handle, otherwise pQInfo->code
 *         (returned immediately, without waiting, when the query has been killed)
 */
int32_t qRetrieveQueryResultInfo(qinfo_t qinfo) {
  SQInfo* pQInfo = (SQInfo*) qinfo;

  // isValidQInfo rejects NULL as well
  if (!isValidQInfo(pQInfo)) {
    return TSDB_CODE_INVALID_QHANDLE;
  }

  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;
  if (isQueryKilled(pQInfo)) {
    dTrace("QInfo:%p query is killed, code:%d", pQInfo, pQInfo->code);
    return pQInfo->code;
  }

  // sem_wait may return early when interrupted by a signal; retry so that the result is
  // only reported after the semaphore has really been acquired
  while (sem_wait(&pQInfo->dataReady) != 0 && errno == EINTR) {
  }

  // rec.rows is wider than int, print it with a matching 64-bit conversion
  dTrace("QInfo:%p retrieve result info, rowsize:%d, rows:%" PRId64 ", code:%d", pQInfo, pQuery->rowSize,
         (int64_t)pQuery->rec.rows, pQInfo->code);

  return pQInfo->code;
}
6141

H
hjxilinx 已提交
6142 6143 6144
/**
 * Tell whether another retrieve round is expected for a query handle.
 *
 * @param qinfo query handle
 * @return false when the handle is NULL, not self-signed, carries an error code, or the
 *         query status contains QUERY_OVER; true when the status contains
 *         QUERY_RESBUF_FULL or QUERY_COMPLETED
 */
bool qHasMoreResultsToRetrieve(qinfo_t qinfo) {
  SQInfo* pQInfo = (SQInfo*) qinfo;

  if (pQInfo == NULL || pQInfo->signature != pQInfo || pQInfo->code != TSDB_CODE_SUCCESS) {
    return false;
  }

  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;
  if (Q_STATUS_EQUAL(pQuery->status, QUERY_OVER)) {
    return false;
  }

  if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL) || Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
    return true;
  }

  // any other status is unexpected here
  assert(0);

  // reached only when assertions are compiled out: never fall off the end of a non-void function
  return false;
}

H
hjxilinx 已提交
6161 6162 6163
/**
 * Build the retrieve response for a query handle.
 *
 * Allocates the response with rpcMallocCont(), fills in the header fields and, when rows
 * are available and the query carries no error, copies the result payload into it. When no
 * payload is dumped the query status is set to QUERY_OVER.
 *
 * @param qinfo   query handle
 * @param pRsp    out: allocated response, NULL when the allocation fails
 * @param contLen out: total response length in bytes, 0 when the allocation fails
 * @return TSDB_CODE_INVALID_QHANDLE for an invalid handle, TSDB_CODE_SERV_OUT_OF_MEMORY when
 *         the response cannot be allocated, otherwise the query code or the result of
 *         doDumpQueryResult()
 */
int32_t qDumpRetrieveResult(qinfo_t qinfo, SRetrieveTableRsp** pRsp, int32_t* contLen) {
  SQInfo* pQInfo = (SQInfo*) qinfo;

  // isValidQInfo rejects NULL as well
  if (!isValidQInfo(pQInfo)) {
    return TSDB_CODE_INVALID_QHANDLE;
  }

  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;
  size_t size = getResultSize(pQInfo, &pQuery->rec.rows);
  *contLen = size + sizeof(SRetrieveTableRsp);

  *pRsp = (SRetrieveTableRsp *)rpcMallocCont(*contLen);
  if (*pRsp == NULL) {
    // do not touch the response header when the buffer could not be allocated
    *contLen = 0;
    return TSDB_CODE_SERV_OUT_OF_MEMORY;
  }

  (*pRsp)->numOfRows = htonl(pQuery->rec.rows);

  int32_t code = pQInfo->code;
  if (code == TSDB_CODE_SUCCESS) {
    (*pRsp)->offset = htobe64(pQuery->limit.offset);
    (*pRsp)->useconds = htobe64(pQInfo->elapsedTime);
  } else {
    (*pRsp)->offset = 0;
    (*pRsp)->useconds = 0;
  }

  if (pQuery->rec.rows > 0 && code == TSDB_CODE_SUCCESS) {
    code = doDumpQueryResult(pQInfo, (*pRsp)->data);
  } else {
    setQueryStatus(pQuery, QUERY_OVER);
    code = pQInfo->code;
  }

  // set the flag explicitly in both cases; 1 notifies the client that no more result follows
  if (isQueryKilled(pQInfo) || Q_STATUS_EQUAL(pQuery->status, QUERY_OVER)) {
    (*pRsp)->completed = 1;
  } else {
    (*pRsp)->completed = 0;
  }

  return code;
}