qExecutor.c 223.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#include "os.h"
H
Haojun Liao 已提交
16 17
#include "qFill.h"
#include "taosmsg.h"
18 19
#include "tcache.h"
#include "tglobal.h"
20

H
Haojun Liao 已提交
21
#include "exception.h"
22
#include "hash.h"
H
Haojun Liao 已提交
23 24 25 26
#include "qAst.h"
#include "qExecutor.h"
#include "qResultbuf.h"
#include "qUtil.h"
H
hjxilinx 已提交
27
#include "query.h"
S
slguan 已提交
28
#include "queryLog.h"
29 30 31
#include "tlosertree.h"
#include "tscompression.h"
#include "ttime.h"
32 33 34 35 36

/**
 * Check whether the primary column is loaded by default; otherwise the program is
 * forced to load the primary column explicitly.
 */
37 38
// true when at least one bit of the mask `s` is set in `p`
#define Q_STATUS_EQUAL(p, s)  (((p) & (s)) != 0)
// true when the TSDB_COL_TAG bit is set in the column flag `f`
#define TSDB_COL_IS_TAG(f)    (((f)&TSDB_COL_TAG) != 0)
#define QUERY_IS_ASC_QUERY(q) (GET_FORWARD_DIRECTION_FACTOR((q)->order.order) == QUERY_ASC_FORWARD_STEP)

// test / set the scan flag stored in the runtime environment
#define IS_MASTER_SCAN(runtime)        ((runtime)->scanFlag == MASTER_SCAN)
#define IS_REVERSE_SCAN(runtime)       ((runtime)->scanFlag == REVERSE_SCAN)
#define SET_MASTER_SCAN_FLAG(runtime)  ((runtime)->scanFlag = MASTER_SCAN)
#define SET_REVERSE_SCAN_FLAG(runtime) ((runtime)->scanFlag = REVERSE_SCAN)

// recover the enclosing SQInfo from a pointer to its embedded runtimeEnv member
#define GET_QINFO_ADDR(x) ((SQInfo *)((char *)(x)-offsetof(SQInfo, runtimeEnv)))

#define GET_COL_DATA_POS(query, index, step) ((query)->pos + (index) * (step))
// flip `n` between TSDB_ORDER_ASC and TSDB_ORDER_DESC in place
#define SWITCH_ORDER(n) (((n) = ((n) == TSDB_ORDER_ASC) ? TSDB_ORDER_DESC : TSDB_ORDER_ASC))

#define SDATA_BLOCK_INITIALIZER (SDataBlockInfo) {{0}, 0}

53
/* Query status bits; they are combined and tested as a bit mask. */
enum {
  // set when the query starts to execute
  QUERY_NOT_COMPLETED = 0x1u,

  /* The result output buffer is full and the current query is paused.
   * This status only exists for group-by and diff/add/division/multiply queries.
   */
  QUERY_RESBUF_FULL = 0x2u,

  /* The query is over:
   * 1. used by queries that produce a single result row, e.g. count/sum/first/last/avg etc.
   * 2. also set when all data within the queried time window has been processed
   */
  QUERY_COMPLETED = 0x4u,

  /* Set when not all results have been returned to the client yet; usually
   * used for interval queries with the interpolation option.
   */
  QUERY_OVER = 0x8u,
};
73 74

/* Result codes for a timestamp-join comparison. */
enum {
  TS_JOIN_TS_EQUAL       = 0,
  TS_JOIN_TS_NOT_EQUALS  = 1,
  TS_JOIN_TAG_NOT_EQUALS = 2,
};

80
typedef struct {
81 82 83 84 85 86
  int32_t     status;       // query status
  TSKEY       lastKey;      // the lastKey value before query executed
  STimeWindow w;            // whole query time window
  STimeWindow curWindow;    // current query window
  int32_t     windowIndex;  // index of active time window result for interval query
  STSCursor   cur;
87 88
} SQueryStatusInfo;

H
Haojun Liao 已提交
89
/*
 * Disabled allocation fault-injection shim: when enabled, malloc/calloc are
 * redirected to wrappers that return NULL whenever rand() % 5 <= 1, so that
 * out-of-memory handling paths can be exercised.
 */
#if 0
static UNUSED_FUNC void *u_malloc (size_t size) {
  uint32_t v = rand();
  if (v % 5 <= 1) {
    return NULL;
  } else {
    return malloc(size);
  }
}

static UNUSED_FUNC void* u_calloc(size_t num, size_t size) {
  uint32_t v = rand();
  if (v % 5 <= 1) {
    return NULL;
  } else {
    return calloc(num, size);
  }
}

#define calloc  u_calloc
#define malloc  u_malloc
#endif
H
Haojun Liao 已提交
111

112
#define CLEAR_QUERY_STATUS(q, st)   ((q)->status &= (~(st)))
H
Haojun Liao 已提交
113 114 115
#define GET_NUM_OF_TABLEGROUP(q)    taosArrayGetSize((q)->tableqinfoGroupInfo.pGroupList)
#define GET_TABLEGROUP(q, _index)   ((SArray*) taosArrayGetP((q)->tableqinfoGroupInfo.pGroupList, (_index)))

116
static void setQueryStatus(SQuery *pQuery, int8_t status);
H
Haojun Liao 已提交
117
static void finalizeQueryResult(SQueryRuntimeEnv *pRuntimeEnv);
118

H
Haojun Liao 已提交
119
#define QUERY_IS_INTERVAL_QUERY(_q) ((_q)->intervalTime > 0)
120

H
Haojun Liao 已提交
121 122 123 124 125 126 127 128
// previous time window may not be of the same size of pQuery->intervalTime
#define GET_NEXT_TIMEWINDOW(_q, tw)                                   \
  do {                                                                \
    int32_t factor = GET_FORWARD_DIRECTION_FACTOR((_q)->order.order); \
    (tw)->skey += ((_q)->slidingTime * factor);                       \
    (tw)->ekey = (tw)->skey + ((_q)->intervalTime - 1);               \
  } while (0)

H
hjxilinx 已提交
129
// todo move to utility
130
static int32_t mergeIntoGroupResultImpl(SQInfo *pQInfo, SArray *group);
131

H
hjxilinx 已提交
132
static void setWindowResOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, SWindowResult *pResult);
H
Haojun Liao 已提交
133
static void setWindowResOutputBufInitCtx(SQueryRuntimeEnv *pRuntimeEnv, SWindowResult *pResult);
134 135
static void resetMergeResultBuf(SQuery *pQuery, SQLFunctionCtx *pCtx, SResultInfo *pResultInfo);
static bool functionNeedToExecute(SQueryRuntimeEnv *pRuntimeEnv, SQLFunctionCtx *pCtx, int32_t functionId);
136

137 138 139
static void setExecParams(SQuery *pQuery, SQLFunctionCtx *pCtx, void* inputData, TSKEY *tsCol, SDataBlockInfo* pBlockInfo,
                          SDataStatis *pStatis, void *param, int32_t colIndex);

140
static void initCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv);
H
Haojun Liao 已提交
141
static void destroyTableQueryInfo(STableQueryInfo *pTableQueryInfo);
142 143
static void resetCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv);
static bool hasMainOutput(SQuery *pQuery);
H
hjxilinx 已提交
144
static void buildTagQueryResult(SQInfo *pQInfo);
145

146
static int32_t setAdditionalInfo(SQInfo *pQInfo, void *pTable, STableQueryInfo *pTableQueryInfo);
147
static int32_t flushFromResultBuf(SQInfo *pQInfo);
148

149
/**
 * Apply every column filter of the query to the row at position elemPos.
 *
 * A row qualifies only when, for each filtered column, the value is not NULL
 * and at least one of that column's filter elements accepts it.
 *
 * @param pQuery   query holding the filter columns (pFilterInfo / numOfFilterCols)
 * @param elemPos  row index into each filter column's data buffer
 * @return true when the row passes all column filters, false otherwise
 */
bool doFilterData(SQuery *pQuery, int32_t elemPos) {
  for (int32_t k = 0; k < pQuery->numOfFilterCols; ++k) {
    SSingleColumnFilterInfo *pFilterInfo = &pQuery->pFilterInfo[k];

    // locate the element of this column for the given row
    char *pElem = pFilterInfo->pData + pFilterInfo->info.bytes * elemPos;
    if (isNull(pElem, pFilterInfo->info.type)) {
      return false;
    }

    // the filters of a single column are OR-ed together
    bool qualified = false;
    for (int32_t j = 0; j < pFilterInfo->numOfFilters; ++j) {
      SColumnFilterElem *pFilterElem = &pFilterInfo->pFilters[j];

      if (pFilterElem->fp(pFilterElem, pElem, pElem)) {
        qualified = true;
        break;
      }
    }

    if (!qualified) {
      return false;
    }
  }

  return true;
}

/**
 * Return the largest numOfRes found among the output columns' result infos.
 *
 * When the query has a main output, the ts/tag/tagprj columns are skipped
 * because they do not decide the number of output rows.
 *
 * @param pRuntimeEnv  runtime environment holding the query and function contexts
 * @return the maximum number of results over the considered columns (0 if none)
 */
int64_t getNumOfResult(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  bool    hasMainFunction = hasMainOutput(pQuery);

  int64_t maxOutput = 0;
  for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
    int32_t functionId = pQuery->pSelectExpr[j].base.functionId;

    /*
     * ts, tag, tagprj function can not decide the output number of current query
     * the number of output result is decided by main output
     */
    if (hasMainFunction &&
        (functionId == TSDB_FUNC_TS || functionId == TSDB_FUNC_TAG || functionId == TSDB_FUNC_TAGPRJ)) {
      continue;
    }

    SResultInfo *pResInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[j]);
    if (pResInfo != NULL && maxOutput < pResInfo->numOfRes) {
      maxOutput = pResInfo->numOfRes;
    }
  }

  assert(maxOutput >= 0);
  return maxOutput;
}

203 204 205 206 207 208 209 210 211
/*
 * The number of results needs to be updated after the offset value has been updated.
 *
 * Every output column except ts/tag/tagprj/ts_dummy has its numOfRes overwritten
 * with numOfRes; the previous value is asserted to be strictly larger.
 */
void updateNumOfResult(SQueryRuntimeEnv *pRuntimeEnv, int32_t numOfRes) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
    SResultInfo *pResInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[j]);

    int16_t functionId = pRuntimeEnv->pCtx[j].functionId;
    if (functionId == TSDB_FUNC_TS || functionId == TSDB_FUNC_TAG || functionId == TSDB_FUNC_TAGPRJ ||
        functionId == TSDB_FUNC_TS_DUMMY) {
      continue;
    }

    assert(pResInfo->numOfRes > numOfRes);
    pResInfo->numOfRes = numOfRes;
  }
}

223 224 225 226 227 228 229 230 231
/* Compute the result id of a group: 200000 plus 10000 for each group index. */
static int32_t getGroupResultId(int32_t groupIndex) {
  const int32_t firstId = 200000;
  const int32_t stride = 10000;

  return firstId + groupIndex * stride;
}

/**
 * Check whether the group-by clause contains a normal (TSDB_COL_NORMAL) column.
 *
 * @param pGroupbyExpr  group-by expression; NULL or an empty column list yields false
 * @return true when at least one group-by column has the TSDB_COL_NORMAL flag
 */
bool isGroupbyNormalCol(SSqlGroupbyExpr *pGroupbyExpr) {
  if (pGroupbyExpr == NULL || pGroupbyExpr->numOfGroupCols == 0) {
    return false;
  }

  for (int32_t i = 0; i < pGroupbyExpr->numOfGroupCols; ++i) {
    SColIndex *pColIndex = taosArrayGet(pGroupbyExpr->columnInfo, i);
    if (pColIndex->flag == TSDB_COL_NORMAL) {
      //make sure the normal column locates at the second position if tbname exists in group by clause
      if (pGroupbyExpr->numOfGroupCols > 1) {
        assert(pColIndex->colIndex > 0);
      }

      return true;
    }
  }

  return false;
}

/**
 * Look up the data type of the first normal column in the group-by clause.
 *
 * @param pQuery        query whose colList is searched for the column id
 * @param pGroupbyExpr  group-by expression, must not be NULL
 * @return the type of the matching column in pQuery->colList, or
 *         TSDB_DATA_TYPE_NULL when no normal group-by column or no match exists
 */
int16_t getGroupbyColumnType(SQuery *pQuery, SSqlGroupbyExpr *pGroupbyExpr) {
  assert(pGroupbyExpr != NULL);

  int32_t colId = -2;  // sentinel used when the clause has no normal column
  int16_t type = TSDB_DATA_TYPE_NULL;

  for (int32_t i = 0; i < pGroupbyExpr->numOfGroupCols; ++i) {
    SColIndex *pColIndex = taosArrayGet(pGroupbyExpr->columnInfo, i);
    if (pColIndex->flag == TSDB_COL_NORMAL) {
      colId = pColIndex->colId;
      break;
    }
  }

  for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
    if (colId == pQuery->colList[i].colId) {
      type = pQuery->colList[i].type;
      break;
    }
  }

  return type;
}

/**
 * Check whether the query combines a selectivity function with tag output.
 *
 * @return true when at least one output is TSDB_FUNC_TAG_DUMMY / TSDB_FUNC_TS_DUMMY
 *         and at least one other output function carries TSDB_FUNCSTATE_SELECTIVITY
 */
bool isSelectivityWithTagsQuery(SQuery *pQuery) {
  bool    hasTags = false;
  int32_t numOfSelectivity = 0;

  for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    int32_t functId = pQuery->pSelectExpr[i].base.functionId;
    if (functId == TSDB_FUNC_TAG_DUMMY || functId == TSDB_FUNC_TS_DUMMY) {
      hasTags = true;
      continue;
    }

    if ((aAggs[functId].nStatus & TSDB_FUNCSTATE_SELECTIVITY) != 0) {
      numOfSelectivity++;
    }
  }

  return numOfSelectivity > 0 && hasTags;
}

295 296 297 298 299 300 301 302 303 304 305
/**
 * A projection query is one in which every output column is produced by
 * TSDB_FUNC_PRJ or TSDB_FUNC_TAGPRJ. A query without output columns counts as one.
 */
bool isProjQuery(SQuery *pQuery) {
  int32_t idx = 0;

  while (idx < pQuery->numOfOutput) {
    int32_t fid = pQuery->pSelectExpr[idx].base.functionId;
    bool    isProjection = (fid == TSDB_FUNC_PRJ) || (fid == TSDB_FUNC_TAGPRJ);

    if (!isProjection) {
      return false;
    }

    idx += 1;
  }

  return true;
}

306
/* A ts-comp query is identified by TSDB_FUNC_TS_COMP in the first output column. */
bool isTSCompQuery(SQuery *pQuery) {
  int32_t firstFuncId = pQuery->pSelectExpr[0].base.functionId;
  return firstFuncId == TSDB_FUNC_TS_COMP;
}
307

308 309 310 311
/**
 * Enforce the LIMIT clause on the rows produced in the current round.
 *
 * When a limit is set and total + rows would exceed it, rec.rows is cut down
 * to the remaining quota and the query is marked QUERY_COMPLETED.
 *
 * @return true when the rows were truncated, false when no truncation was needed
 */
static bool limitResults(SQueryRuntimeEnv* pRuntimeEnv) {
  SQInfo* pQInfo = GET_QINFO_ADDR(pRuntimeEnv);
  SQuery* pQuery = pRuntimeEnv->pQuery;

  if ((pQuery->limit.limit > 0) && (pQuery->rec.total + pQuery->rec.rows > pQuery->limit.limit)) {
    pQuery->rec.rows = pQuery->limit.limit - pQuery->rec.total;

    qDebug("QInfo:%p discard remain data due to result limitation, limit:%"PRId64", current return:%" PRId64 ", total:%"PRId64,
        pQInfo, pQuery->limit.limit, pQuery->rec.rows, pQuery->rec.total + pQuery->rec.rows);
    assert(pQuery->rec.rows >= 0);
    setQueryStatus(pQuery, QUERY_COMPLETED);
    return true;
  }

  return false;
}

/**
 * Check whether any output column of the query is produced by
 * TSDB_FUNC_TOP or TSDB_FUNC_BOTTOM.
 */
static bool isTopBottomQuery(SQuery *pQuery) {
  for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    int32_t functionId = pQuery->pSelectExpr[i].base.functionId;
    if (functionId == TSDB_FUNC_TS) {
      continue;
    }

    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM) {
      return true;
    }
  }

  return false;
}

H
Haojun Liao 已提交
340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357
/**
 * Tell whether the query output needs tag values: either the single output is
 * a ts_comp column, or some output column refers to a tag column.
 */
static bool hasTagValOutput(SQuery* pQuery) {
  // ts_comp column required the tag value for join filter
  if (pQuery->numOfOutput == 1 && pQuery->pSelectExpr[0].base.functionId == TSDB_FUNC_TS_COMP) {
    return true;
  }

  // set tag value, by which the results are aggregated.
  int32_t col = 0;
  while (col < pQuery->numOfOutput) {
    if (TSDB_COL_IS_TAG(pQuery->pSelectExpr[col].base.colInfo.flag)) {
      return true;
    }

    col += 1;
  }

  return false;
}

358 359 360 361 362 363 364 365
/**
 * @param pQuery
 * @param col
 * @param pDataBlockInfo
 * @param pStatis
 * @param pColStatis
 * @return
 */
H
Haojun Liao 已提交
366
static bool hasNullValue(SColIndex* pColIndex, SDataStatis *pStatis, SDataStatis **pColStatis) {
H
Haojun Liao 已提交
367
  if (pStatis != NULL && !TSDB_COL_IS_TAG(pColIndex->flag)) {
H
Haojun Liao 已提交
368 369
    *pColStatis = &pStatis[pColIndex->colIndex];
    assert((*pColStatis)->colId == pColIndex->colId);
H
hjxilinx 已提交
370 371
  } else {
    *pColStatis = NULL;
372
  }
373

H
Haojun Liao 已提交
374 375 376 377
  if (TSDB_COL_IS_TAG(pColIndex->flag) || pColIndex->colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) {
    return false;
  }

378 379 380
  if ((*pColStatis) != NULL && (*pColStatis)->numOfNull == 0) {
    return false;
  }
381

382 383 384 385
  return true;
}

/**
 * Find the window result registered for the given key, creating a new slot
 * when the key is unknown and this is the master scan.
 *
 * @param pRuntimeEnv     runtime environment (used for the longjmp target and buffer sizes)
 * @param pWindowResInfo  window result container; curIndex is updated to the selected slot
 * @param pData           key bytes
 * @param bytes           key length
 * @param masterscan      when false, unknown keys do not create new windows
 * @return the window result for the key, or NULL when the key is unknown and
 *         masterscan is false. On allocation failure this function does not
 *         return: it longjmps with TSDB_CODE_QRY_OUT_OF_MEMORY.
 */
static SWindowResult *doSetTimeWindowFromKey(SQueryRuntimeEnv *pRuntimeEnv, SWindowResInfo *pWindowResInfo, char *pData,
                                             int16_t bytes, bool masterscan) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  int32_t *p1 = (int32_t *) taosHashGet(pWindowResInfo->hashList, pData, bytes);
  if (p1 != NULL) {
    pWindowResInfo->curIndex = *p1;
  } else {
    if (!masterscan) {  // not master scan, do not add new timewindow
      return NULL;
    }

    // more than the capacity, reallocate the resources
    if (pWindowResInfo->size >= pWindowResInfo->capacity) {
      int64_t newCap = (int64_t)(pWindowResInfo->capacity * 1.5);

      // capacity * 1.5 truncates to the same value for a capacity of 0 or 1;
      // always grow by at least one slot so the new entry below fits
      if (newCap <= pWindowResInfo->capacity) {
        newCap = (int64_t)pWindowResInfo->capacity + 1;
      }

      // keep the old pointer untouched until realloc is known to have succeeded
      SWindowResult *t = realloc(pWindowResInfo->pResult, newCap * sizeof(SWindowResult));
      if (t == NULL) {
        longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
      }

      pWindowResInfo->pResult = t;

      // zero the newly added slots before initializing them
      int32_t inc = (int32_t)(newCap - pWindowResInfo->capacity);
      memset(&pWindowResInfo->pResult[pWindowResInfo->capacity], 0, sizeof(SWindowResult) * inc);

      for (int32_t i = pWindowResInfo->capacity; i < newCap; ++i) {
        createQueryResultInfo(pQuery, &pWindowResInfo->pResult[i], pRuntimeEnv->stableQuery, pRuntimeEnv->interBufSize);
      }

      pWindowResInfo->capacity = newCap;
    }

    // add a new result set for a new group
    pWindowResInfo->curIndex = pWindowResInfo->size++;
    taosHashPut(pWindowResInfo->hashList, pData, bytes, (char *)&pWindowResInfo->curIndex, sizeof(int32_t));
  }

  return getWindowResult(pWindowResInfo, pWindowResInfo->curIndex);
}

/**
 * Get the time window that covers the timestamp ts.
 *
 * Starts from the current window (or from prevSKey when no window is active
 * yet) and, if ts lies outside of it, shifts the window start by whole
 * multiples of slidingTime until the window contains ts.
 *
 * @param pWindowResInfo  window result container providing curIndex / prevSKey
 * @param ts              timestamp being handled
 * @param pQuery          query providing intervalTime, slidingTime, window and order
 * @return the covering window; for ascending queries ekey is capped at pQuery->window.ekey
 */
static STimeWindow getActiveTimeWindow(SWindowResInfo *pWindowResInfo, int64_t ts, SQuery *pQuery) {
  STimeWindow w = {0};

  if (pWindowResInfo->curIndex == -1) {  // the first window, from the previous stored value
    w.skey = pWindowResInfo->prevSKey;
    w.ekey = w.skey + pQuery->intervalTime - 1;
  } else {
    int32_t slot = curTimeWindow(pWindowResInfo);
    w = getWindowResult(pWindowResInfo, slot)->window;
  }

  if (w.skey > ts || w.ekey < ts) {
    int64_t st = w.skey;

    // window starts after ts: move the start backwards in sliding steps (rounded up)
    if (st > ts) {
      st -= ((st - ts + pQuery->slidingTime - 1) / pQuery->slidingTime) * pQuery->slidingTime;
    }

    // window ends before ts: move the start forwards in sliding steps (rounded up)
    int64_t et = st + pQuery->intervalTime - 1;
    if (et < ts) {
      st += ((ts - et + pQuery->slidingTime - 1) / pQuery->slidingTime) * pQuery->slidingTime;
    }

    w.skey = st;
    w.ekey = w.skey + pQuery->intervalTime - 1;
  }

  /*
   * query border check, skey should not be bounded by the query time range, since the value skey will
   * be used as the time window index value. So we only change ekey of time window accordingly.
   */
  if (w.ekey > pQuery->window.ekey && QUERY_IS_ASC_QUERY(pQuery)) {
    w.ekey = pQuery->window.ekey;
  }

  assert(ts >= w.skey && ts <= w.ekey);

  return w;
}

/**
 * Assign a (page, row) position in the disk-based result buffer to a window
 * result that does not have one yet.
 *
 * The last page of the group `sid` is reused while it holds fewer than
 * numOfRowsPerPage rows; otherwise a new page is requested.
 *
 * @return 0 on success or when the window already has a position,
 *         -1 when no page could be obtained
 */
static int32_t addNewWindowResultBuf(SWindowResult *pWindowRes, SDiskbasedResultBuf *pResultBuf, int32_t sid,
                                     int32_t numOfRowsPerPage) {
  if (pWindowRes->pos.pageId != -1) {
    return 0;
  }

  tFilePage *pData = NULL;

  // in the first scan, new space needed for results
  int32_t pageId = -1;
  SIDList list = getDataBufPagesIdList(pResultBuf, sid);

  if (taosArrayGetSize(list) == 0) {
    pData = getNewDataBuf(pResultBuf, sid, &pageId);
  } else {
    pageId = getLastPageId(list);
    pData = getResBufPage(pResultBuf, pageId);

    // the last page is full, move on to a fresh one
    if (pData->num >= numOfRowsPerPage) {
      pData = getNewDataBuf(pResultBuf, sid, &pageId);
      if (pData != NULL) {
        assert(pData->num == 0);  // number of elements must be 0 for new allocated buffer
      }
    }
  }

  if (pData == NULL) {
    return -1;
  }

  // set the number of rows in current disk page
  if (pWindowRes->pos.pageId == -1) {  // not allocated yet, allocate new buffer
    pWindowRes->pos.pageId = pageId;
    pWindowRes->pos.rowId = pData->num++;
  }

  return 0;
}

static int32_t setWindowOutputBufByKey(SQueryRuntimeEnv *pRuntimeEnv, SWindowResInfo *pWindowResInfo, int32_t sid,
506
                                       STimeWindow *win, bool masterscan, bool* newWind) {
507 508
  assert(win->skey <= win->ekey);
  SDiskbasedResultBuf *pResultBuf = pRuntimeEnv->pResultBuf;
509

510 511
  SWindowResult *pWindowRes = doSetTimeWindowFromKey(pRuntimeEnv, pWindowResInfo, (char *)&win->skey,
      TSDB_KEYSIZE, masterscan);
512
  if (pWindowRes == NULL) {
513 514 515
    *newWind = false;

    return masterscan? -1:0;
516
  }
517

518
  *newWind = true;
H
Haojun Liao 已提交
519

520 521 522
  // not assign result buffer yet, add new result buffer
  if (pWindowRes->pos.pageId == -1) {
    int32_t ret = addNewWindowResultBuf(pWindowRes, pResultBuf, sid, pRuntimeEnv->numOfRowsPerPage);
H
Haojun Liao 已提交
523
    if (ret != TSDB_CODE_SUCCESS) {
524 525 526
      return -1;
    }
  }
527

528 529
  // set time window for current result
  pWindowRes->window = *win;
530

H
Haojun Liao 已提交
531
  setWindowResOutputBufInitCtx(pRuntimeEnv, pWindowRes);
532 533 534 535 536 537 538 539
  return TSDB_CODE_SUCCESS;
}

/* Return the status record of the window result stored in `slot`; the slot must be in [0, size). */
static SWindowStatus *getTimeWindowResStatus(SWindowResInfo *pWindowResInfo, int32_t slot) {
  assert(slot >= 0 && slot < pWindowResInfo->size);

  SWindowResult *pEntry = pWindowResInfo->pResult + slot;
  return &pEntry->status;
}

H
Haojun Liao 已提交
540
/**
 * Count the rows of a block, starting at pos and moving in scan order, that
 * belong to the window ending at ekey.
 *
 * @param numOfRows  number of rows in the block
 * @param searchFn   search routine applied to the timestamp column
 *                   (presumably returns the index of the closest key, negative
 *                   when nothing qualifies — confirm against its implementation)
 * @param ekey       end key of the window
 * @param pos        starting row position
 * @param order      TSDB_ORDER_ASC or descending
 * @param pData      timestamp column of the block
 * @return number of rows to step over; asserted to be positive
 */
static FORCE_INLINE int32_t getForwardStepsInBlock(int32_t numOfRows, __block_search_fn_t searchFn, TSKEY ekey, int16_t pos,
                                      int16_t order, int64_t *pData) {
  int32_t forwardStep = 0;

  if (order == TSDB_ORDER_ASC) {
    // search only the rows from pos onwards; `end` is relative to pos
    int32_t end = searchFn((char*) &pData[pos], numOfRows - pos, ekey, order);
    if (end >= 0) {
      forwardStep = end;

      // the row matching ekey exactly is part of the window
      if (pData[end + pos] == ekey) {
        forwardStep += 1;
      }
    }
  } else {
    // search the rows [0, pos]; `end` is an absolute row index
    int32_t end = searchFn((char *)pData, pos + 1, ekey, order);
    if (end >= 0) {
      forwardStep = pos - end;

      if (pData[end] == ekey) {
        forwardStep += 1;
      }
    }
  }

  assert(forwardStep > 0);
  return forwardStep;
}

/**
 * NOTE: the query status only set for the first scan of master scan.
 */
571
static int32_t doCheckQueryCompleted(SQueryRuntimeEnv *pRuntimeEnv, TSKEY lastKey, SWindowResInfo *pWindowResInfo) {
572
  SQuery *pQuery = pRuntimeEnv->pQuery;
H
Haojun Liao 已提交
573
  if (pRuntimeEnv->scanFlag != MASTER_SCAN || (!QUERY_IS_INTERVAL_QUERY(pQuery))) {
574
    return pWindowResInfo->size;
575
  }
576

577
  // no qualified results exist, abort check
578 579
  int32_t numOfClosed = 0;
  
580
  if (pWindowResInfo->size == 0) {
581
    return pWindowResInfo->size;
582
  }
583

584
  // query completed
H
hjxilinx 已提交
585 586
  if ((lastKey >= pQuery->current->win.ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
      (lastKey <= pQuery->current->win.ekey && !QUERY_IS_ASC_QUERY(pQuery))) {
587
    closeAllTimeWindow(pWindowResInfo);
588

589 590 591 592
    pWindowResInfo->curIndex = pWindowResInfo->size - 1;
    setQueryStatus(pQuery, QUERY_COMPLETED | QUERY_RESBUF_FULL);
  } else {  // set the current index to be the last unclosed window
    int32_t i = 0;
593
    int64_t skey = TSKEY_INITIAL_VAL;
594

595 596 597
    for (i = 0; i < pWindowResInfo->size; ++i) {
      SWindowResult *pResult = &pWindowResInfo->pResult[i];
      if (pResult->status.closed) {
598
        numOfClosed += 1;
599 600
        continue;
      }
601

602 603 604 605 606 607 608 609
      if ((pResult->window.ekey <= lastKey && QUERY_IS_ASC_QUERY(pQuery)) ||
          (pResult->window.skey >= lastKey && !QUERY_IS_ASC_QUERY(pQuery))) {
        closeTimeWindow(pWindowResInfo, i);
      } else {
        skey = pResult->window.skey;
        break;
      }
    }
610

611
    // all windows are closed, set the last one to be the skey
612
    if (skey == TSKEY_INITIAL_VAL) {
613 614 615 616 617
      assert(i == pWindowResInfo->size);
      pWindowResInfo->curIndex = pWindowResInfo->size - 1;
    } else {
      pWindowResInfo->curIndex = i;
    }
618

619
    pWindowResInfo->prevSKey = pWindowResInfo->pResult[pWindowResInfo->curIndex].window.skey;
620

621 622
    // the number of completed slots are larger than the threshold, return current generated results to client.
    if (numOfClosed > pWindowResInfo->threshold) {
623
      qDebug("QInfo:%p total result window:%d closed:%d, reached the output threshold %d, return",
624 625
          GET_QINFO_ADDR(pRuntimeEnv), pWindowResInfo->size, numOfClosed, pQuery->rec.threshold);
      
626
      setQueryStatus(pQuery, QUERY_RESBUF_FULL);
627
    } else {
628
      qDebug("QInfo:%p total result window:%d already closed:%d", GET_QINFO_ADDR(pRuntimeEnv), pWindowResInfo->size,
629
             numOfClosed);
630 631
    }
  }
632 633 634 635 636 637 638
  
  // output has reached the limitation, set query completed
  if (pQuery->limit.limit > 0 && (pQuery->limit.limit + pQuery->limit.offset) <= numOfClosed &&
      pRuntimeEnv->scanFlag == MASTER_SCAN) {
    setQueryStatus(pQuery, QUERY_COMPLETED);
  }
  
639
  assert(pWindowResInfo->prevSKey != TSKEY_INITIAL_VAL);
640
  return numOfClosed;
641 642 643
}

/**
 * Compute how many rows of the block, starting at startPos in scan order,
 * fall into the window that ends at ekey.
 *
 * When the window ends inside the block the count comes from
 * getForwardStepsInBlock; otherwise all remaining rows of the block are taken.
 *
 * @param pQuery          query (provides order and the current table info)
 * @param pDataBlockInfo  block description (rows and time range)
 * @param pPrimaryColumn  timestamp column of the block
 * @param startPos        first row to consider, must be in [0, rows)
 * @param ekey            end key of the window
 * @param searchFn        search routine for the timestamp column
 * @param updateLastKey   when true, pQuery->current->lastKey is moved one step
 *                        past the last row consumed (or past the block border)
 * @return number of rows in the window; asserted to be positive
 */
static int32_t getNumOfRowsInTimeWindow(SQuery *pQuery, SDataBlockInfo *pDataBlockInfo, TSKEY *pPrimaryColumn,
                                        int32_t startPos, TSKEY ekey, __block_search_fn_t searchFn, bool updateLastKey) {
  assert(startPos >= 0 && startPos < pDataBlockInfo->rows);

  int32_t num = -1;
  int32_t order = pQuery->order.order;
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(order);

  STableQueryInfo* item = pQuery->current;

  if (QUERY_IS_ASC_QUERY(pQuery)) {
    if (ekey < pDataBlockInfo->window.ekey) {
      num = getForwardStepsInBlock(pDataBlockInfo->rows, searchFn, ekey, startPos, order, pPrimaryColumn);
      if (updateLastKey) { // update the last key
        item->lastKey = pPrimaryColumn[startPos + (num - 1)] + step;
      }
    } else {
      // the window reaches the end of the block: take every remaining row
      num = pDataBlockInfo->rows - startPos;
      if (updateLastKey) {
        item->lastKey = pDataBlockInfo->window.ekey + step;
      }
    }
  } else {  // desc
    if (ekey > pDataBlockInfo->window.skey) {
      num = getForwardStepsInBlock(pDataBlockInfo->rows, searchFn, ekey, startPos, order, pPrimaryColumn);
      if (updateLastKey) {  // update the last key
        item->lastKey = pPrimaryColumn[startPos - (num - 1)] + step;
      }
    } else {
      // the window reaches the start of the block: take rows [0, startPos]
      num = startPos + 1;
      if (updateLastKey) {
        item->lastKey = pDataBlockInfo->window.skey + step;
      }
    }
  }

  assert(num > 0);
  return num;
}

/**
 * Run the block-wise implementation of every output function over a range of
 * rows belonging to one time window.
 *
 * Nothing is executed unless this is the master scan or the window is closed.
 *
 * @param pRuntimeEnv  runtime environment holding the function contexts
 * @param pStatus      status of the target window
 * @param pWin         the time window; its skey becomes nStartQueryTimestamp
 * @param offset       row position where the range starts in scan order
 * @param forwardStep  number of rows in the range
 * @param tsBuf        timestamp column of the block
 * @param numOfTotal   total number of rows in the block
 */
static void doBlockwiseApplyFunctions(SQueryRuntimeEnv *pRuntimeEnv, SWindowStatus *pStatus, STimeWindow *pWin,
                                      int32_t offset, int32_t forwardStep, TSKEY *tsBuf, int32_t numOfTotal) {
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;

  if (IS_MASTER_SCAN(pRuntimeEnv) || pStatus->closed) {
    for (int32_t k = 0; k < pQuery->numOfOutput; ++k) {
      int32_t functionId = pQuery->pSelectExpr[k].base.functionId;

      pCtx[k].nStartQueryTimestamp = pWin->skey;
      pCtx[k].size = forwardStep;
      // startOffset is always the lowest row index of the range, whatever the scan order
      pCtx[k].startOffset = (QUERY_IS_ASC_QUERY(pQuery)) ? offset : offset - (forwardStep - 1);

      if ((aAggs[functionId].nStatus & TSDB_FUNCSTATE_SELECTIVITY) != 0) {
        pCtx[k].ptsList = &tsBuf[offset];
      }

      // not a whole block involved in query processing, statistics data can not be used
      if (forwardStep != numOfTotal) {
        pCtx[k].preAggVals.isSet = false;
      }

      if (functionNeedToExecute(pRuntimeEnv, &pCtx[k], functionId)) {
        aAggs[functionId].xFunction(&pCtx[k]);
      }
    }
  }
}

/**
 * Run the row-wise implementation (xFunctionF) of every output function on
 * the single row at `offset`.
 *
 * Nothing is executed unless this is the master scan or the window is closed.
 *
 * @param pRuntimeEnv  runtime environment holding the function contexts
 * @param pStatus      status of the target window
 * @param pWin         the time window; its skey becomes nStartQueryTimestamp
 * @param offset       row position handed to each function
 */
static void doRowwiseApplyFunctions(SQueryRuntimeEnv *pRuntimeEnv, SWindowStatus *pStatus, STimeWindow *pWin,
                                    int32_t offset) {
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;

  if (IS_MASTER_SCAN(pRuntimeEnv) || pStatus->closed) {
    for (int32_t k = 0; k < pQuery->numOfOutput; ++k) {
      pCtx[k].nStartQueryTimestamp = pWin->skey;

      int32_t functionId = pQuery->pSelectExpr[k].base.functionId;
      if (functionNeedToExecute(pRuntimeEnv, &pCtx[k], functionId)) {
        aAggs[functionId].xFunctionF(&pCtx[k], offset);
      }
    }
  }
}

H
Haojun Liao 已提交
729 730
/**
 * Advance pNext to the next time window and find the row in the current block
 * where that window starts.
 *
 * If the first candidate row lies beyond the advanced window, the window is
 * moved further in sliding steps until it reaches that row.
 *
 * @param pRuntimeEnv     runtime environment
 * @param pNext           [in,out] current window on entry, next qualified window on return
 * @param pDataBlockInfo  block description (rows and time range)
 * @param primaryKeys     timestamp column of the block
 * @param searchFn        search routine for the timestamp column
 * @param prevPosition    last row position of the previous window, or -1 when unknown
 * @return start row position of the next window, or -1 when the next window
 *         does not overlap the current block
 */
static int32_t getNextQualifiedWindow(SQueryRuntimeEnv *pRuntimeEnv, STimeWindow *pNext, SDataBlockInfo *pDataBlockInfo,
    TSKEY *primaryKeys, __block_search_fn_t searchFn, int32_t prevPosition) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  GET_NEXT_TIMEWINDOW(pQuery, pNext);

  // next time window is not in current block
  if ((pNext->skey > pDataBlockInfo->window.ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
      (pNext->ekey < pDataBlockInfo->window.skey && !QUERY_IS_ASC_QUERY(pQuery))) {
    return -1;
  }

  // the key to search for, clamped to the start of the query range
  TSKEY startKey = -1;
  if (QUERY_IS_ASC_QUERY(pQuery)) {
    startKey = pNext->skey;
    if (startKey < pQuery->window.skey) {
      startKey = pQuery->window.skey;
    }
  } else {
    startKey = pNext->ekey;
    if (startKey > pQuery->window.skey) {
      startKey = pQuery->window.skey;
    }
  }

  int32_t startPos = 0;
  // tumbling time window query, a special case of sliding time window query
  if (pQuery->slidingTime == pQuery->intervalTime && prevPosition != -1) {
    // windows do not overlap: the next window starts right after the previous row
    int32_t factor = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
    startPos = prevPosition + factor;
  } else {
    startPos = searchFn((char *)primaryKeys, pDataBlockInfo->rows, startKey, pQuery->order.order);
  }

  /*
   * This time window does not cover any data, try next time window,
   * this case may happen when the time window is too small
   */
  if (QUERY_IS_ASC_QUERY(pQuery) && primaryKeys[startPos] > pNext->ekey) {
    TSKEY next = primaryKeys[startPos];

    pNext->ekey += ((next - pNext->ekey + pQuery->slidingTime - 1)/pQuery->slidingTime) * pQuery->slidingTime;
    pNext->skey = pNext->ekey - pQuery->intervalTime + 1;
  } else if ((!QUERY_IS_ASC_QUERY(pQuery)) && primaryKeys[startPos] < pNext->skey) {
    TSKEY next = primaryKeys[startPos];

    pNext->skey -= ((pNext->skey - next + pQuery->slidingTime - 1) / pQuery->slidingTime) * pQuery->slidingTime;
    pNext->ekey = pNext->skey + pQuery->intervalTime - 1;
  }

  return startPos;
}

H
Haojun Liao 已提交
782
/**
 * Return the end key of a window in scan direction, clamped to the end of the
 * query range: min(pWindow->ekey, window.ekey) for ascending queries,
 * max(pWindow->skey, window.ekey) for descending ones.
 */
static FORCE_INLINE TSKEY reviseWindowEkey(SQuery *pQuery, STimeWindow *pWindow) {
  TSKEY ekey = -1;
  if (QUERY_IS_ASC_QUERY(pQuery)) {
    ekey = pWindow->ekey;
    if (ekey > pQuery->window.ekey) {
      ekey = pQuery->window.ekey;
    }
  } else {
    ekey = pWindow->skey;
    if (ekey < pQuery->window.ekey) {
      ekey = pQuery->window.ekey;
    }
  }

  return ekey;
}

H
hjxilinx 已提交
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813
/*
 * Return the data buffer of the first column in pDataBlock whose column id
 * equals colId, or NULL when no column matches.
 * todo: replace the linear scan with a binary search
 */
static void* getDataBlockImpl(SArray* pDataBlock, int32_t colId) {
  const int32_t total = taosArrayGetSize(pDataBlock);
  int32_t       idx = 0;

  while (idx < total) {
    SColumnInfoData *pCol = taosArrayGet(pDataBlock, idx);
    if (pCol->info.colId == colId) {
      return pCol->pData;
    }

    idx += 1;
  }

  return NULL;
}

static char *getDataBlock(SQueryRuntimeEnv *pRuntimeEnv, SArithmeticSupport *sas, int32_t col, int32_t size,
814
                    SArray *pDataBlock) {
dengyihao's avatar
dengyihao 已提交
815 816 817
  if (pDataBlock == NULL) {
    return NULL;
  }
818

H
Haojun Liao 已提交
819
  char *dataBlock = NULL;
H
Haojun Liao 已提交
820
  SQuery *pQuery = pRuntimeEnv->pQuery;
821
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;
822

823
  int32_t functionId = pQuery->pSelectExpr[col].base.functionId;
824
  if (functionId == TSDB_FUNC_ARITHM) {
825
    sas->pArithExpr = &pQuery->pSelectExpr[col];
826

827 828 829 830 831 832
    // set the start offset to be the lowest start position, no matter asc/desc query order
    if (QUERY_IS_ASC_QUERY(pQuery)) {
      pCtx->startOffset = pQuery->pos;
    } else {
      pCtx->startOffset = pQuery->pos - (size - 1);
    }
833

834 835 836 837
    sas->offset  = 0;
    sas->colList = pQuery->colList;
    sas->numOfCols = pQuery->numOfCols;
    sas->data    = calloc(pQuery->numOfCols, POINTER_BYTES);
838

H
Haojun Liao 已提交
839
    if (sas->data == NULL) {
H
Haojun Liao 已提交
840
      finalizeQueryResult(pRuntimeEnv); // clean up allocated resource during query
H
Haojun Liao 已提交
841 842 843
      longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
    }

844
    // here the pQuery->colList and sas->colList are identical
H
Haojun Liao 已提交
845
    int32_t numOfCols = taosArrayGetSize(pDataBlock);
846
    for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
847
      SColumnInfo *pColMsg = &pQuery->colList[i];
848

849 850 851 852 853 854 855 856
      dataBlock = NULL;
      for (int32_t k = 0; k < numOfCols; ++k) {  //todo refactor
        SColumnInfoData *p = taosArrayGet(pDataBlock, k);
        if (pColMsg->colId == p->info.colId) {
          dataBlock = p->pData;
          break;
        }
      }
857

858
      assert(dataBlock != NULL);
H
Haojun Liao 已提交
859
      sas->data[i] = dataBlock/* + pQuery->colList[i].bytes*/;  // start from the offset
860
    }
861

862
  } else {  // other type of query function
863
    SColIndex *pCol = &pQuery->pSelectExpr[col].base.colInfo;
H
Haojun Liao 已提交
864
    if (TSDB_COL_IS_TAG(pCol->flag)) {
865 866
      dataBlock = NULL;
    } else {
H
Haojun Liao 已提交
867 868 869 870 871
      SColIndex* pColIndex = &pQuery->pSelectExpr[col].base.colInfo;
      SColumnInfoData *p = taosArrayGet(pDataBlock, pColIndex->colIndex);
      assert(p->info.colId == pColIndex->colId);

      dataBlock = p->pData;
872 873
    }
  }
874

875 876 877 878
  return dataBlock;
}

/**
H
Haojun Liao 已提交
879
 * todo set the last value for pQueryTableInfo as in rowwiseapplyfunctions
880 881
 * @param pRuntimeEnv
 * @param forwardStep
882
 * @param tsCols
883 884 885 886 887
 * @param pFields
 * @param isDiskFileBlock
 * @return                  the incremental number of output value, so it maybe 0 for fixed number of query,
 *                          such as count/min/max etc.
 */
888
static void blockwiseApplyFunctions(SQueryRuntimeEnv *pRuntimeEnv, SDataStatis *pStatis,
889 890
                                       SDataBlockInfo *pDataBlockInfo, SWindowResInfo *pWindowResInfo,
                                       __block_search_fn_t searchFn, SArray *pDataBlock) {
891
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;
892 893
  bool masterScan = IS_MASTER_SCAN(pRuntimeEnv);

894 895
  SQuery *pQuery = pRuntimeEnv->pQuery;
  TSKEY  *tsCols = NULL;
896
  if (pDataBlock != NULL) {
897
    SColumnInfoData* pColInfo = taosArrayGet(pDataBlock, 0);
898
    tsCols = (TSKEY *)(pColInfo->pData);
899
  }
900

901
  SArithmeticSupport *sasArray = calloc((size_t)pQuery->numOfOutput, sizeof(SArithmeticSupport));
H
Haojun Liao 已提交
902
  if (sasArray == NULL) {
H
Haojun Liao 已提交
903
    finalizeQueryResult(pRuntimeEnv); // clean up allocated resource during query
H
Haojun Liao 已提交
904 905
    longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
  }
906

907
  for (int32_t k = 0; k < pQuery->numOfOutput; ++k) {
H
hjxilinx 已提交
908
    char *dataBlock = getDataBlock(pRuntimeEnv, &sasArray[k], k, pDataBlockInfo->rows, pDataBlock);
909
    setExecParams(pQuery, &pCtx[k], dataBlock, tsCols, pDataBlockInfo, pStatis, &sasArray[k], k);
910
  }
911

912
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
H
Haojun Liao 已提交
913 914
  if (QUERY_IS_INTERVAL_QUERY(pQuery)/* && tsCols != NULL*/) {
    TSKEY ts = TSKEY_INITIAL_VAL;
915

H
Haojun Liao 已提交
916 917 918 919 920 921 922 923
    if (tsCols == NULL) {
      ts = QUERY_IS_ASC_QUERY(pQuery)? pDataBlockInfo->window.skey:pDataBlockInfo->window.ekey;
    } else {
      int32_t offset = GET_COL_DATA_POS(pQuery, 0, step);
      ts = tsCols[offset];
    }

    bool        hasTimeWindow = false;
924
    STimeWindow win = getActiveTimeWindow(pWindowResInfo, ts, pQuery);
H
Haojun Liao 已提交
925 926
    if (setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, pDataBlockInfo->tid, &win, masterScan, &hasTimeWindow) !=
        TSDB_CODE_SUCCESS) {
dengyihao's avatar
dengyihao 已提交
927
      tfree(sasArray);
H
hjxilinx 已提交
928
      return;
929
    }
930

H
Haojun Liao 已提交
931 932 933
    int32_t forwardStep = 0;
    int32_t startPos = pQuery->pos;

934
    if (hasTimeWindow) {
H
Haojun Liao 已提交
935
      TSKEY ekey = reviseWindowEkey(pQuery, &win);
H
Haojun Liao 已提交
936
      forwardStep = getNumOfRowsInTimeWindow(pQuery, pDataBlockInfo, tsCols, pQuery->pos, ekey, searchFn, true);
937

938
      SWindowStatus *pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
H
Haojun Liao 已提交
939
      doBlockwiseApplyFunctions(pRuntimeEnv, pStatus, &win, startPos, forwardStep, tsCols, pDataBlockInfo->rows);
940
    }
941

942 943
    int32_t     index = pWindowResInfo->curIndex;
    STimeWindow nextWin = win;
944

945
    while (1) {
H
Haojun Liao 已提交
946 947
      int32_t prevEndPos = (forwardStep - 1) * step + startPos;
      startPos = getNextQualifiedWindow(pRuntimeEnv, &nextWin, pDataBlockInfo, tsCols, searchFn, prevEndPos);
948 949 950
      if (startPos < 0) {
        break;
      }
951

952
      // null data, failed to allocate more memory buffer
H
Haojun Liao 已提交
953
      hasTimeWindow = false;
H
Haojun Liao 已提交
954 955
      if (setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, pDataBlockInfo->tid, &nextWin, masterScan,
                                  &hasTimeWindow) != TSDB_CODE_SUCCESS) {
956 957
        break;
      }
958

959 960 961 962 963
      if (!hasTimeWindow) {
        continue;
      }

      TSKEY ekey = reviseWindowEkey(pQuery, &nextWin);
H
Haojun Liao 已提交
964
      forwardStep = getNumOfRowsInTimeWindow(pQuery, pDataBlockInfo, tsCols, startPos, ekey, searchFn, true);
965

H
Haojun Liao 已提交
966
      SWindowStatus *pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
967
      doBlockwiseApplyFunctions(pRuntimeEnv, pStatus, &nextWin, startPos, forwardStep, tsCols, pDataBlockInfo->rows);
968
    }
969

970 971 972 973 974 975 976
    pWindowResInfo->curIndex = index;
  } else {
    /*
     * the sqlfunctionCtx parameters should be set done before all functions are invoked,
     * since the selectivity + tag_prj query needs all parameters been set done.
     * tag_prj function are changed to be TSDB_FUNC_TAG_DUMMY
     */
977
    for (int32_t k = 0; k < pQuery->numOfOutput; ++k) {
978
      int32_t functionId = pQuery->pSelectExpr[k].base.functionId;
979 980 981 982 983
      if (functionNeedToExecute(pRuntimeEnv, &pCtx[k], functionId)) {
        aAggs[functionId].xFunction(&pCtx[k]);
      }
    }
  }
984

985 986 987 988
  for(int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    if (pQuery->pSelectExpr[i].base.functionId != TSDB_FUNC_ARITHM) {
      continue;
    }
989

990 991
    tfree(sasArray[i].data);
  }
992

993 994 995 996 997 998 999
  tfree(sasArray);
}

/*
 * Select the output buffer of the group identified by the group-by column value in pData.
 *
 * Returns TSDB_CODE_SUCCESS when the output buffer is ready, -1 when the value is NULL,
 * when no window result could be obtained for it, or when a new result buffer could not
 * be added.
 */
static int32_t setGroupResultOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, char *pData, int16_t type, int16_t bytes) {
  // rows with a null group-by value do not belong to any group
  if (isNull(pData, type)) {
    return -1;
  }

  int32_t GROUPRESULTID = 1;

  SDiskbasedResultBuf *pResultBuf = pRuntimeEnv->pResultBuf;

  // integer form of the group value; stays -1 for every other data type
  int64_t groupVal = -1;
  if (type == TSDB_DATA_TYPE_BOOL || type == TSDB_DATA_TYPE_TINYINT) {
    groupVal = GET_INT8_VAL(pData);
  } else if (type == TSDB_DATA_TYPE_SMALLINT) {
    groupVal = GET_INT16_VAL(pData);
  } else if (type == TSDB_DATA_TYPE_INT) {
    groupVal = GET_INT32_VAL(pData);
  } else if (type == TSDB_DATA_TYPE_BIGINT) {
    groupVal = GET_INT64_VAL(pData);
  }

  SWindowResult *pGroupRes = doSetTimeWindowFromKey(pRuntimeEnv, &pRuntimeEnv->windowResInfo, pData, bytes, true);
  if (pGroupRes == NULL) {
    return -1;
  }

  pGroupRes->window.skey = groupVal;
  pGroupRes->window.ekey = groupVal;

  // no result page assigned yet, add a new result buffer
  if (pGroupRes->pos.pageId == -1 &&
      addNewWindowResultBuf(pGroupRes, pResultBuf, GROUPRESULTID, pRuntimeEnv->numOfRowsPerPage) != 0) {
    return -1;
  }

  setWindowResOutputBuf(pRuntimeEnv, pGroupRes);
  initCtxOutputBuf(pRuntimeEnv);
  return TSDB_CODE_SUCCESS;
}

1035
/*
 * Locate the data of the normal (non-tag) group-by column inside the data block.
 *
 * @param pQuery      query whose group-by expression is inspected
 * @param type        [out] data type of the group-by column, taken from pQuery->colList
 * @param bytes       [out] width in bytes of the group-by column, taken from pQuery->colList
 * @param pDataBlock  columns of the current data block
 * @return            data buffer of the group-by column, or NULL if the block does not
 *                    contain it or all group-by columns are tags
 */
static char *getGroupbyColumnData(SQuery *pQuery, int16_t *type, int16_t *bytes, SArray* pDataBlock) {
  SSqlGroupbyExpr *pGroupbyExpr = pQuery->pGroupbyExpr;

  for (int32_t k = 0; k < pGroupbyExpr->numOfGroupCols; ++k) {
    SColIndex* pColIndex = taosArrayGet(pGroupbyExpr->columnInfo, k);

    // the flag is a bit mask: test the tag bit the same way the rest of the file does,
    // so that a tag flag combined with other bits is still skipped
    if (TSDB_COL_IS_TAG(pColIndex->flag)) {
      continue;
    }

    int32_t colIndex = -1;
    int32_t colId = pColIndex->colId;

    for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
      if (pQuery->colList[i].colId == colId) {
        colIndex = i;
        break;
      }
    }

    assert(colIndex >= 0 && colIndex < pQuery->numOfCols);

    *type = pQuery->colList[colIndex].type;
    *bytes = pQuery->colList[colIndex].bytes;

    /*
     * the colIndex is acquired from the first tables of all qualified tables in this vnode during query prepare
     * stage, the remain tables may not have the required column in cache actually. So, the validation of required
     * column in cache with the corresponding schema is reinforced.
     */
    int32_t numOfCols = taosArrayGetSize(pDataBlock);

    for (int32_t i = 0; i < numOfCols; ++i) {
      SColumnInfoData *p = taosArrayGet(pDataBlock, i);
      if (pColIndex->colId == p->info.colId) {
        return p->pData;
      }
    }
  }

  return NULL;
}

/*
 * Compare the row at `offset` of the first function context with the current element of
 * the timestamp buffer.
 *
 * Returns TS_JOIN_TAG_NOT_EQUALS when the tags differ, TS_JOIN_TS_NOT_EQUALS when the row
 * timestamp has not reached the buffer element yet (in scan direction), and
 * TS_JOIN_TS_EQUAL otherwise. A row timestamp beyond the buffer element is asserted
 * never to happen.
 */
static int32_t doTSJoinFilter(SQueryRuntimeEnv *pRuntimeEnv, int32_t offset) {
  SQuery         *pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;

  STSElem elem = tsBufGetElem(pRuntimeEnv->pTSBuf);

  // compare tag first
  if (pCtx[0].tag.i64Key != elem.tag) {
    return TS_JOIN_TAG_NOT_EQUALS;
  }

  TSKEY key = *(TSKEY *)(pCtx[0].aInputElemBuf + TSDB_KEYSIZE * offset);

#if defined(_DEBUG_VIEW)
  printf("elem in comp ts file:%" PRId64 ", key:%" PRId64 ", tag:%"PRIu64", query order:%d, ts order:%d, traverse:%d, index:%d\n",
         elem.ts, key, elem.tag, pQuery->order.order, pRuntimeEnv->pTSBuf->tsOrder,
         pRuntimeEnv->pTSBuf->cur.order, pRuntimeEnv->pTSBuf->cur.tsIndex);
#endif

  // the row lies before the buffer element with respect to the scan direction
  bool behind = QUERY_IS_ASC_QUERY(pQuery) ? (key < elem.ts) : (key > elem.ts);
  if (behind) {
    return TS_JOIN_TS_NOT_EQUALS;
  }

  // the row must never overtake the buffer element
  if (key != elem.ts) {
    assert(false);
  }

  return TS_JOIN_TS_EQUAL;
}

/*
 * Decide whether the function with the given id has to be invoked for the current scan.
 *
 * - the timestamp function always runs
 * - a function whose result is complete, and the tag/ts dummy functions, never run
 * - first/first_dst run only for ascending queries
 * - last/last_dst run only when their first parameter equals the query order
 * - everything else runs unless this is the reverse scan
 */
static bool functionNeedToExecute(SQueryRuntimeEnv *pRuntimeEnv, SQLFunctionCtx *pCtx, int32_t functionId) {
  SResultInfo *pResInfo = GET_RES_INFO(pCtx);
  SQuery      *pQuery = pRuntimeEnv->pQuery;

  // in case of timestamp column, always generated results.
  if (functionId == TSDB_FUNC_TS) {
    return true;
  }

  if (pResInfo->complete || functionId == TSDB_FUNC_TAG_DUMMY || functionId == TSDB_FUNC_TS_DUMMY) {
    return false;
  }

  switch (functionId) {
    case TSDB_FUNC_FIRST_DST:
    case TSDB_FUNC_FIRST:
      return QUERY_IS_ASC_QUERY(pQuery);

    // todo add comments
    case TSDB_FUNC_LAST_DST:
    case TSDB_FUNC_LAST:
      return pCtx->param[0].i64Key == pQuery->order.order;

    default:
      break;
  }

  // the remaining functions are skipped during the reverse (supplementary) scan
  return !IS_REVERSE_SCAN(pRuntimeEnv);
}

1142 1143
/*
 * Apply all output functions of the query to one data block, one row at a time.
 *
 * Each row is first checked against the timestamp buffer (if one exists) and the column
 * filters. A qualified row is then fed
 *  - for an interval query: to every time window that contains its timestamp,
 *  - otherwise: to the functions directly, after the result buffer of its group has been
 *    selected when the query groups by a normal column.
 *
 * Afterwards the last key of the current table is advanced past the last visited row.
 * If the arithmetic support array cannot be allocated, the query result is finalized and
 * control is transferred to pRuntimeEnv->env with TSDB_CODE_QRY_OUT_OF_MEMORY.
 */
static void rowwiseApplyFunctions(SQueryRuntimeEnv *pRuntimeEnv, SDataStatis *pStatis, SDataBlockInfo *pDataBlockInfo,
    SWindowResInfo *pWindowResInfo, SArray *pDataBlock) {
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;
  bool masterScan = IS_MASTER_SCAN(pRuntimeEnv);

  SQuery *pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo* pCurrent = pQuery->current;

  // the first column only serves as timestamp list if it really is a timestamp column
  SColumnInfoData* pFirstCol = (SColumnInfoData *)taosArrayGet(pDataBlock, 0);

  TSKEY *tsCols = NULL;
  if (pFirstCol->info.type == TSDB_DATA_TYPE_TIMESTAMP) {
    tsCols = (TSKEY*) pFirstCol->pData;
  }

  bool groupbyNormalCol = pRuntimeEnv->groupbyNormalCol;

  SArithmeticSupport *sasArray = calloc((size_t)pQuery->numOfOutput, sizeof(SArithmeticSupport));
  if (sasArray == NULL) {
    finalizeQueryResult(pRuntimeEnv); // clean up allocated resource during query
    longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
  }

  int16_t type = 0;
  int16_t bytes = 0;

  char *groupbyColumnData = NULL;
  if (groupbyNormalCol) {
    groupbyColumnData = getGroupbyColumnData(pQuery, &type, &bytes, pDataBlock);
  }

  for (int32_t k = 0; k < pQuery->numOfOutput; ++k) {
    char *dataBlock = getDataBlock(pRuntimeEnv, &sasArray[k], k, pDataBlockInfo->rows, pDataBlock);
    setExecParams(pQuery, &pCtx[k], dataBlock, tsCols, pDataBlockInfo, pStatis, &sasArray[k], k);
  }

  // set the input column data
  for (int32_t k = 0; k < pQuery->numOfFilterCols; ++k) {
    SSingleColumnFilterInfo *pFilter = &pQuery->pFilterInfo[k];
    pFilter->pData = getDataBlockImpl(pDataBlock, pFilter->info.colId);
    assert(pFilter->pData != NULL);
  }

  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  // from top to bottom in desc
  // from bottom to top in asc order
  if (pRuntimeEnv->pTSBuf != NULL) {
    SQInfo *pQInfo = (SQInfo *)GET_QINFO_ADDR(pRuntimeEnv);
    qDebug("QInfo:%p process data rows, numOfRows:%d, query order:%d, ts comp order:%d", pQInfo, pDataBlockInfo->rows,
           pQuery->order.order, pRuntimeEnv->pTSBuf->cur.order);
  }

  int32_t offset = -1;

  for (int32_t row = 0; row < pDataBlockInfo->rows; ++row) {
    offset = GET_COL_DATA_POS(pQuery, row, step);

    if (pRuntimeEnv->pTSBuf != NULL) {
      int32_t r = doTSJoinFilter(pRuntimeEnv, offset);
      if (r == TS_JOIN_TAG_NOT_EQUALS) {
        break;
      }

      if (r == TS_JOIN_TS_NOT_EQUALS) {
        continue;
      }

      assert(r == TS_JOIN_TS_EQUAL);
    }

    if (pQuery->numOfFilterCols > 0 && (!doFilterData(pQuery, offset))) {
      continue;
    }

    if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
      // interval window query, decide the time window according to the primary timestamp
      int64_t     ts = tsCols[offset];
      STimeWindow win = getActiveTimeWindow(pWindowResInfo, ts, pQuery);

      bool    hasTimeWindow = false;
      int32_t ret = setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, pDataBlockInfo->tid, &win, masterScan, &hasTimeWindow);

      // null data, too many state code, or no window available for this row
      if (ret != TSDB_CODE_SUCCESS || !hasTimeWindow) {
        continue;
      }

      SWindowStatus *pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
      doRowwiseApplyFunctions(pRuntimeEnv, pStatus, &win, offset);

      STimeWindow nextWin = win;
      int32_t     savedIndex = pWindowResInfo->curIndex;

      // the same row may belong to several following windows as well
      for (;;) {
        GET_NEXT_TIMEWINDOW(pQuery, &nextWin);

        // the next window starts beyond the end of the query range
        if ((QUERY_IS_ASC_QUERY(pQuery) && nextWin.skey > pQuery->window.ekey) ||
            (!QUERY_IS_ASC_QUERY(pQuery) && nextWin.skey < pQuery->window.ekey)) {
          break;
        }

        // the row is not covered by the next window
        if (ts < nextWin.skey || ts > nextWin.ekey) {
          break;
        }

        // null data, failed to allocate more memory buffer
        hasTimeWindow = false;
        if (setWindowOutputBufByKey(pRuntimeEnv, pWindowResInfo, pDataBlockInfo->tid, &nextWin, masterScan, &hasTimeWindow) != TSDB_CODE_SUCCESS) {
          break;
        }

        if (hasTimeWindow) {
          pStatus = getTimeWindowResStatus(pWindowResInfo, curTimeWindow(pWindowResInfo));
          doRowwiseApplyFunctions(pRuntimeEnv, pStatus, &nextWin, offset);
        }
      }

      pWindowResInfo->curIndex = savedIndex;
    } else {  // other queries
      // decide which group this rows belongs to according to current state value
      if (groupbyNormalCol) {
        char *val = groupbyColumnData + bytes * offset;

        if (setGroupResultOutputBuf(pRuntimeEnv, val, type, bytes) != TSDB_CODE_SUCCESS) {  // null data, too many state code
          continue;
        }
      }

      for (int32_t k = 0; k < pQuery->numOfOutput; ++k) {
        int32_t functionId = pQuery->pSelectExpr[k].base.functionId;
        if (functionNeedToExecute(pRuntimeEnv, &pCtx[k], functionId)) {
          aAggs[functionId].xFunctionF(&pCtx[k], offset);
        }
      }
    }

    // if timestamp filter list is empty, quit current query
    if (pRuntimeEnv->pTSBuf != NULL && !tsBufNextPos(pRuntimeEnv->pTSBuf)) {
      setQueryStatus(pQuery, QUERY_COMPLETED);
      break;
    }
  }

  // advance the last key of the table past the last visited row
  assert(offset >= 0);
  if (tsCols != NULL) {
    pCurrent->lastKey = tsCols[offset] + step;
  } else {
    pCurrent->lastKey = (QUERY_IS_ASC_QUERY(pQuery)? pDataBlockInfo->window.ekey:pDataBlockInfo->window.skey) + step;
  }

  // todo refactor: extract method
  // only arithmetic expressions own a column pointer array
  for(int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    if (pQuery->pSelectExpr[i].base.functionId == TSDB_FUNC_ARITHM) {
      tfree(sasArray[i].data);
    }
  }

  free(sasArray);
}

static int32_t tableApplyFunctionsOnBlock(SQueryRuntimeEnv *pRuntimeEnv, SDataBlockInfo *pDataBlockInfo,
H
hjxilinx 已提交
1305
                                          SDataStatis *pStatis, __block_search_fn_t searchFn, SArray *pDataBlock) {
H
hjxilinx 已提交
1306
  SQuery *pQuery = pRuntimeEnv->pQuery;
H
hjxilinx 已提交
1307 1308 1309
  
  STableQueryInfo* pTableQInfo = pQuery->current;
  SWindowResInfo*  pWindowResInfo = &pRuntimeEnv->windowResInfo;
H
hjxilinx 已提交
1310
  
H
Haojun Liao 已提交
1311
  if (pQuery->numOfFilterCols > 0 || pRuntimeEnv->pTSBuf != NULL || pRuntimeEnv->groupbyNormalCol) {
1312
    rowwiseApplyFunctions(pRuntimeEnv, pStatis, pDataBlockInfo, pWindowResInfo, pDataBlock);
1313
  } else {
1314
    blockwiseApplyFunctions(pRuntimeEnv, pStatis, pDataBlockInfo, pWindowResInfo, searchFn, pDataBlock);
1315
  }
1316

1317
  // update the lastkey of current table
1318
  TSKEY lastKey = QUERY_IS_ASC_QUERY(pQuery) ? pDataBlockInfo->window.ekey : pDataBlockInfo->window.skey;
H
hjxilinx 已提交
1319
  pTableQInfo->lastKey = lastKey + GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
1320

1321
  // interval query with limit applied
1322
  int32_t numOfRes = 0;
H
Haojun Liao 已提交
1323
  if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
1324 1325 1326
    numOfRes = doCheckQueryCompleted(pRuntimeEnv, lastKey, pWindowResInfo);
  } else {
    numOfRes = getNumOfResult(pRuntimeEnv);
1327

1328 1329 1330 1331
    // update the number of output result
    if (numOfRes > 0 && pQuery->checkBuffer == 1) {
      assert(numOfRes >= pQuery->rec.rows);
      pQuery->rec.rows = numOfRes;
1332

1333 1334 1335
      if (numOfRes >= pQuery->rec.threshold) {
        setQueryStatus(pQuery, QUERY_RESBUF_FULL);
      }
1336

1337 1338 1339
      if ((pQuery->limit.limit >= 0) && (pQuery->limit.limit + pQuery->limit.offset) <= numOfRes) {
        setQueryStatus(pQuery, QUERY_COMPLETED);
      }
H
Haojun Liao 已提交
1340
    }
1341
  }
1342

1343
  return numOfRes;
1344 1345
}

H
Haojun Liao 已提交
1346
void setExecParams(SQuery *pQuery, SQLFunctionCtx *pCtx, void* inputData, TSKEY *tsCol, SDataBlockInfo* pBlockInfo,
1347 1348 1349 1350 1351 1352
                   SDataStatis *pStatis, void *param, int32_t colIndex) {
  
  int32_t functionId = pQuery->pSelectExpr[colIndex].base.functionId;
  int32_t colId = pQuery->pSelectExpr[colIndex].base.colInfo.colId;
  
  SDataStatis *tpField = NULL;
H
Haojun Liao 已提交
1353
  pCtx->hasNull = hasNullValue(&pQuery->pSelectExpr[colIndex].base.colInfo, pStatis, &tpField);
1354
  pCtx->aInputElemBuf = inputData;
1355

1356
  if (tpField != NULL) {
H
Haojun Liao 已提交
1357
    pCtx->preAggVals.isSet  = true;
1358 1359
    pCtx->preAggVals.statis = *tpField;
    assert(pCtx->preAggVals.statis.numOfNull <= pBlockInfo->rows);
1360 1361 1362
  } else {
    pCtx->preAggVals.isSet = false;
  }
1363

H
Haojun Liao 已提交
1364 1365
  pCtx->preAggVals.dataBlockLoaded = (inputData != NULL);

H
Haojun Liao 已提交
1366 1367 1368
  // limit/offset query will affect this value
  pCtx->startOffset = QUERY_IS_ASC_QUERY(pQuery) ? pQuery->pos:0;
  pCtx->size = QUERY_IS_ASC_QUERY(pQuery) ? pBlockInfo->rows - pQuery->pos : pQuery->pos + 1;
1369

1370 1371
  uint32_t status = aAggs[functionId].nStatus;
  if (((status & (TSDB_FUNCSTATE_SELECTIVITY | TSDB_FUNCSTATE_NEED_TS)) != 0) && (tsCol != NULL)) {
H
Haojun Liao 已提交
1372
    pCtx->ptsList = tsCol;
1373
  }
1374

1375 1376 1377 1378 1379
  if (functionId >= TSDB_FUNC_FIRST_DST && functionId <= TSDB_FUNC_LAST_DST) {
    // last_dist or first_dist function
    // store the first&last timestamp into the intermediate buffer [1], the true
    // value may be null but timestamp will never be null
  } else if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_TWA ||
1380
             functionId == TSDB_FUNC_DIFF || (functionId >= TSDB_FUNC_RATE && functionId <= TSDB_FUNC_AVG_IRATE)) {
1381
    /*
H
Haojun Liao 已提交
1382
     * least squares function needs two columns of input, currently, the x value of linear equation is set to
1383 1384 1385 1386 1387 1388 1389 1390 1391 1392
     * timestamp column, and the y-value is the column specified in pQuery->pSelectExpr[i].colIdxInBuffer
     *
     * top/bottom function needs timestamp to indicate when the
     * top/bottom values emerge, so does diff function
     */
    if (functionId == TSDB_FUNC_TWA) {
      STwaInfo *pTWAInfo = GET_RES_INFO(pCtx)->interResultBuf;
      pTWAInfo->SKey = pQuery->window.skey;
      pTWAInfo->EKey = pQuery->window.ekey;
    }
1393

1394 1395
  } else if (functionId == TSDB_FUNC_ARITHM) {
    pCtx->param[1].pz = param;
H
Haojun Liao 已提交
1396 1397 1398 1399 1400 1401
  } else if (functionId == TSDB_FUNC_SPREAD) {  // set the statistics data for primary time stamp column
    if (colId == PRIMARYKEY_TIMESTAMP_COL_INDEX) {
      pCtx->preAggVals.isSet  = true;
      pCtx->preAggVals.statis.min = pBlockInfo->window.skey;
      pCtx->preAggVals.statis.max = pBlockInfo->window.ekey;
    }
1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414
  } else if (functionId == TSDB_FUNC_INTERP) {
    SInterpInfoDetail *pInterpInfo = GET_RES_INFO(pCtx)->interResultBuf;
    pInterpInfo->type = pQuery->fillType;
    pInterpInfo->ts = pQuery->window.skey;
    pInterpInfo->primaryCol = (colId == PRIMARYKEY_TIMESTAMP_COL_INDEX);
  
    if (pQuery->fillVal != NULL) {
      if (isNull((const char*) &pQuery->fillVal[colIndex], pCtx->inputType)) {
        pCtx->param[1].nType = TSDB_DATA_TYPE_NULL;
      } else { // todo refactor, tVariantCreateFromBinary should handle the NULL value
        tVariantCreateFromBinary(&pCtx->param[1], (char*) &pQuery->fillVal[colIndex], pCtx->inputBytes, pCtx->inputType);
      }
    }
1415
  }
1416

1417 1418 1419 1420 1421 1422
#if defined(_DEBUG_VIEW)
  //  int64_t *tsList = (int64_t *)primaryColumnData;
//  int64_t  s = tsList[0];
//  int64_t  e = tsList[size - 1];

//    if (IS_DATA_BLOCK_LOADED(blockStatus)) {
1423
//        qDebug("QInfo:%p query ts:%lld-%lld, offset:%d, rows:%d, bstatus:%d,
1424 1425 1426
//        functId:%d", GET_QINFO_ADDR(pQuery),
//               s, e, startOffset, size, blockStatus, functionId);
//    } else {
1427
//        qDebug("QInfo:%p block not loaded, bstatus:%d",
1428 1429 1430 1431 1432 1433
//        GET_QINFO_ADDR(pQuery), blockStatus);
//    }
#endif
}

// set the output buffer for the selectivity + tag query
H
Haojun Liao 已提交
1434 1435 1436
static void setCtxTagColumnInfo(SQueryRuntimeEnv *pRuntimeEnv, SQLFunctionCtx *pCtx) {
  SQuery* pQuery = pRuntimeEnv->pQuery;

1437
  if (isSelectivityWithTagsQuery(pQuery)) {
1438
    int32_t num = 0;
1439
    int16_t tagLen = 0;
1440 1441
    
    SQLFunctionCtx *p = NULL;
1442
    SQLFunctionCtx **pTagCtx = calloc(pQuery->numOfOutput, POINTER_BYTES);
H
Haojun Liao 已提交
1443

1444
    for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
1445
      SSqlFuncMsg *pSqlFuncMsg = &pQuery->pSelectExpr[i].base;
1446
      
1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459
      if (pSqlFuncMsg->functionId == TSDB_FUNC_TAG_DUMMY || pSqlFuncMsg->functionId == TSDB_FUNC_TS_DUMMY) {
        tagLen += pCtx[i].outputBytes;
        pTagCtx[num++] = &pCtx[i];
      } else if ((aAggs[pSqlFuncMsg->functionId].nStatus & TSDB_FUNCSTATE_SELECTIVITY) != 0) {
        p = &pCtx[i];
      } else if (pSqlFuncMsg->functionId == TSDB_FUNC_TS || pSqlFuncMsg->functionId == TSDB_FUNC_TAG) {
        // tag function may be the group by tag column
        // ts may be the required primary timestamp column
        continue;
      } else {
        // the column may be the normal column, group by normal_column, the functionId is TSDB_FUNC_PRJ
      }
    }
dengyihao's avatar
dengyihao 已提交
1460 1461 1462 1463 1464 1465 1466
    if (p != NULL) {
      p->tagInfo.pTagCtxList = pTagCtx;
      p->tagInfo.numOfTagCols = num;
      p->tagInfo.tagsLen = tagLen;
    } else {
      tfree(pTagCtx); 
    }
1467 1468 1469
  }
}

H
Haojun Liao 已提交
1470 1471
/*
 * Hand out consecutive slices of `buf` to the result info of every output column.
 * The slice of column i is pQuery->pSelectExpr[i].interBytes bytes wide.
 */
static FORCE_INLINE void setWindowResultInfo(SResultInfo *pResultInfo, SQuery *pQuery, bool isStableQuery, char* buf) {
  char *cursor = buf;

  int32_t i = 0;
  while (i < pQuery->numOfOutput) {
    int32_t interBytes = pQuery->pSelectExpr[i].interBytes;

    setResultInfoBuf(&pResultInfo[i], interBytes, isStableQuery, cursor);
    cursor += interBytes;

    i += 1;
  }
}

1480
/*
 * Allocate and initialize the function contexts and the result info of the query.
 *
 * For every output column the input/output type and width, the scan order, the function
 * id and the function parameters are set up, and the offset of the column inside an output
 * row is computed. Finally the intermediate result buffers and the tag column info of
 * selectivity queries are prepared.
 *
 * @param pRuntimeEnv  runtime environment to set up
 * @param order        order passed on to top/bottom/diff functions as third parameter
 * @return             TSDB_CODE_SUCCESS, or TSDB_CODE_QRY_OUT_OF_MEMORY if an allocation failed
 */
static int32_t setupQueryRuntimeEnv(SQueryRuntimeEnv *pRuntimeEnv, int16_t order) {
  qDebug("QInfo:%p setup runtime env", GET_QINFO_ADDR(pRuntimeEnv));
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // one allocation: the SResultInfo array followed by the intermediate result buffers
  size_t size = pRuntimeEnv->interBufSize + pQuery->numOfOutput * sizeof(SResultInfo);

  pRuntimeEnv->resultInfo = calloc(1, size);
  pRuntimeEnv->pCtx = (SQLFunctionCtx *)calloc(pQuery->numOfOutput, sizeof(SQLFunctionCtx));

  if (pRuntimeEnv->resultInfo == NULL || pRuntimeEnv->pCtx == NULL) {
    goto _clean;
  }

  qDebug("QInfo:%p setup runtime env1", GET_QINFO_ADDR(pRuntimeEnv));

  pRuntimeEnv->offset[0] = 0;
  for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    SSqlFuncMsg *pSqlFuncMsg = &pQuery->pSelectExpr[i].base;

    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[i];
    SColIndex* pIndex = &pSqlFuncMsg->colInfo;

    // the input type comes from the tag schema, the table name schema or the column schema
    int32_t index = pSqlFuncMsg->colInfo.colIndex;
    if (TSDB_COL_IS_TAG(pIndex->flag)) {
      if (pIndex->colId == TSDB_TBNAME_COLUMN_INDEX) {  // todo refactor
        SSchema s = tGetTableNameColumnSchema();

        pCtx->inputBytes = s.bytes;
        pCtx->inputType = s.type;
      } else {
        pCtx->inputBytes = pQuery->tagColList[index].bytes;
        pCtx->inputType = pQuery->tagColList[index].type;
      }
    } else {
      pCtx->inputBytes = pQuery->colList[index].bytes;
      pCtx->inputType = pQuery->colList[index].type;
    }

    assert(isValidDataType(pCtx->inputType));
    pCtx->ptsOutputBuf = NULL;

    pCtx->outputBytes = pQuery->pSelectExpr[i].bytes;
    pCtx->outputType = pQuery->pSelectExpr[i].type;

    pCtx->order = pQuery->order.order;
    pCtx->functionId = pSqlFuncMsg->functionId;

    pCtx->numOfParams = pSqlFuncMsg->numOfParams;
    for (int32_t j = 0; j < pCtx->numOfParams; ++j) {
      int16_t type = pSqlFuncMsg->arg[j].argType;
      int16_t bytes = pSqlFuncMsg->arg[j].argBytes;
      if (type == TSDB_DATA_TYPE_BINARY || type == TSDB_DATA_TYPE_NCHAR) {
        // string parameters are referenced by pointer; use the j-th argument, not the first one
        tVariantCreateFromBinary(&pCtx->param[j], pSqlFuncMsg->arg[j].argValue.pz, bytes, type);
      } else {
        tVariantCreateFromBinary(&pCtx->param[j], (char *)&pSqlFuncMsg->arg[j].argValue.i64, bytes, type);
      }
    }

    qDebug("QInfo:%p setup runtime env2", GET_QINFO_ADDR(pRuntimeEnv));

    // set the order information for top/bottom query
    int32_t functionId = pCtx->functionId;

    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_DIFF) {
      // these functions require the timestamp to be the first output column
      int32_t f = pQuery->pSelectExpr[0].base.functionId;
      assert(f == TSDB_FUNC_TS || f == TSDB_FUNC_TS_DUMMY);

      pCtx->param[2].i64Key = order;
      pCtx->param[2].nType = TSDB_DATA_TYPE_BIGINT;
      pCtx->param[3].i64Key = functionId;
      pCtx->param[3].nType = TSDB_DATA_TYPE_BIGINT;

      pCtx->param[1].i64Key = pQuery->order.orderColId;
    }

    // offset of this column inside one output row
    if (i > 0) {
      pRuntimeEnv->offset[i] = pRuntimeEnv->offset[i - 1] + pRuntimeEnv->pCtx[i - 1].outputBytes;
    }
  }

  qDebug("QInfo:%p setup runtime env3", GET_QINFO_ADDR(pRuntimeEnv));

  // the intermediate result buffers start right behind the SResultInfo array
  char* buf = (char*) pRuntimeEnv->resultInfo + sizeof(SResultInfo) * pQuery->numOfOutput;

  // set the intermediate result output buffer
  setWindowResultInfo(pRuntimeEnv->resultInfo, pQuery, pRuntimeEnv->stableQuery, buf);

  qDebug("QInfo:%p setup runtime env4", GET_QINFO_ADDR(pRuntimeEnv));

  // if it is group by normal column, do not set output buffer, the output buffer is pResult
  if (!pRuntimeEnv->groupbyNormalCol && !pRuntimeEnv->stableQuery) {
    resetCtxOutputBuf(pRuntimeEnv);
  }

  qDebug("QInfo:%p setup runtime env5", GET_QINFO_ADDR(pRuntimeEnv));

  setCtxTagColumnInfo(pRuntimeEnv, pRuntimeEnv->pCtx);

  qDebug("QInfo:%p init completed", GET_QINFO_ADDR(pRuntimeEnv));
  return TSDB_CODE_SUCCESS;

_clean:
  tfree(pRuntimeEnv->resultInfo);
  tfree(pRuntimeEnv->pCtx);

  return TSDB_CODE_QRY_OUT_OF_MEMORY;
}

/*
 * Release the resources held by a query runtime environment: the time window
 * result info, the per-output function contexts (their parameters, tag value
 * and tag context list), the result info array, the fill info, the result
 * buffer, both tsdb query handles and the ts buffer.
 *
 * Nothing is done when no query is attached to the runtime environment.
 */
static void teardownQueryRuntimeEnv(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  if (pQuery == NULL) {
    return;
  }

  SQInfo *pQInfo = (SQInfo *)GET_QINFO_ADDR(pRuntimeEnv);
  qDebug("QInfo:%p teardown runtime env", pQInfo);

  cleanupTimeWindowInfo(&pRuntimeEnv->windowResInfo);

  if (pRuntimeEnv->pCtx != NULL) {
    int32_t idx = 0;
    while (idx < pQuery->numOfOutput) {
      SQLFunctionCtx *pFuncCtx = &pRuntimeEnv->pCtx[idx];

      // destroy every parameter variant attached to this function context
      int32_t k = 0;
      while (k < pFuncCtx->numOfParams) {
        tVariantDestroy(&pFuncCtx->param[k]);
        k += 1;
      }

      tVariantDestroy(&pFuncCtx->tag);
      tfree(pFuncCtx->tagInfo.pTagCtxList);

      idx += 1;
    }

    // the result info array is only released together with the context array
    tfree(pRuntimeEnv->resultInfo);
    tfree(pRuntimeEnv->pCtx);
  }

  // the destroy routines hand back the value that is stored into the owning field
  pRuntimeEnv->pFillInfo = taosDestoryFillInfo(pRuntimeEnv->pFillInfo);

  destroyResultBuf(pRuntimeEnv->pResultBuf, pQInfo);

  tsdbCleanupQueryHandle(pRuntimeEnv->pQueryHandle);
  tsdbCleanupQueryHandle(pRuntimeEnv->pSecQueryHandle);

  pRuntimeEnv->pTSBuf = tsBufDestroy(pRuntimeEnv->pTSBuf);
}

H
Haojun Liao 已提交
1625
// True when the code field of the given query info equals TSDB_CODE_TSC_QUERY_CANCELLED.
#define IS_QUERY_KILLED(_q) ((_q)->code == TSDB_CODE_TSC_QUERY_CANCELLED)
1626

H
Haojun Liao 已提交
1627
/*
 * Mark the query as killed by storing TSDB_CODE_TSC_QUERY_CANCELLED in its
 * code field; IS_QUERY_KILLED tests for exactly this value.
 */
static void setQueryKilled(SQInfo *pQInfo) {
  pQInfo->code = TSDB_CODE_TSC_QUERY_CANCELLED;
}
H
hjxilinx 已提交
1628

H
Haojun Liao 已提交
1629 1630 1631
/*
 * Decide whether the query is a "fixed output" query.
 *
 * Interval queries are never fixed output. Top/bottom queries and queries that
 * group by a normal column always are. Otherwise the query is fixed output as
 * soon as one relevant select expression is not a multi-output function.
 */
static bool isFixedOutputQuery(SQueryRuntimeEnv* pRuntimeEnv) {
  SQuery* pQuery = pRuntimeEnv->pQuery;

  if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
    return false;
  }

  // Note: top/bottom query is fixed output query
  if (pRuntimeEnv->topBotQuery || pRuntimeEnv->groupbyNormalCol) {
    return true;
  }

  for (int32_t idx = 0; idx < pQuery->numOfOutput; ++idx) {
    SSqlFuncMsg *pFunc = &pQuery->pSelectExpr[idx].base;

    // the leading projection of the primary timestamp column (ts_comp) is ignored
    bool isTsComp = (idx == 0) && (pFunc->functionId == TSDB_FUNC_PRJ) && (pFunc->numOfParams == 1) &&
                    (pFunc->colInfo.colIndex == PRIMARYKEY_TIMESTAMP_COL_INDEX);

    // timestamp helper functions do not take part in the decision either
    bool isTsFunc = (pFunc->functionId == TSDB_FUNC_TS) || (pFunc->functionId == TSDB_FUNC_TS_DUMMY);

    if (isTsComp || isTsFunc) {
      continue;
    }

    if (!IS_MULTIOUTPUT(aAggs[pFunc->functionId].nStatus)) {
      return true;
    }
  }

  return false;
}

1661
// todo refactor with isLastRowQuery
H
hjxilinx 已提交
1662
/*
 * Return true when at least one select expression of the query is the
 * TSDB_FUNC_INTERP function.
 */
static bool isPointInterpoQuery(SQuery *pQuery) {
  bool found = false;

  for (int32_t idx = 0; idx < pQuery->numOfOutput && !found; ++idx) {
    found = (pQuery->pSelectExpr[idx].base.functionId == TSDB_FUNC_INTERP);
  }

  return found;
}

// TODO REFACTOR:MERGE WITH CLIENT-SIDE FUNCTION
H
hjxilinx 已提交
1674
/*
 * Return true when any select expression is one of the sum/avg rate functions
 * (SUM_RATE, SUM_IRATE, AVG_RATE, AVG_IRATE).
 */
static bool isSumAvgRateQuery(SQuery *pQuery) {
  int32_t idx = 0;

  while (idx < pQuery->numOfOutput) {
    int32_t fid = pQuery->pSelectExpr[idx].base.functionId;
    idx += 1;

    // the timestamp function is skipped explicitly
    if (fid == TSDB_FUNC_TS) {
      continue;
    }

    bool isSumRate = (fid == TSDB_FUNC_SUM_RATE) || (fid == TSDB_FUNC_SUM_IRATE);
    bool isAvgRate = (fid == TSDB_FUNC_AVG_RATE) || (fid == TSDB_FUNC_AVG_IRATE);
    if (isSumRate || isAvgRate) {
      return true;
    }
  }

  return false;
}

H
hjxilinx 已提交
1690
/*
 * Return true when at least one select expression is TSDB_FUNC_LAST_ROW.
 * Despite the name, only the last_row function is checked here.
 */
static bool isFirstLastRowQuery(SQuery *pQuery) {
  bool hasLastRow = false;

  for (int32_t idx = 0; !hasLastRow && idx < pQuery->numOfOutput; ++idx) {
    hasLastRow = (pQuery->pSelectExpr[idx].base.functionId == TSDB_FUNC_LAST_ROW);
  }

  return hasLastRow;
}

H
hjxilinx 已提交
1701
/*
 * Decide whether the query needs an additional scan in the reverse order.
 *
 * - a first/first_dst function combined with a non-ascending scan needs it;
 * - for the first last/last_dst function met, the answer is whether the order
 *   stored in its first argument differs from the query order.
 *
 * Every other function (including ts/ts_dummy/tag) has no influence.
 */
static bool needReverseScan(SQuery *pQuery) {
  for (int32_t idx = 0; idx < pQuery->numOfOutput; ++idx) {
    SSqlFuncMsg *pFunc = &pQuery->pSelectExpr[idx].base;
    int32_t      fid = pFunc->functionId;

    bool isFirst = (fid == TSDB_FUNC_FIRST) || (fid == TSDB_FUNC_FIRST_DST);
    bool isLast = (fid == TSDB_FUNC_LAST) || (fid == TSDB_FUNC_LAST_DST);

    if (isFirst) {
      if (!QUERY_IS_ASC_QUERY(pQuery)) {
        return true;
      }
    } else if (isLast) {
      // the decision is final as soon as the first last/last_dst function is found
      int32_t order = pFunc->arg->argValue.i64;
      return order != pQuery->order.order;
    }
  }

  return false;
}
H
hjxilinx 已提交
1720 1721 1722

/*
 * Return true when every select expression is a tag projection (TAGPRJ), a
 * tid_tag function, or a count() applied on the table name pseudo column.
 */
static bool onlyQueryTags(SQuery* pQuery) {
  int32_t idx = 0;

  while (idx < pQuery->numOfOutput) {
    SExprInfo* pExpr = &pQuery->pSelectExpr[idx];
    int32_t    fid = pExpr->base.functionId;

    bool isTagFunc = (fid == TSDB_FUNC_TAGPRJ) || (fid == TSDB_FUNC_TID_TAG);
    bool isCountTbname = (fid == TSDB_FUNC_COUNT) && (pExpr->base.colInfo.colId == TSDB_TBNAME_COLUMN_INDEX);

    if (!isTagFunc && !isCountTbname) {
      return false;
    }

    idx += 1;
  }

  return true;
}

1735 1736
/////////////////////////////////////////////////////////////////////////////////////////////

H
Haojun Liao 已提交
1737
/*
 * Compute the time window for the given key.
 *
 * win->skey is obtained from taosGetIntervalStartTimestamp(); win->ekey is
 * skey + intervalTime - 1, except when keyFirst is so close to INT64_MAX that
 * adding one interval would overflow, in which case ekey is INT64_MAX.
 *
 * key must lie in [keyFirst, keyLast] and the sliding time must not exceed the
 * interval time (asserted).
 */
void getAlignQueryTimeWindow(SQuery *pQuery, int64_t key, int64_t keyFirst, int64_t keyLast, STimeWindow *win) {
  assert(key >= keyFirst && key <= keyLast && pQuery->slidingTime <= pQuery->intervalTime);

  win->skey = taosGetIntervalStartTimestamp(key, pQuery->slidingTime, pQuery->intervalTime, pQuery->slidingTimeUnit, pQuery->precision);

  // common case: a whole interval fits before INT64_MAX
  if (!(keyFirst > (INT64_MAX - pQuery->intervalTime))) {
    win->ekey = win->skey + pQuery->intervalTime - 1;
    return;
  }

  /*
   * keyFirst > INT64_MAX - intervalTime: the queried range must be shorter than
   * one interval, so the window is simply clamped to INT64_MAX.
   */
  assert(keyLast - keyFirst < pQuery->intervalTime);
  win->ekey = INT64_MAX;
}

/*
 * Set pQuery->checkBuffer.
 *
 * It is 0 for top/bottom queries and for group-by-normal-column queries.
 * Otherwise it is 1 only when the last examined select expression (ts and
 * ts_dummy are skipped) is a multi-output function; the examination stops at
 * the first expression that is not multi-output.
 */
static void setScanLimitationByResultBuffer(SQuery *pQuery) {
  if (isTopBottomQuery(pQuery) || isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
    pQuery->checkBuffer = 0;
    return;
  }

  bool multiOutput = false;

  for (int32_t idx = 0; idx < pQuery->numOfOutput; ++idx) {
    SSqlFuncMsg *pFunc = &pQuery->pSelectExpr[idx].base;

    bool isTsFunc = (pFunc->functionId == TSDB_FUNC_TS) || (pFunc->functionId == TSDB_FUNC_TS_DUMMY);
    if (isTsFunc) {
      continue;
    }

    multiOutput = IS_MULTIOUTPUT(aAggs[pFunc->functionId].nStatus);
    if (!multiOutput) {
      break;
    }
  }

  if (multiOutput) {
    pQuery->checkBuffer = 1;
  } else {
    pQuery->checkBuffer = 0;
  }
}

/*
 * todo add more parameters to check soon..
 */
1780
/*
 * Validate the list of columns to load: two adjacent entries of
 * pQuery->colList must not carry the same column id.
 *
 * Returns false (after logging an error) on the first adjacent duplicate,
 * true otherwise.
 */
bool colIdCheck(SQuery *pQuery) {
  int32_t idx = 0;

  while (idx < pQuery->numOfCols - 1) {
    int32_t next = idx + 1;

    if (pQuery->colList[idx].colId == pQuery->colList[next].colId) {
      // NOTE(review): GET_QINFO_ADDR is applied to an SQuery pointer here although the macro
      // is written for the runtime env; the logged address looks unreliable - confirm.
      qError("QInfo:%p invalid data load column for query", GET_QINFO_ADDR(pQuery));
      return false;
    }

    idx = next;
  }

  return true;
}

// todo: ignore the avg/sum/min/max/count/stddev/top/bottom functions, for which
// the scan order does not matter

/*
 * Return true when every select expression, apart from the ts/ts_dummy/tag/
 * tag_dummy helpers, is either functId or functIdDst.
 */
static bool onlyOneQueryType(SQuery *pQuery, int32_t functId, int32_t functIdDst) {
  int32_t idx = 0;

  while (idx < pQuery->numOfOutput) {
    int32_t fid = pQuery->pSelectExpr[idx].base.functionId;
    idx += 1;

    bool isHelper = (fid == TSDB_FUNC_TS) || (fid == TSDB_FUNC_TS_DUMMY) || (fid == TSDB_FUNC_TAG) ||
                    (fid == TSDB_FUNC_TAG_DUMMY);
    if (isHelper) {
      continue;
    }

    bool matches = (fid == functId) || (fid == functIdDst);
    if (!matches) {
      return false;
    }
  }

  return true;
}

/* True when the query only contains first/first_dst functions (helper functions aside). */
static bool onlyFirstQuery(SQuery *pQuery) {
  return onlyOneQueryType(pQuery, TSDB_FUNC_FIRST, TSDB_FUNC_FIRST_DST);
}

/* True when the query only contains last/last_dst functions (helper functions aside). */
static bool onlyLastQuery(SQuery *pQuery) {
  return onlyOneQueryType(pQuery, TSDB_FUNC_LAST, TSDB_FUNC_LAST_DST);
}

H
Haojun Liao 已提交
1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828
// todo refactor, add iterator
/*
 * Exchange the start and end key of the time window of every table query info
 * in every table group of the query.
 */
static void doExchangeTimeWindow(SQInfo* pQInfo) {
  size_t numOfGroups = GET_NUM_OF_TABLEGROUP(pQInfo);

  for (int32_t g = 0; g < numOfGroups; ++g) {
    SArray* pGroup = GET_TABLEGROUP(pQInfo, g);
    size_t  numOfTables = taosArrayGetSize(pGroup);

    for (int32_t k = 0; k < numOfTables; ++k) {
      STableQueryInfo* pInfo = (STableQueryInfo*) taosArrayGetP(pGroup, k);

      TSKEY oldSkey = pInfo->win.skey;
      pInfo->win.skey = pInfo->win.ekey;
      pInfo->win.ekey = oldSkey;
    }
  }
}

H
Haojun Liao 已提交
1829 1830 1831
/*
 * Adjust the scan order (and the query time range accordingly) for query
 * types that require a specific order:
 *
 *  - last_row query: always descending; the query window is rewritten so that
 *    skey is the larger and ekey the smaller bound;
 *  - point interpolation query without interval: always ascending;
 *  - non-interval query with only first (resp. only last) functions: ascending
 *    (resp. descending); the per-table windows are exchanged as well;
 *  - interval query on a super table with only first/last functions: same
 *    order rule, but only the query window is exchanged.
 *
 * pQInfo      the query info; also used as the identity printed in the logs
 * stableQuery whether the query runs against a super table
 */
static void changeExecuteScanOrder(SQInfo *pQInfo, bool stableQuery) {
  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;

  // in case of point-interpolation query, use asc order scan
  char msg[] = "QInfo:%p scan order changed for %s query, old:%d, new:%d, qrange exchanged, old qrange:%" PRId64
               "-%" PRId64 ", new qrange:%" PRId64 "-%" PRId64;

  // todo handle the case the the order irrelevant query type mixed up with order critical query type
  // descending order query for last_row query
  if (isFirstLastRowQuery(pQuery)) {
    // log with pQInfo itself: GET_QINFO_ADDR() expects the runtime env, not the SQuery
    qDebug("QInfo:%p scan order changed for last_row query, old:%d, new:%d", pQInfo,
           pQuery->order.order, TSDB_ORDER_DESC);

    pQuery->order.order = TSDB_ORDER_DESC;

    int64_t skey = MIN(pQuery->window.skey, pQuery->window.ekey);
    int64_t ekey = MAX(pQuery->window.skey, pQuery->window.ekey);

    pQuery->window.skey = ekey;
    pQuery->window.ekey = skey;

    return;
  }

  if (isPointInterpoQuery(pQuery) && pQuery->intervalTime == 0) {
    if (!QUERY_IS_ASC_QUERY(pQuery)) {
      qDebug(msg, pQInfo, "interp", pQuery->order.order, TSDB_ORDER_ASC, pQuery->window.skey,
             pQuery->window.ekey, pQuery->window.ekey, pQuery->window.skey);
      SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
    }

    pQuery->order.order = TSDB_ORDER_ASC;
    return;
  }

  if (pQuery->intervalTime == 0) {
    if (onlyFirstQuery(pQuery)) {
      if (!QUERY_IS_ASC_QUERY(pQuery)) {
        qDebug(msg, pQInfo, "only-first", pQuery->order.order, TSDB_ORDER_ASC, pQuery->window.skey,
               pQuery->window.ekey, pQuery->window.ekey, pQuery->window.skey);

        SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
        doExchangeTimeWindow(pQInfo);
      }

      pQuery->order.order = TSDB_ORDER_ASC;
    } else if (onlyLastQuery(pQuery)) {
      if (QUERY_IS_ASC_QUERY(pQuery)) {
        qDebug(msg, pQInfo, "only-last", pQuery->order.order, TSDB_ORDER_DESC, pQuery->window.skey,
               pQuery->window.ekey, pQuery->window.ekey, pQuery->window.skey);

        SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
        doExchangeTimeWindow(pQInfo);
      }

      pQuery->order.order = TSDB_ORDER_DESC;
    }

  } else {  // interval query
    if (stableQuery) {
      if (onlyFirstQuery(pQuery)) {
        if (!QUERY_IS_ASC_QUERY(pQuery)) {
          qDebug(msg, pQInfo, "only-first stable", pQuery->order.order, TSDB_ORDER_ASC,
                 pQuery->window.skey, pQuery->window.ekey, pQuery->window.ekey, pQuery->window.skey);

          SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
        }

        pQuery->order.order = TSDB_ORDER_ASC;
      } else if (onlyLastQuery(pQuery)) {
        if (QUERY_IS_ASC_QUERY(pQuery)) {
          qDebug(msg, pQInfo, "only-last stable", pQuery->order.order, TSDB_ORDER_DESC,
                 pQuery->window.skey, pQuery->window.ekey, pQuery->window.ekey, pQuery->window.skey);

          SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
        }

        pQuery->order.order = TSDB_ORDER_DESC;
      }
    }
  }
}

/*
 * Return the initial number of pages:
 *  - 128 for a group-by-normal-column query;
 *  - max(number of tables, 16) for an interval (time window) query;
 *  - 1 otherwise.
 */
static int32_t getInitialPageNum(SQInfo *pQInfo) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;
  int32_t INITIAL_RESULT_ROWS_VALUE = 16;

  int32_t numOfPages = 1;  // default; was pQInfo->pSidSet->numOfSubSet

  if (isGroupbyNormalCol(pQuery->pGroupbyExpr)) {
    numOfPages = 128;
  } else if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
    // time window query, allocate one page for each table
    size_t numOfTables = pQInfo->tableqinfoGroupInfo.numOfTables;
    numOfPages = MAX(numOfTables, INITIAL_RESULT_ROWS_VALUE);
  }

  assert(numOfPages > 0);
  return numOfPages;
}

1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944
/*
 * Compute the row size and page size of the intermediate result buffer.
 *
 * rowsize  out: the query row size multiplied by the multi-output row factor
 * ps       out: the page size; starts at DEFAULT_INTERN_BUF_PAGE_SIZE and is
 *               doubled until one page (minus the tFilePage header) holds at
 *               least two rows
 *
 * pRuntimeEnv->numOfRowsPerPage is updated from the final values.
 */
static void getIntermediateBufInfo(SQueryRuntimeEnv* pRuntimeEnv, int32_t* ps, int32_t* rowsize) {
  SQuery* pQuery = pRuntimeEnv->pQuery;

  int32_t headerSize = sizeof(tFilePage);

  *rowsize = pQuery->rowSize * GET_ROW_PARAM_FOR_MULTIOUTPUT(pQuery, pRuntimeEnv->topBotQuery, pRuntimeEnv->stableQuery);
  *ps = DEFAULT_INTERN_BUF_PAGE_SIZE;

  // one page contains at least two rows
  for (;;) {
    int32_t usable = (*ps) - headerSize;
    if (((*rowsize) * 2) <= usable) {
      break;
    }

    *ps = (*ps << 1u);
  }

  pRuntimeEnv->numOfRowsPerPage = ((*ps) - sizeof(tFilePage)) / (*rowsize);
}

H
Haojun Liao 已提交
1947
// True for every data type except BINARY and NCHAR.
#define IS_PREFILTER_TYPE(_t) ((_t) != TSDB_DATA_TYPE_BINARY && (_t) != TSDB_DATA_TYPE_NCHAR)
1948

H
Haojun Liao 已提交
1949 1950 1951 1952
/*
 * Use the pre-calculated block statistics to decide whether the data block has
 * to be loaded.
 *
 * Returns true when:
 *  - there are no statistics, or there is neither a column filter nor a
 *    top/bottom function;
 *  - a filtered column has no statistics entry, or is of binary/nchar type;
 *  - any filter of any filtered column accepts the [min, max] range of the block.
 * For a top/bottom query that passed the filters above, the result of
 * topbot_datablock_filter() for the first top/bottom expression is returned.
 * Otherwise false is returned.
 */
static bool needToLoadDataBlock(SQueryRuntimeEnv* pRuntimeEnv, SDataStatis *pDataStatis, SQLFunctionCtx *pCtx,
    int32_t numOfRows) {
  SQuery* pQuery = pRuntimeEnv->pQuery;

  if (pDataStatis == NULL) {
    return true;
  }

  if (pQuery->numOfFilterCols == 0 && (!pRuntimeEnv->topBotQuery)) {
    return true;
  }

  for (int32_t k = 0; k < pQuery->numOfFilterCols; ++k) {
    SSingleColumnFilterInfo *pColFilter = &pQuery->pFilterInfo[k];

    // locate the statistics entry of the filtered column
    SDataStatis *pColStatis = NULL;
    for (int32_t c = 0; c < pQuery->numOfCols; ++c) {
      if (pDataStatis[c].colId == pColFilter->info.colId) {
        pColStatis = &pDataStatis[c];
        break;
      }
    }

    // no statistics data
    if (pColStatis == NULL) {
      return true;
    }

    // not support pre-filter operation on binary/nchar data type
    if (!IS_PREFILTER_TYPE(pColFilter->info.type)) {
      return true;
    }

    // all points in current column are NULL, no need to check its boundary value
    if (pColStatis->numOfNull == numOfRows) {
      continue;
    }

    // float columns: min/max are read as double and narrowed to float before filtering
    float minval = 0;
    float maxval = 0;

    char *pMin = (char *)&pColStatis->min;
    char *pMax = (char *)&pColStatis->max;

    if (pColFilter->info.type == TSDB_DATA_TYPE_FLOAT) {
      minval = *(double *)(&pColStatis->min);
      maxval = *(double *)(&pColStatis->max);

      pMin = (char *)&minval;
      pMax = (char *)&maxval;
    }

    for (int32_t f = 0; f < pColFilter->numOfFilters; ++f) {
      if (pColFilter->pFilters[f].fp(&pColFilter->pFilters[f], pMin, pMax)) {
        return true;
      }
    }
  }

  if (pRuntimeEnv->topBotQuery) {
    for (int32_t idx = 0; idx < pQuery->numOfOutput; ++idx) {
      int32_t fid = pQuery->pSelectExpr[idx].base.functionId;
      if (fid == TSDB_FUNC_TOP || fid == TSDB_FUNC_BOTTOM) {
        return topbot_datablock_filter(&pCtx[idx], fid, (char *)&pDataStatis[idx].min, (char *)&pDataStatis[idx].max);
      }
    }
  }

  return false;
}

H
Haojun Liao 已提交
2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060
// True when point _p lies strictly inside window _w; both boundaries are excluded.
#define PT_IN_WINDOW(_p, _w)  ((_p) > (_w).skey && (_p) < (_w).ekey)

static bool overlapWithTimeWindow(SQuery* pQuery, SDataBlockInfo* pBlockInfo) {
  STimeWindow w = {0};

  TSKEY sk = MIN(pQuery->window.skey, pQuery->window.ekey);
  TSKEY ek = MAX(pQuery->window.skey, pQuery->window.ekey);


  if (QUERY_IS_ASC_QUERY(pQuery)) {
    getAlignQueryTimeWindow(pQuery, pBlockInfo->window.skey, sk, ek, &w);

    if (PT_IN_WINDOW(w.ekey, pBlockInfo->window)) {
      return true;
    }

    while(1) {
      GET_NEXT_TIMEWINDOW(pQuery, &w);
      if (w.skey > pBlockInfo->window.skey) {
        break;
      }

      if (PT_IN_WINDOW(w.skey, pBlockInfo->window) || PT_IN_WINDOW(w.ekey, pBlockInfo->window)) {
        return true;
      }
    }
  } else {
    getAlignQueryTimeWindow(pQuery, pBlockInfo->window.ekey, sk, ek, &w);
    if (PT_IN_WINDOW(w.skey, pBlockInfo->window)) {
      return true;
    }

    while(1) {
      GET_NEXT_TIMEWINDOW(pQuery, &w);
      if (w.ekey < pBlockInfo->window.skey) {
        break;
      }

      if (PT_IN_WINDOW(w.skey, pBlockInfo->window) || PT_IN_WINDOW(w.ekey, pBlockInfo->window)) {
        return true;
      }
    }
  }

  return false;
}

H
Haojun Liao 已提交
2061
/*
 * Load the statistics and/or the data of the current block, but only as far as
 * the query needs them.
 *
 * pRuntimeEnv   query runtime environment; its cost summary is updated
 * pQueryHandle  tsdb query handle positioned on the current block
 * pBlockInfo    description (time range, number of rows) of the current block
 * pStatis       out: block statistics, when they were retrieved
 * pDataBlock    out: block data, when it was loaded
 *
 * Returns BLK_DATA_DISCARD when the block is filtered out by its statistics,
 * TSDB_CODE_SUCCESS otherwise (also when nothing had to be loaded).
 */
int32_t loadDataBlockOnDemand(SQueryRuntimeEnv *pRuntimeEnv, void* pQueryHandle, SDataBlockInfo* pBlockInfo, SDataStatis **pStatis, SArray** pDataBlock) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  uint32_t status = 0;
  // a column filter or an attached ts buffer always requires the complete block
  if (pQuery->numOfFilterCols > 0 || pRuntimeEnv->pTSBuf != NULL) {
    status = BLK_DATA_ALL_NEEDED;
  } else { // check if this data block is required to load

    // Calculate all time windows that are overlapping or contain current data block.
    // If current data block is contained by all possible time window, do not load current data block.
    if (QUERY_IS_INTERVAL_QUERY(pQuery) && overlapWithTimeWindow(pQuery, pBlockInfo)) {
      status = BLK_DATA_ALL_NEEDED;
    }

    if (status != BLK_DATA_ALL_NEEDED) {
      // accumulate the requirement of every output function, stop once all data is needed
      for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
        SSqlFuncMsg* pSqlFunc = &pQuery->pSelectExpr[i].base;

        int32_t functionId = pSqlFunc->functionId;
        int32_t colId = pSqlFunc->colInfo.colId;

        status |= aAggs[functionId].dataReqFunc(&pRuntimeEnv->pCtx[i], pBlockInfo->window.skey, pBlockInfo->window.ekey, colId);
        if ((status & BLK_DATA_ALL_NEEDED) == BLK_DATA_ALL_NEEDED) {
          break;
        }
      }
    }
  }

  if (status == BLK_DATA_NO_NEEDED) {
    qDebug("QInfo:%p data block discard, brange:%"PRId64 "-%"PRId64", rows:%d", GET_QINFO_ADDR(pRuntimeEnv),
           pBlockInfo->window.skey, pBlockInfo->window.ekey, pBlockInfo->rows);
    pRuntimeEnv->summary.discardBlocks += 1;
  } else if (status == BLK_DATA_STATIS_NEEDED) {
    // NOTE(review): a failure to retrieve the statistics is deliberately not handled here
    if (tsdbRetrieveDataBlockStatisInfo(pQueryHandle, pStatis) != TSDB_CODE_SUCCESS) {
      //        return DISK_DATA_LOAD_FAILED;
    }

    pRuntimeEnv->summary.loadBlockStatis += 1;

    if (*pStatis == NULL) { // data block statistics does not exist, load data block
      *pDataBlock = tsdbRetrieveDataBlock(pQueryHandle, NULL);
      pRuntimeEnv->summary.totalCheckedRows += pBlockInfo->rows;
    }
  } else {
    assert(status == BLK_DATA_ALL_NEEDED);

    // load the data block statistics to perform further filter
    pRuntimeEnv->summary.loadBlockStatis += 1;
    // NOTE(review): the return code is ignored; *pStatis is what the check below relies on
    if (tsdbRetrieveDataBlockStatisInfo(pQueryHandle, pStatis) != TSDB_CODE_SUCCESS) {
    }

    if (!needToLoadDataBlock(pRuntimeEnv, *pStatis, pRuntimeEnv->pCtx, pBlockInfo->rows)) {
#if defined(_DEBUG_VIEW)
      qDebug("QInfo:%p block discarded by per-filter", GET_QINFO_ADDR(pRuntimeEnv));
#endif
      // current block has been discard due to filter applied
      pRuntimeEnv->summary.discardBlocks += 1;
      qDebug("QInfo:%p data block discard, brange:%"PRId64 "-%"PRId64", rows:%d", GET_QINFO_ADDR(pRuntimeEnv),
          pBlockInfo->window.skey, pBlockInfo->window.ekey, pBlockInfo->rows);
      return BLK_DATA_DISCARD;
    }

    pRuntimeEnv->summary.totalCheckedRows += pBlockInfo->rows;
    pRuntimeEnv->summary.loadBlocks += 1;
    *pDataBlock = tsdbRetrieveDataBlock(pQueryHandle, NULL);
  }

  return TSDB_CODE_SUCCESS;
}

H
hjxilinx 已提交
2132
/*
 * Binary search on an ascending list of timestamps.
 *
 * pValue  the timestamp list, accessed as an array of TSKEY
 * num     number of entries in the list
 * key     the timestamp to look for
 * order   TSDB_ORDER_ASC or TSDB_ORDER_DESC (asserted)
 *
 * For TSDB_ORDER_DESC the returned position is the last entry that is not
 * larger than key, for TSDB_ORDER_ASC the first entry that is not smaller than
 * key. -1 is returned when the list is empty or no such entry exists.
 */
int32_t binarySearchForKey(char *pValue, int num, TSKEY key, int order) {
  if (num <= 0) {
    return -1;
  }

  assert(order == TSDB_ORDER_ASC || order == TSDB_ORDER_DESC);

  TSKEY * keys = (TSKEY *)pValue;
  int32_t lo = 0;
  int32_t hi = num - 1;

  for (;;) {
    if (order == TSDB_ORDER_DESC) {
      // looking for the last position whose key is <= the search key
      if (key >= keys[hi]) return hi;
      if (key == keys[lo]) return lo;
      if (key < keys[lo]) return lo - 1;
    } else {
      // looking for the first position whose key is >= the search key
      if (key <= keys[lo]) return lo;
      if (key == keys[hi]) return hi;

      if (key > keys[hi]) {
        int32_t next = hi + 1;
        return (next >= num) ? -1 : next;
      }
    }

    int32_t span = hi - lo + 1;
    int32_t mid = (span >> 1) + lo;

    if (key < keys[mid]) {
      hi = mid - 1;
    } else if (key > keys[mid]) {
      lo = mid + 1;
    } else {
      return mid;  // exact match
    }
  }
}

2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216
/*
 * Grow every output column buffer of the query so that it can hold `capacity`
 * rows, and point the output buffer of each function context at the start of
 * its (possibly moved) column buffer.
 *
 * Nothing is done when capacity is smaller than the current capacity.
 *
 * On allocation failure the function does not return: it jumps to
 * pRuntimeEnv->env with TSDB_CODE_QRY_OUT_OF_MEMORY. The buffers that were
 * already enlarged stay valid and the recorded capacity is left unchanged.
 * NOTE(review): this assumes pRuntimeEnv->env has been armed by the caller
 * chain, as it is for the other longjmp() users in this file - confirm.
 */
static void ensureOutputBufferSimple(SQueryRuntimeEnv* pRuntimeEnv, int32_t capacity) {
  SQuery* pQuery = pRuntimeEnv->pQuery;

  if (capacity < pQuery->rec.capacity) {
    return;
  }

  for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    int32_t bytes = pQuery->pSelectExpr[i].bytes;
    assert(bytes > 0 && capacity > 0);

    // compute the size in size_t so that bytes * capacity cannot overflow int32_t
    char *tmp = realloc(pQuery->sdata[i], (size_t)bytes * capacity + sizeof(tFilePage));
    if (tmp == NULL) {
      // pQuery->sdata[i] is still valid here; abort the query instead of running on
      // with a buffer that is smaller than the capacity about to be recorded
      longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
    }

    pQuery->sdata[i] = (tFilePage *)tmp;

    // set the pCtx output buffer position
    pRuntimeEnv->pCtx[i].aOutputBuf = pQuery->sdata[i]->data;
  }

  qDebug("QInfo:%p realloc output buffer to inc output buffer from: %" PRId64 " rows to:%d rows", GET_QINFO_ADDR(pRuntimeEnv),
         pQuery->rec.capacity, capacity);

  pQuery->rec.capacity = capacity;
}

2223 2224 2225
/*
 * In case of prj/diff query, ensure the output buffer is sufficient to
 * accommodate the results of the current block.
 *
 * Only applies to queries that are neither interval, group-by-normal-column
 * nor fixed-output queries. When the free room of the output buffer is smaller
 * than the number of rows of the block, every column buffer is enlarged by the
 * missing amount, the newly exposed area is zeroed, and the output position of
 * every function context is moved behind the rows produced so far.
 *
 * On allocation failure the function does not return: it jumps to
 * pRuntimeEnv->env with TSDB_CODE_QRY_OUT_OF_MEMORY; the recorded capacity is
 * left unchanged and the existing buffers stay valid.
 */
static void ensureOutputBuffer(SQueryRuntimeEnv* pRuntimeEnv, SDataBlockInfo* pBlockInfo) {
  SQuery* pQuery = pRuntimeEnv->pQuery;

  if (QUERY_IS_INTERVAL_QUERY(pQuery) || pRuntimeEnv->groupbyNormalCol || isFixedOutputQuery(pRuntimeEnv)) {
    return;
  }

  SResultRec *pRec = &pQuery->rec;

  // enough room left for all rows of the block
  if (pRec->capacity - pRec->rows >= pBlockInfo->rows) {
    return;
  }

  int32_t remain = pRec->capacity - pRec->rows;
  int32_t newSize = pRec->capacity + (pBlockInfo->rows - remain);

  for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    int32_t bytes = pQuery->pSelectExpr[i].bytes;
    assert(bytes > 0 && newSize > 0);

    // compute the size in size_t so that bytes * newSize cannot overflow int32_t
    char *tmp = realloc(pQuery->sdata[i], (size_t)bytes * newSize + sizeof(tFilePage));
    if (tmp == NULL) {
      // pQuery->sdata[i] is still valid here; abort the query instead of writing
      // the results of the block behind the end of the old buffer
      longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
    }

    // clear everything behind the rows that have been produced so far
    memset(tmp + sizeof(tFilePage) + bytes * pRec->rows, 0, (newSize - pRec->rows) * bytes);
    pQuery->sdata[i] = (tFilePage *)tmp;

    // set the pCtx output buffer position
    pRuntimeEnv->pCtx[i].aOutputBuf = pQuery->sdata[i]->data + pRec->rows * bytes;

    // top/bottom/diff write their timestamps into the output buffer of the first column
    int32_t functionId = pQuery->pSelectExpr[i].base.functionId;
    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_DIFF) {
      pRuntimeEnv->pCtx[i].ptsOutputBuf = pRuntimeEnv->pCtx[0].aOutputBuf;
    }
  }

  qDebug("QInfo:%p realloc output buffer, new size: %d rows, old:%" PRId64 ", remain:%" PRId64, GET_QINFO_ADDR(pRuntimeEnv),
         newSize, pRec->capacity, newSize - pRec->rows);

  pRec->capacity = newSize;
}

2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282
/*
 * Initialize the start time and the previous start key of the window result
 * info from the first data block of an interval query.
 *
 * Only takes effect for interval queries whose prevSKey is still
 * TSKEY_INITIAL_VAL.
 */
static void doSetInitialTimewindow(SQueryRuntimeEnv* pRuntimeEnv, SDataBlockInfo* pBlockInfo) {
  SQuery*         pQuery = pRuntimeEnv->pQuery;
  SWindowResInfo *pWinInfo = &pRuntimeEnv->windowResInfo;

  if (!QUERY_IS_INTERVAL_QUERY(pQuery) || pWinInfo->prevSKey != TSKEY_INITIAL_VAL) {
    return;
  }

  STimeWindow w = TSWINDOW_INITIALIZER;

  if (!QUERY_IS_ASC_QUERY(pQuery)) {
    // the start position of the first time window in the endpoint that spreads beyond the queried last timestamp
    getAlignQueryTimeWindow(pQuery, pBlockInfo->window.ekey, pQuery->window.ekey, pBlockInfo->window.ekey, &w);

    pWinInfo->startTime = pQuery->window.skey;
    pWinInfo->prevSKey = w.skey;
    return;
  }

  // ascending scan: align on the first key of the block
  getAlignQueryTimeWindow(pQuery, pBlockInfo->window.skey, pBlockInfo->window.skey, pQuery->window.ekey, &w);

  pWinInfo->startTime = w.skey;
  pWinInfo->prevSKey = w.skey;
}

2283 2284
/*
 * Iterate over all data blocks delivered by the query handle of the current
 * scan (master or secondary) and apply the query functions on each block.
 *
 * The iteration stops when no block is left, or when the query status reports
 * a full result buffer or completion. An error of the block iterator as well
 * as a killed query abort through longjmp() on pRuntimeEnv->env.
 *
 * Always returns 0.
 */
static int64_t doScanAllDataBlocks(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery          *pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo *pCurrent = pQuery->current;
  SQueryCostInfo  *pCost = &pRuntimeEnv->summary;

  qDebug("QInfo:%p query start, qrange:%" PRId64 "-%" PRId64 ", lastkey:%" PRId64 ", order:%d",
         GET_QINFO_ADDR(pRuntimeEnv), pCurrent->win.skey, pCurrent->win.ekey, pCurrent->lastKey,
         pQuery->order.order);

  TsdbQueryHandleT handle = IS_MASTER_SCAN(pRuntimeEnv)? pRuntimeEnv->pQueryHandle : pRuntimeEnv->pSecQueryHandle;
  int32_t          step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  SDataBlockInfo blockInfo = SDATA_BLOCK_INITIALIZER;

  for (;;) {
    if (!tsdbNextDataBlock(handle)) {
      // either the end of the data or an error reported through terrno
      if (terrno != TSDB_CODE_SUCCESS) {
        longjmp(pRuntimeEnv->env, terrno);
      }

      break;
    }

    pCost->totalBlocks += 1;

    if (IS_QUERY_KILLED(GET_QINFO_ADDR(pRuntimeEnv))) {
      longjmp(pRuntimeEnv->env, TSDB_CODE_TSC_QUERY_CANCELLED);
    }

    tsdbRetrieveDataBlockInfo(handle, &blockInfo);
    doSetInitialTimewindow(pRuntimeEnv, &blockInfo);

    // in case of prj/diff query, ensure the output buffer is sufficient to accommodate the results of current block
    ensureOutputBuffer(pRuntimeEnv, &blockInfo);

    SDataStatis *pStatis = NULL;
    SArray      *pDataBlock = NULL;

    int32_t loadStatus = loadDataBlockOnDemand(pRuntimeEnv, handle, &blockInfo, &pStatis, &pDataBlock);
    if (loadStatus == BLK_DATA_DISCARD) {
      // the block is skipped: move the last key past it in scan direction
      if (QUERY_IS_ASC_QUERY(pQuery)) {
        pQuery->current->lastKey = blockInfo.window.ekey + step;
      } else {
        pQuery->current->lastKey = blockInfo.window.skey + step;
      }

      continue;
    }

    // query start position can not move into tableApplyFunctionsOnBlock due to limit/offset condition
    if (QUERY_IS_ASC_QUERY(pQuery)) {
      pQuery->pos = 0;
    } else {
      pQuery->pos = blockInfo.rows - 1;
    }

    int32_t numOfRes = tableApplyFunctionsOnBlock(pRuntimeEnv, &blockInfo, pStatis, binarySearchForKey, pDataBlock);

    pCost->totalRows += blockInfo.rows;
    qDebug("QInfo:%p check data block, brange:%" PRId64 "-%" PRId64 ", numOfRows:%d, numOfRes:%d, lastKey:%"PRId64, GET_QINFO_ADDR(pRuntimeEnv),
           blockInfo.window.skey, blockInfo.window.ekey, blockInfo.rows, numOfRes, pQuery->current->lastKey);

    // while the output buffer is full or limit/offset is applied, query may be paused here
    if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL | QUERY_COMPLETED)) {
      break;
    }
  }

  // if the result buffer is not full, set the query complete
  if (!Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL)) {
    setQueryStatus(pQuery, QUERY_COMPLETED);
  }

  bool intervalMasterScan = QUERY_IS_INTERVAL_QUERY(pQuery) && IS_MASTER_SCAN(pRuntimeEnv);
  if (intervalMasterScan) {
    if (!Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
      assert(Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL));
    } else {
      closeAllTimeWindow(&pRuntimeEnv->windowResInfo);
      pRuntimeEnv->windowResInfo.curIndex = pRuntimeEnv->windowResInfo.size - 1;  // point to the last time window
    }
  }

  return 0;
}

/*
 * set tag value in SQLFunctionCtx
 * e.g.,tag information into input buffer
 */
2357
/*
 * Load one tag value of a table into a tVariant.
 *
 * tsdb      not referenced by this routine
 * pTable    table handle handed to tsdbGetTableName()/tsdbGetTableTagVal()
 * tagColId  tag column id; TSDB_TBNAME_COLUMN_INDEX selects the table name
 * tag       output variant; it is destroyed first and then re-populated
 * type      data type of the tag column
 * bytes     width of the tag column, used for non var-length types
 */
static void doSetTagValueInParam(void *tsdb, void* pTable, int32_t tagColId, tVariant *tag, int16_t type, int16_t bytes) {
  tVariantDestroy(tag);

  // the table name pseudo column is always handled as a var-length binary value
  if (tagColId == TSDB_TBNAME_COLUMN_INDEX) {
    char *val = tsdbGetTableName(pTable);
    assert(val != NULL);

    tVariantCreateFromBinary(tag, varDataVal(val), varDataLen(val), TSDB_DATA_TYPE_BINARY);
    return;
  }

  char *pTagVal = tsdbGetTableTagVal(pTable, tagColId, type, bytes);

  // a missing value and a stored NULL marker both yield a NULL variant
  if (pTagVal == NULL || isNull(pTagVal, type)) {
    tag->nType = TSDB_DATA_TYPE_NULL;
    return;
  }

  switch (type) {
    case TSDB_DATA_TYPE_BINARY:
    case TSDB_DATA_TYPE_NCHAR:
      // var-length data: payload and length are taken from the varData header
      tVariantCreateFromBinary(tag, varDataVal(pTagVal), varDataLen(pTagVal), type);
      break;
    default:
      tVariantCreateFromBinary(tag, pTagVal, bytes, type);
      break;
  }
}

2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401
/*
 * Linear search of the tag column list by column id.
 *
 * Returns the address of the first entry whose colId equals colId, or NULL
 * when no entry matches.
 */
static SColumnInfo* doGetTagColumnInfoById(SColumnInfo* pTagColList, int32_t numOfTags, int16_t colId) {
  assert(pTagColList != NULL && numOfTags > 0);

  int32_t idx = 0;
  while (idx < numOfTags) {
    SColumnInfo *pCol = &pTagColList[idx];
    if (pCol->colId == colId) {
      return pCol;
    }

    idx += 1;
  }

  return NULL;
}

2402
/*
 * Fill the tag variants of the function contexts (pRuntimeEnv->pCtx[]) with
 * the tag values of pTable.
 *
 * - If the query has a single output column computed by TSDB_FUNC_TS_COMP,
 *   only the tag named by that expression's first argument is loaded into
 *   pCtx[0].tag.
 * - Otherwise every output expression flagged as a tag column gets its tag
 *   loaded, and, when a ts buffer (pTSBuf) exists and the first expression is
 *   a TS/PRJ function on the primary timestamp column, the join tag named by
 *   its first argument is loaded into pCtx[0].tag as well.
 *
 * The tag column referenced by the expression argument must exist in
 * pQuery->tagColList; doGetTagColumnInfoById() returns NULL otherwise, which
 * is asserted here instead of being dereferenced blindly.
 */
void setTagVal(SQueryRuntimeEnv *pRuntimeEnv, void *pTable, void *tsdb) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  SQInfo *pQInfo = GET_QINFO_ADDR(pRuntimeEnv);

  SExprInfo *pExprInfo = &pQuery->pSelectExpr[0];
  if (pQuery->numOfOutput == 1 && pExprInfo->base.functionId == TSDB_FUNC_TS_COMP) {
    assert(pExprInfo->base.numOfParams == 1);

    // the argument carries the tag column id as a 64-bit value; narrow it explicitly
    int16_t      tagColId = (int16_t)pExprInfo->base.arg->argValue.i64;
    SColumnInfo *pColInfo = doGetTagColumnInfoById(pQuery->tagColList, pQuery->numOfTags, tagColId);
    assert(pColInfo != NULL);

    doSetTagValueInParam(tsdb, pTable, tagColId, &pRuntimeEnv->pCtx[0].tag, pColInfo->type, pColInfo->bytes);
  } else {
    // set tag value, by which the results are aggregated.
    for (int32_t idx = 0; idx < pQuery->numOfOutput; ++idx) {
      SExprInfo *pLocalExprInfo = &pQuery->pSelectExpr[idx];

      // only expressions that refer to a tag column need a tag value
      if (!TSDB_COL_IS_TAG(pLocalExprInfo->base.colInfo.flag)) {
        continue;
      }

      // todo use tag column index to optimize performance
      doSetTagValueInParam(tsdb, pTable, pLocalExprInfo->base.colInfo.colId, &pRuntimeEnv->pCtx[idx].tag,
                           pLocalExprInfo->type, pLocalExprInfo->bytes);
    }

    // set the join tag for first column
    SSqlFuncMsg *pFuncMsg = &pExprInfo->base;
    if ((pFuncMsg->functionId == TSDB_FUNC_TS || pFuncMsg->functionId == TSDB_FUNC_PRJ) && pRuntimeEnv->pTSBuf != NULL &&
        pFuncMsg->colInfo.colIndex == PRIMARYKEY_TIMESTAMP_COL_INDEX) {
      assert(pFuncMsg->numOfParams == 1);

      int16_t      tagColId = (int16_t)pExprInfo->base.arg->argValue.i64;
      SColumnInfo *pColInfo = doGetTagColumnInfoById(pQuery->tagColList, pQuery->numOfTags, tagColId);
      assert(pColInfo != NULL);

      doSetTagValueInParam(tsdb, pTable, tagColId, &pRuntimeEnv->pCtx[0].tag, pColInfo->type, pColInfo->bytes);
      qDebug("QInfo:%p set tag value for join comparison, colId:%" PRId64 ", val:%"PRId64, pQInfo, pExprInfo->base.arg->argValue.i64,
          pRuntimeEnv->pCtx[0].tag.i64Key);
    }
  }
}

/*
 * Feed one window result into the per-column function contexts and run the
 * distributed merge function of every output column.
 *
 * timestamp   stored as nStartQueryTimestamp in every context
 * pWindowRes  window result whose page data becomes the input buffer
 * mergeFlag   true: merge into the current output row;
 *             false: advance the output buffer by one row, reset the result
 *             info and re-initialise the function before merging
 */
static void doMerge(SQueryRuntimeEnv *pRuntimeEnv, int64_t timestamp, SWindowResult *pWindowRes, bool mergeFlag) {
  SQuery *        pQuery = pRuntimeEnv->pQuery;
  SQLFunctionCtx *pCtx = pRuntimeEnv->pCtx;

  // pass 1: prepare the input/output buffers of every column
  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    SQLFunctionCtx *pFuncCtx = &pCtx[col];
    int32_t         funcId = pQuery->pSelectExpr[col].base.functionId;

    if (!mergeFlag) {
      // start a new output row
      pFuncCtx->aOutputBuf = pFuncCtx->aOutputBuf + pFuncCtx->outputBytes;
      pFuncCtx->currentStage = FIRST_STAGE_MERGE;

      RESET_RESULT_INFO(pFuncCtx->resultInfo);
      aAggs[funcId].init(pFuncCtx);
    }

    pFuncCtx->hasNull = true;
    pFuncCtx->nStartQueryTimestamp = timestamp;
    pFuncCtx->aInputElemBuf = getPosInResultPage(pRuntimeEnv, col, pWindowRes);

    // in case of tag column, the tag information should be extracted from input buffer
    if (funcId != TSDB_FUNC_TAG_DUMMY && funcId != TSDB_FUNC_TAG) {
      continue;
    }

    tVariantDestroy(&pFuncCtx->tag);

    int32_t outType = pFuncCtx->outputType;
    bool    isVarData = (outType == TSDB_DATA_TYPE_BINARY || outType == TSDB_DATA_TYPE_NCHAR);
    if (isVarData) {
      tVariantCreateFromBinary(&pFuncCtx->tag, varDataVal(pFuncCtx->aInputElemBuf), varDataLen(pFuncCtx->aInputElemBuf),
                               outType);
    } else {
      tVariantCreateFromBinary(&pFuncCtx->tag, pFuncCtx->aInputElemBuf, pFuncCtx->inputBytes, pFuncCtx->inputType);
    }
  }

  // pass 2: run the merge function of every column except the dummy tag
  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    int32_t funcId = pQuery->pSelectExpr[col].base.functionId;
    if (funcId != TSDB_FUNC_TAG_DUMMY) {
      aAggs[funcId].distMergeFunc(&pCtx[col]);
    }
  }
}

2487
static UNUSED_FUNC void printBinaryData(int32_t functionId, char *data, int32_t srcDataType) {
2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555
  if (functionId == TSDB_FUNC_FIRST_DST || functionId == TSDB_FUNC_LAST_DST) {
    switch (srcDataType) {
      case TSDB_DATA_TYPE_BINARY:
        printf("%" PRId64 ",%s\t", *(TSKEY *)data, (data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_BOOL:
        printf("%" PRId64 ",%d\t", *(TSKEY *)data, *(int8_t *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_SMALLINT:
        printf("%" PRId64 ",%d\t", *(TSKEY *)data, *(int16_t *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_BIGINT:
      case TSDB_DATA_TYPE_TIMESTAMP:
        printf("%" PRId64 ",%" PRId64 "\t", *(TSKEY *)data, *(TSKEY *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_INT:
        printf("%" PRId64 ",%d\t", *(TSKEY *)data, *(int32_t *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_FLOAT:
        printf("%" PRId64 ",%f\t", *(TSKEY *)data, *(float *)(data + TSDB_KEYSIZE + 1));
        break;
      case TSDB_DATA_TYPE_DOUBLE:
        printf("%" PRId64 ",%lf\t", *(TSKEY *)data, *(double *)(data + TSDB_KEYSIZE + 1));
        break;
    }
  } else if (functionId == TSDB_FUNC_AVG) {
    printf("%lf,%d\t", *(double *)data, *(int32_t *)(data + sizeof(double)));
  } else if (functionId == TSDB_FUNC_SPREAD) {
    printf("%lf,%lf\t", *(double *)data, *(double *)(data + sizeof(double)));
  } else if (functionId == TSDB_FUNC_TWA) {
    data += 1;
    printf("%lf,%" PRId64 ",%" PRId64 ",%" PRId64 "\t", *(double *)data, *(int64_t *)(data + 8),
           *(int64_t *)(data + 16), *(int64_t *)(data + 24));
  } else if (functionId == TSDB_FUNC_MIN || functionId == TSDB_FUNC_MAX) {
    switch (srcDataType) {
      case TSDB_DATA_TYPE_TINYINT:
      case TSDB_DATA_TYPE_BOOL:
        printf("%d\t", *(int8_t *)data);
        break;
      case TSDB_DATA_TYPE_SMALLINT:
        printf("%d\t", *(int16_t *)data);
        break;
      case TSDB_DATA_TYPE_BIGINT:
      case TSDB_DATA_TYPE_TIMESTAMP:
        printf("%" PRId64 "\t", *(int64_t *)data);
        break;
      case TSDB_DATA_TYPE_INT:
        printf("%d\t", *(int *)data);
        break;
      case TSDB_DATA_TYPE_FLOAT:
        printf("%f\t", *(float *)data);
        break;
      case TSDB_DATA_TYPE_DOUBLE:
        printf("%f\t", *(float *)data);
        break;
    }
  } else if (functionId == TSDB_FUNC_SUM) {
    if (srcDataType == TSDB_DATA_TYPE_FLOAT || srcDataType == TSDB_DATA_TYPE_DOUBLE) {
      printf("%lf\t", *(float *)data);
    } else {
      printf("%" PRId64 "\t", *(int64_t *)data);
    }
  } else {
    printf("%s\t", data);
  }
}

2556
/*
 * Debug helper: dump numOfRows rows of the intermediate result to stdout,
 * one line per row and one tab-terminated field per output column.
 *
 * pdata holds one page per output column; the value of row r in column c is
 * read at pdata[c]->data + bytes(c) * r. Columns whose type has no case below
 * print nothing.
 */
void UNUSED_FUNC displayInterResult(tFilePage **pdata, SQueryRuntimeEnv* pRuntimeEnv, int32_t numOfRows) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  int32_t numOfCols = pQuery->numOfOutput;
  printf("super table query intermediate result, total:%d\n", numOfRows);

  for (int32_t row = 0; row < numOfRows; ++row) {
    for (int32_t col = 0; col < numOfCols; ++col) {
      SExprInfo *pExpr = &pQuery->pSelectExpr[col];

      switch (pExpr->type) {
        case TSDB_DATA_TYPE_BINARY: {
          int32_t srcType = pExpr->type;
          printBinaryData(pExpr->base.functionId, pdata[col]->data + pExpr->bytes * row, srcType);
          break;
        }
        case TSDB_DATA_TYPE_TIMESTAMP:
        case TSDB_DATA_TYPE_BIGINT:
          printf("%" PRId64 "\t", *(int64_t *)(pdata[col]->data + pExpr->bytes * row));
          break;
        case TSDB_DATA_TYPE_INT:
          printf("%d\t", *(int32_t *)(pdata[col]->data + pExpr->bytes * row));
          break;
        case TSDB_DATA_TYPE_FLOAT:
          printf("%f\t", *(float *)(pdata[col]->data + pExpr->bytes * row));
          break;
        case TSDB_DATA_TYPE_DOUBLE:
          printf("%lf\t", *(double *)(pdata[col]->data + pExpr->bytes * row));
          break;
      }
    }

    printf("\n");
  }
}

typedef struct SCompSupporter {
H
hjxilinx 已提交
2591 2592 2593
  STableQueryInfo **pTableQueryInfo;
  int32_t *         position;
  SQInfo *          pQInfo;
2594 2595 2596 2597 2598
} SCompSupporter;

int32_t tableResultComparFn(const void *pLeft, const void *pRight, void *param) {
  int32_t left = *(int32_t *)pLeft;
  int32_t right = *(int32_t *)pRight;
2599

2600 2601
  SCompSupporter *  supporter = (SCompSupporter *)param;
  SQueryRuntimeEnv *pRuntimeEnv = &supporter->pQInfo->runtimeEnv;
2602

2603 2604
  int32_t leftPos = supporter->position[left];
  int32_t rightPos = supporter->position[right];
2605

2606 2607 2608 2609
  /* left source is exhausted */
  if (leftPos == -1) {
    return 1;
  }
2610

2611 2612 2613 2614
  /* right source is exhausted*/
  if (rightPos == -1) {
    return -1;
  }
2615

H
hjxilinx 已提交
2616
  SWindowResInfo *pWindowResInfo1 = &supporter->pTableQueryInfo[left]->windowResInfo;
2617
  SWindowResult * pWindowRes1 = getWindowResult(pWindowResInfo1, leftPos);
2618

2619 2620
  char *b1 = getPosInResultPage(pRuntimeEnv, PRIMARYKEY_TIMESTAMP_COL_INDEX, pWindowRes1);
  TSKEY leftTimestamp = GET_INT64_VAL(b1);
2621

H
hjxilinx 已提交
2622
  SWindowResInfo *pWindowResInfo2 = &supporter->pTableQueryInfo[right]->windowResInfo;
2623
  SWindowResult * pWindowRes2 = getWindowResult(pWindowResInfo2, rightPos);
2624

2625 2626
  char *b2 = getPosInResultPage(pRuntimeEnv, PRIMARYKEY_TIMESTAMP_COL_INDEX, pWindowRes2);
  TSKEY rightTimestamp = GET_INT64_VAL(b2);
2627

2628 2629 2630
  if (leftTimestamp == rightTimestamp) {
    return 0;
  }
2631

2632 2633 2634
  return leftTimestamp > rightTimestamp ? 1 : -1;
}

2635
/*
 * Merge the per-table results of the table groups, starting at
 * pQInfo->groupIndex, until one group yields at least one result page or all
 * groups have been visited. pQInfo->groupIndex is advanced past every group
 * that was processed.
 *
 * Returns -1 when merging a group fails, TSDB_CODE_SUCCESS otherwise.
 */
int32_t mergeIntoGroupResult(SQInfo *pQInfo) {
  int64_t startTs = taosGetTimestampMs();
  int32_t numOfGroups = GET_NUM_OF_TABLEGROUP(pQInfo);

  for (;;) {
    if (pQInfo->groupIndex >= numOfGroups) {
      break;
    }

    SArray *pGroup = GET_TABLEGROUP(pQInfo, pQInfo->groupIndex);

    int32_t ret = mergeIntoGroupResultImpl(pQInfo, pGroup);
    if (ret < 0) {  // not enough disk space to save the data into disk
      return -1;
    }

    pQInfo->groupIndex += 1;

    // this group generates at least one result, return results
    if (ret > 0) {
      break;
    }

    assert(pQInfo->numOfGroupResultPages == 0);
    qDebug("QInfo:%p no result in group %d, continue", pQInfo, pQInfo->groupIndex - 1);
  }

  qDebug("QInfo:%p merge res data into group, index:%d, total group:%d, elapsed time:%" PRId64 "ms", pQInfo,
         pQInfo->groupIndex - 1, numOfGroups, taosGetTimestampMs() - startTs);

  return TSDB_CODE_SUCCESS;
}

/*
 * Copy the next batch of merged group results from the disk-based result
 * buffer into the query output buffers (pQuery->sdata) and add the number of
 * copied rows to pQuery->rec.rows.
 *
 * When every batch of the current group has been consumed, the next group is
 * merged first. If that fails, or no group is left, nothing is copied; in the
 * latter case pQInfo->tableIndex is set to the total number of tables to flag
 * completion.
 */
void copyResToQueryResultBuf(SQInfo *pQInfo, SQuery *pQuery) {
  bool groupConsumed = (pQInfo->offset == pQInfo->numOfGroupResultPages);
  if (groupConsumed) {
    pQInfo->numOfGroupResultPages = 0;

    // current results of group has been sent to client, try next group
    if (mergeIntoGroupResult(pQInfo) != TSDB_CODE_SUCCESS) {
      return;  // failed to save data in the disk
    }

    // check if all results has been sent to client
    int32_t numOfGroup = GET_NUM_OF_TABLEGROUP(pQInfo);
    if (pQInfo->numOfGroupResultPages == 0 && pQInfo->groupIndex == numOfGroup) {
      pQInfo->tableIndex = pQInfo->tableqinfoGroupInfo.numOfTables;  // set query completed
      return;
    }
  }

  SQueryRuntimeEnv *   pRuntimeEnv = &pQInfo->runtimeEnv;
  SDiskbasedResultBuf *pResultBuf = pRuntimeEnv->pResultBuf;

  int32_t groupId = getGroupResultId(pQInfo->groupIndex - 1);
  SIDList pageList = getDataBufPagesIdList(pResultBuf, pQInfo->offset + groupId);
  int32_t numOfPages = taosArrayGetSize(pageList);

  // first pass: count the rows held by all pages of this batch
  int32_t totalRows = 0;
  for (int32_t p = 0; p < numOfPages; ++p) {
    int32_t *  pageId = taosArrayGet(pageList, p);
    tFilePage *pPage = getResBufPage(pResultBuf, *pageId);
    totalRows += pPage->num;
  }

  // second pass: append every page, column by column, to the output buffers
  int32_t rowOffset = 0;
  for (int32_t p = 0; p < numOfPages; ++p) {
    int32_t *  pageId = taosArrayGet(pageList, p);
    tFilePage *pPage = getResBufPage(pResultBuf, *pageId);

    for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
      int32_t colBytes = pRuntimeEnv->pCtx[col].outputBytes;
      char *  pDst = pQuery->sdata[col]->data;
      char *  pSrc = pPage->data + pRuntimeEnv->offset[col] * pPage->num;

      memcpy(pDst + rowOffset * colBytes, pSrc, colBytes * pPage->num);
    }

    rowOffset += pPage->num;
  }

  assert(pQuery->rec.rows == 0);

  pQuery->rec.rows += totalRows;
  pQInfo->offset += 1;
}

H
Haojun Liao 已提交
2720
/*
 * Return the number of results held by a window result: the numOfRes of the
 * first output column that is not a TS/TAG/TAGPRJ function and has produced
 * at least one result, or 0 if there is no such column.
 */
int64_t getNumOfResultWindowRes(SQuery *pQuery, SWindowResult *pWindowRes) {
  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    int32_t funcId = pQuery->pSelectExpr[col].base.functionId;

    /*
     * ts, tag, tagprj function can not decide the output number of current query
     * the number of output result is decided by main output
     */
    bool isAuxiliary = (funcId == TSDB_FUNC_TS || funcId == TSDB_FUNC_TAG || funcId == TSDB_FUNC_TAGPRJ);
    if (!isAuxiliary) {
      SResultInfo *pResultInfo = &pWindowRes->resultInfo[col];
      assert(pResultInfo != NULL);

      if (pResultInfo->numOfRes > 0) {
        return pResultInfo->numOfRes;
      }
    }
  }

  return 0;
}

2743
/*
 * Merge the window results of all tables of one group, ordered by window
 * start timestamp, into the query output buffers and flush them page-wise
 * into the disk-based result buffer.
 *
 * Tables are merged with a loser tree; window results that share a timestamp
 * are combined by doMerge(), and windows without any result are skipped.
 *
 * Returns:
 *   -1  flushing the merged rows failed
 *    0  no table of the group holds any result
 *   otherwise pQInfo->numOfGroupResultPages after the merge
 *
 * Does not return (longjmp to pRuntimeEnv->env with
 * TSDB_CODE_QRY_OUT_OF_MEMORY) when a working buffer cannot be allocated.
 * Every working buffer is released on all exit paths.
 */
int32_t mergeIntoGroupResultImpl(SQInfo *pQInfo, SArray *pGroup) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  size_t      size = taosArrayGetSize(pGroup);
  tFilePage **buffer = pQuery->sdata;

  int32_t *         posList = calloc(size, sizeof(int32_t));
  STableQueryInfo **pTableList = malloc(POINTER_BYTES * size);

  if (pTableList == NULL || posList == NULL) {
    tfree(posList);
    tfree(pTableList);

    qError("QInfo:%p failed alloc memory", pQInfo);
    longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
  }

  // collect the tables that actually own result pages and window results
  // todo opt for the case of one table per group
  int32_t numOfTables = 0;
  for (size_t i = 0; i < size; ++i) {
    STableQueryInfo *item = taosArrayGetP(pGroup, i);

    SIDList list = getDataBufPagesIdList(pRuntimeEnv->pResultBuf, TSDB_TABLEID(item->pTable)->tid);
    if (taosArrayGetSize(list) > 0 && item->windowResInfo.size > 0) {
      pTableList[numOfTables] = item;
      numOfTables += 1;
    }
  }

  if (numOfTables == 0) {
    tfree(posList);
    tfree(pTableList);

    assert(pQInfo->numOfGroupResultPages == 0);
    return 0;
  }

  SCompSupporter cs = {pTableList, posList, pQInfo};

  SLoserTreeInfo *pTree = NULL;
  tLoserTreeCreate(&pTree, numOfTables, &cs, tableResultComparFn);

  SResultInfo *pResultInfo = calloc(pQuery->numOfOutput, sizeof(SResultInfo));
  char *       buf = calloc(1, pRuntimeEnv->interBufSize);

  // a NULL buf is only an error when a non-empty buffer was requested
  bool bufFailed = (buf == NULL && pRuntimeEnv->interBufSize > 0);
  if (pTree == NULL || pResultInfo == NULL || bufFailed) {
    tfree(pTree);
    tfree(pTableList);
    tfree(posList);
    tfree(pResultInfo);
    tfree(buf);

    qError("QInfo:%p failed alloc memory", pQInfo);
    longjmp(pRuntimeEnv->env, TSDB_CODE_QRY_OUT_OF_MEMORY);
  }

  setWindowResultInfo(pResultInfo, pQuery, pRuntimeEnv->stableQuery, buf);
  resetMergeResultBuf(pQuery, pRuntimeEnv->pCtx, pResultInfo);

  int64_t lastTimestamp = -1;
  int64_t startt = taosGetTimestampMs();
  bool    flushOk = true;

  while (1) {
    int32_t pos = pTree->pNode[0].index;

    SWindowResInfo *pWindowResInfo = &pTableList[pos]->windowResInfo;
    SWindowResult * pWindowRes = getWindowResult(pWindowResInfo, cs.position[pos]);

    char *b = getPosInResultPage(pRuntimeEnv, PRIMARYKEY_TIMESTAMP_COL_INDEX, pWindowRes);
    TSKEY ts = GET_INT64_VAL(b);

    assert(ts == pWindowRes->window.skey);
    int64_t num = getNumOfResultWindowRes(pQuery, pWindowRes);
    if (num > 0) {
      if (ts == lastTimestamp) {  // merge with the last one
        doMerge(pRuntimeEnv, ts, pWindowRes, true);
      } else {  // copy data to disk buffer
        if (buffer[0]->num == pQuery->rec.capacity) {
          if (flushFromResultBuf(pQInfo) != TSDB_CODE_SUCCESS) {
            flushOk = false;
            break;
          }

          resetMergeResultBuf(pQuery, pRuntimeEnv->pCtx, pResultInfo);
        }

        doMerge(pRuntimeEnv, ts, pWindowRes, false);
        buffer[0]->num += 1;
      }

      lastTimestamp = ts;
    }

    // advance the cursor of this source, whether or not the window held results
    cs.position[pos] += 1;
    if (cs.position[pos] >= pWindowResInfo->size) {
      cs.position[pos] = -1;

      // all input sources are exhausted
      if (--numOfTables == 0) {
        break;
      }
    }

    tLoserTreeAdjust(pTree, pos + pTree->numOfEntries);
  }

  // flush the rows that are still held in the output buffers
  if (flushOk && buffer[0]->num != 0) {
    flushOk = (flushFromResultBuf(pQInfo) == TSDB_CODE_SUCCESS);
  }

  int32_t ret = -1;
  if (flushOk) {
    int64_t endt = taosGetTimestampMs();

#ifdef _DEBUG_VIEW
    displayInterResult(pQuery->sdata, pRuntimeEnv, pQuery->sdata[0]->num);
#endif

    qDebug("QInfo:%p result merge completed for group:%d, elapsed time:%" PRId64 " ms", pQInfo, pQInfo->groupIndex, endt - startt);

    pQInfo->offset = 0;
    ret = pQInfo->numOfGroupResultPages;
  } else {
    qError("QInfo:%p failed to flush data into temp file, abort query", pQInfo);
  }

  tfree(pTree);
  tfree(pTableList);
  tfree(posList);
  tfree(pResultInfo);
  tfree(buf);

  return ret;
}

/*
 * Move the rows currently held in the query output buffers (pQuery->sdata)
 * into new pages of the disk-based result buffer, at most numOfRowsPerPage
 * rows per page, and increase pQInfo->numOfGroupResultPages by one.
 *
 * All pages written by one call are registered under the same id:
 * getGroupResultId(groupIndex) + numOfGroupResultPages.
 *
 * Always returns TSDB_CODE_SUCCESS.
 */
int32_t flushFromResultBuf(SQInfo *pQInfo) {
  SQueryRuntimeEnv *   pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *             pQuery = pRuntimeEnv->pQuery;
  SDiskbasedResultBuf *pResultBuf = pRuntimeEnv->pResultBuf;

  // the base value for group result, since the maximum number of table for each vnode will not exceed 100,000.
  int32_t pageId = -1;
  int32_t rowsPerPage = pResultBuf->numOfRowsPerPage;
  int32_t totalRows = pQuery->sdata[0]->num;

  for (int32_t done = 0; done < totalRows;) {
    int32_t left = totalRows - done;
    int32_t chunk = (left > rowsPerPage) ? rowsPerPage : left;

    int32_t    id = getGroupResultId(pQInfo->groupIndex) + pQInfo->numOfGroupResultPages;
    tFilePage *pPage = getNewDataBuf(pResultBuf, id, &pageId);

    // pagewise copy to dest buffer
    for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
      int32_t colBytes = pRuntimeEnv->pCtx[col].outputBytes;
      pPage->num = chunk;

      char *pSrc = ((char *)pQuery->sdata[col]->data) + done * colBytes;
      memcpy(pPage->data + pRuntimeEnv->offset[col] * pPage->num, pSrc, pPage->num * colBytes);
    }

    done += chunk;
  }

  pQInfo->numOfGroupResultPages += 1;
  return TSDB_CODE_SUCCESS;
}

/*
 * Rewind the merge output: every context's output pointer is placed one row
 * before the start of its column buffer (doMerge() advances it by one row
 * before writing a new row), the context is bound to pResultInfo[k], and the
 * row counter of each column buffer is cleared.
 */
void resetMergeResultBuf(SQuery *pQuery, SQLFunctionCtx *pCtx, SResultInfo *pResultInfo) {
  int32_t col = 0;

  while (col < pQuery->numOfOutput) {
    SQLFunctionCtx *pFuncCtx = &pCtx[col];

    pFuncCtx->aOutputBuf = pQuery->sdata[col]->data - pFuncCtx->outputBytes;
    pFuncCtx->size = 1;
    pFuncCtx->startOffset = 0;
    pFuncCtx->resultInfo = &pResultInfo[col];

    pQuery->sdata[col]->num = 0;
    col += 1;
  }
}

2936 2937 2938 2939 2940 2941 2942
/*
 * Prepare the per-table query info for the scan in the opposite direction:
 * the end of the time window is set to lastKey + step, the window boundaries
 * are swapped, lastKey restarts at the new window start, the cursor order is
 * switched and the current window index is moved to the last window.
 *
 * A NULL pTableQueryInfo is ignored.
 */
static void updateTableQueryInfoForReverseScan(SQuery *pQuery, STableQueryInfo *pTableQueryInfo) {
  if (pTableQueryInfo == NULL) {
    return;
  }

  // order has change already!
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  // TODO validate: for a descending query win.ekey >= lastKey + step is expected,
  // for an ascending query win.ekey <= lastKey + step

  // the previous scan stopped at lastKey, the reverse scan covers [lastKey + step, skey]
  pTableQueryInfo->win.ekey = pTableQueryInfo->lastKey + step;
  SWAP(pTableQueryInfo->win.skey, pTableQueryInfo->win.ekey, TSKEY);
  pTableQueryInfo->lastKey = pTableQueryInfo->win.skey;

  // reverse the cursor as well
  SWITCH_ORDER(pTableQueryInfo->cur.order);
  pTableQueryInfo->cur.vgroupIndex = -1;

  // set the index at the end of time window
  pTableQueryInfo->windowResInfo.curIndex = pTableQueryInfo->windowResInfo.size - 1;
}

/*
 * For every closed window result, mark which functions still have to run in
 * the reverse scan:
 *   - first/first_dst with ascending order and last/last_dst with descending
 *     order are re-opened (complete = false);
 *   - every other function except ts and tag is closed (complete = true);
 *   - ts and tag keep their current state.
 * Windows that are not closed are left untouched.
 */
static void disableFuncInReverseScanImpl(SQInfo* pQInfo, SWindowResInfo *pWindowResInfo, int32_t order) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  for (int32_t w = 0; w < pWindowResInfo->size; ++w) {
    SWindowStatus *pStatus = getTimeWindowResStatus(pWindowResInfo, w);
    if (pStatus->closed) {
      SWindowResult *pRes = getWindowResult(pWindowResInfo, w);

      // open/close the specified query for each group result
      for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
        int32_t funcId = pQuery->pSelectExpr[col].base.functionId;

        bool firstOnAsc = (order == TSDB_ORDER_ASC) && (funcId == TSDB_FUNC_FIRST || funcId == TSDB_FUNC_FIRST_DST);
        bool lastOnDesc = (order == TSDB_ORDER_DESC) && (funcId == TSDB_FUNC_LAST || funcId == TSDB_FUNC_LAST_DST);

        if (firstOnAsc || lastOnDesc) {
          pRes->resultInfo[col].complete = false;
        } else if (funcId != TSDB_FUNC_TS && funcId != TSDB_FUNC_TAG) {
          pRes->resultInfo[col].complete = true;
        }
      }
    }
  }
}

2988 2989
/*
 * Prepare a query for the reverse scan.
 *
 * Step 1: decide per output function whether it still has to run:
 *         first/first_dst (ascending order) and last/last_dst (descending
 *         order) are re-opened, everything else except ts/tag is closed.
 *         For group-by-normal-column and interval queries this is applied to
 *         every closed window result, otherwise to the single result info of
 *         each function context.
 * Step 2: flip the time window and cursor of every table of every group.
 */
void disableFuncInReverseScan(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;
  int32_t           order = pQuery->order.order;

  if (pRuntimeEnv->groupbyNormalCol || QUERY_IS_INTERVAL_QUERY(pQuery)) {
    // group by normal columns and interval query on normal table
    disableFuncInReverseScanImpl(pQInfo, &pRuntimeEnv->windowResInfo, order);
  } else {
    // for simple result of table query,
    for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {  // todo refactor
      int32_t         funcId = pQuery->pSelectExpr[col].base.functionId;
      SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];

      // resultInfo is NULL, means no data checked in previous scan
      if (pCtx->resultInfo != NULL) {
        bool firstOnAsc = (order == TSDB_ORDER_ASC) && (funcId == TSDB_FUNC_FIRST || funcId == TSDB_FUNC_FIRST_DST);
        bool lastOnDesc = (order == TSDB_ORDER_DESC) && (funcId == TSDB_FUNC_LAST || funcId == TSDB_FUNC_LAST_DST);

        if (firstOnAsc || lastOnDesc) {
          pCtx->resultInfo->complete = false;
        } else if (funcId != TSDB_FUNC_TS && funcId != TSDB_FUNC_TAG) {
          pCtx->resultInfo->complete = true;
        }
      }
    }
  }

  // flip the scan range of every table in every group
  int32_t numOfGroups = GET_NUM_OF_TABLEGROUP(pQInfo);
  for (int32_t g = 0; g < numOfGroups; ++g) {
    SArray *pGroup = GET_TABLEGROUP(pQInfo, g);
    size_t  numOfTables = taosArrayGetSize(pGroup);

    for (int32_t t = 0; t < numOfTables; ++t) {
      STableQueryInfo *pTableInfo = taosArrayGetP(pGroup, t);
      updateTableQueryInfoForReverseScan(pQuery, pTableInfo);
    }
  }
}

3028
/* Apply SWITCH_ORDER to the order field of every output function context. */
void switchCtxOrder(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  int32_t col = 0;
  while (col < pQuery->numOfOutput) {
    SWITCH_ORDER(pRuntimeEnv->pCtx[col].order);
    col += 1;
  }
}

H
Haojun Liao 已提交
3035
/*
 * Allocate the result info of one window result row.
 *
 * A single zeroed allocation holds the SResultInfo array (one entry per
 * output column) followed by interBufSize bytes of intermediate buffer; the
 * row position is reset to {-1, -1}.
 *
 * Returns TSDB_CODE_QRY_OUT_OF_MEMORY when the allocation fails,
 * TSDB_CODE_SUCCESS otherwise.
 */
int32_t createQueryResultInfo(SQuery *pQuery, SWindowResult *pResultRow, bool isSTableQuery, size_t interBufSize) {
  int32_t numOfCols = pQuery->numOfOutput;
  size_t  infoBytes = numOfCols * sizeof(SResultInfo);

  pResultRow->resultInfo = calloc(1, infoBytes + interBufSize);
  if (pResultRow->resultInfo == NULL) {
    return TSDB_CODE_QRY_OUT_OF_MEMORY;
  }

  pResultRow->pos = (SPosInfo) {-1, -1};

  // the intermediate result buffer starts right behind the SResultInfo array
  char *pInterBuf = (char *)pResultRow->resultInfo + infoBytes;

  // set the intermediate result output buffer
  setWindowResultInfo(pResultRow->resultInfo, pQuery, isSTableQuery, pInterBuf);
  return TSDB_CODE_SUCCESS;
}

/*
 * Point every function context back at the start of its output column,
 * reset and attach the runtime result info, clear the output buffers and
 * re-initialise the functions through initCtxOutputBuf().
 */
void resetCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];
    int32_t         funcId = pQuery->pSelectExpr[col].base.functionId;

    pCtx->aOutputBuf = pQuery->sdata[col]->data;

    /*
     * set the output buffer information and intermediate buffer
     * not all queries require the interResultBuf, such as COUNT/TAGPRJ/PRJ/TAG etc.
     */
    RESET_RESULT_INFO(&pRuntimeEnv->resultInfo[col]);
    pCtx->resultInfo = &pRuntimeEnv->resultInfo[col];

    // set the timestamp output buffer for top/bottom/diff query
    switch (funcId) {
      case TSDB_FUNC_TOP:
      case TSDB_FUNC_BOTTOM:
      case TSDB_FUNC_DIFF:
        // timestamps go to the output buffer of the first column
        pCtx->ptsOutputBuf = pRuntimeEnv->pCtx[0].aOutputBuf;
        break;
      default:
        break;
    }

    memset(pQuery->sdata[col]->data, 0, (size_t)pQuery->pSelectExpr[col].bytes * pQuery->rec.capacity);
  }

  initCtxOutputBuf(pRuntimeEnv);
}

/*
 * Advance the output position of every function context by `output` rows and
 * reset its result info. Must not be used when the query contains a diff
 * function (asserted).
 */
void forwardCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, int64_t output) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // reset the execution contexts
  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];
    int32_t         functionId = pQuery->pSelectExpr[col].base.functionId;
    assert(functionId != TSDB_FUNC_DIFF);

    // set next output position
    if (IS_OUTER_FORWARD(aAggs[functionId].nStatus)) {
      pCtx->aOutputBuf += pCtx->outputBytes * output;
    }

    /*
     * NOTE: for top/bottom query, the value of first column of output (timestamp) are assigned
     * in the procedure of top/bottom routine
     * the output buffer in top/bottom routine is ptsOutputBuf, so we need to forward the output buffer
     *
     * diff function is handled in multi-output function
     */
    bool isTopBottom = (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM);
    if (isTopBottom) {
      pCtx->ptsOutputBuf += TSDB_KEYSIZE * output;
    }

    RESET_RESULT_INFO(pCtx->resultInfo);
  }
}

/*
 * Reset currentStage of every function context to 0 and run the function's
 * init callback for each context whose result info is not initialised yet.
 */
void initCtxOutputBuf(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];
    int32_t         funcId = pQuery->pSelectExpr[col].base.functionId;

    pCtx->currentStage = 0;

    // contexts that are already initialised are left untouched
    SResultInfo *pResInfo = GET_RES_INFO(pCtx);
    if (!pResInfo->initialized) {
      aAggs[funcId].init(pCtx);
    }
  }
}

3123
/*
 * Apply the remaining OFFSET of the query to the rows currently held in the
 * output buffers.
 *
 * - No rows or no offset left: nothing happens.
 * - More rows than the offset: the first `offset` rows are dropped, the rest
 *   is moved to the front of each column buffer and the offset becomes 0.
 * - Otherwise every row is dropped, the offset is reduced by the number of
 *   dropped rows, the output buffers are reset and the buffer-full status
 *   flag is cleared.
 */
void skipResults(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  bool nothingToSkip = (pQuery->rec.rows == 0) || (pQuery->limit.offset == 0);
  if (nothingToSkip) {
    return;
  }

  if (pQuery->rec.rows > pQuery->limit.offset) {
    // only a prefix of the buffered rows is skipped
    int64_t numOfSkip = pQuery->limit.offset;
    pQuery->rec.rows -= numOfSkip;
    pQuery->limit.offset = 0;

    qDebug("QInfo:%p skip row:%"PRId64", new offset:%d, numOfRows remain:%" PRIu64, GET_QINFO_ADDR(pRuntimeEnv), numOfSkip,
           0, pQuery->rec.rows);

    for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
      SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];
      int32_t         funcId = pQuery->pSelectExpr[col].base.functionId;
      int32_t         bytes = pCtx->outputBytes;
      char *          pColData = (char *)pQuery->sdata[col]->data;

      // shift the remaining rows to the front; the next output row follows them
      memmove(pQuery->sdata[col]->data, pColData + bytes * numOfSkip, pQuery->rec.rows * bytes);
      pCtx->aOutputBuf = pColData + pQuery->rec.rows * bytes;

      switch (funcId) {
        case TSDB_FUNC_DIFF:
        case TSDB_FUNC_TOP:
        case TSDB_FUNC_BOTTOM:
          pCtx->ptsOutputBuf = pRuntimeEnv->pCtx[0].aOutputBuf;
          break;
        default:
          break;
      }
    }

    updateNumOfResult(pRuntimeEnv, pQuery->rec.rows);
    return;
  }

  // every buffered row is skipped
  qDebug("QInfo:%p skip rows:%" PRId64 ", new offset:%" PRIu64, GET_QINFO_ADDR(pRuntimeEnv), pQuery->rec.rows,
      pQuery->limit.offset - pQuery->rec.rows);

  pQuery->limit.offset -= pQuery->rec.rows;
  pQuery->rec.rows = 0;

  resetCtxOutputBuf(pRuntimeEnv);

  // clear the buffer full flag if exists
  CLEAR_QUERY_STATUS(pQuery, QUERY_RESBUF_FULL);
}

/**
 * Update the status bits of a query.
 *
 * QUERY_NOT_COMPLETED replaces the whole status word; every other status is OR-ed into the existing
 * value after the QUERY_NOT_COMPLETED bit has been cleared.
 *
 * @param pQuery query whose status is modified
 * @param status status value to set
 */
void setQueryStatus(SQuery *pQuery, int8_t status) {
  if (status == QUERY_NOT_COMPLETED) {
    pQuery->status = status;
    return;
  }

  // QUERY_NOT_COMPLETED is not compatible with any other status, so remove it before adding the new one
  CLEAR_QUERY_STATUS(pQuery, QUERY_NOT_COMPLETED);
  pQuery->status |= status;
}

/**
 * Invoke xNextStep on every output function except TSDB_FUNC_TS, using the output buffers that are
 * currently bound to the function contexts.
 *
 * @return true if at least one of those functions reports a result that is not complete
 */
static bool hasIncompleteFuncAfterNextStep(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  bool    incomplete = false;

  for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
    int16_t functId = pQuery->pSelectExpr[j].base.functionId;
    if (functId == TSDB_FUNC_TS) {
      continue;
    }

    aAggs[functId].xNextStep(&pRuntimeEnv->pCtx[j]);
    SResultInfo *pResInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[j]);

    incomplete |= (!pResInfo->complete);
  }

  return incomplete;
}

/**
 * Move every output function to its next step and tell whether another scan over the data blocks is needed.
 *
 * For group-by-normal-column and interval queries the check runs once per closed window result (after
 * binding the output buffers to that window); for all other queries it runs once on the current contexts.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @return true if any function result is still incomplete
 */
bool needScanDataBlocksAgain(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  if (!(pRuntimeEnv->groupbyNormalCol || QUERY_IS_INTERVAL_QUERY(pQuery))) {
    return hasIncompleteFuncAfterNextStep(pRuntimeEnv);
  }

  bool            scanAgain = false;
  SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;

  for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
    SWindowResult *pResult = getWindowResult(pWindowResInfo, i);
    if (pResult->status.closed) {
      setWindowResOutputBuf(pRuntimeEnv, pResult);
      scanAgain |= hasIncompleteFuncAfterNextStep(pRuntimeEnv);
    }
  }

  return scanAgain;
}

H
Haojun Liao 已提交
3219
/**
 * Take a snapshot of the query state before the scan starts.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param start       start key of the scan; asserted not to lie behind the table's lastKey in scan direction
 * @return snapshot holding the query status, the current window index, the query window and the range
 *         [start, ekey of the current table window]
 */
static SQueryStatusInfo getQueryStatusInfo(SQueryRuntimeEnv *pRuntimeEnv, TSKEY start) {
  SQuery          *pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo *pTableQueryInfo = pQuery->current;

  if (QUERY_IS_ASC_QUERY(pQuery)) {
    assert(start <= pTableQueryInfo->lastKey);
  } else {
    assert(start >= pTableQueryInfo->lastKey);
  }

  STimeWindow scanRange = {.skey = start, .ekey = pTableQueryInfo->win.ekey};

  SQueryStatusInfo snapshot = {
      .status      = pQuery->status,
      .windowIndex = pRuntimeEnv->windowResInfo.curIndex,
      .lastKey     = start,
      .w           = pQuery->window,
      .curWindow   = scanRange,
  };

  return snapshot;
}

3237 3238 3239 3240
/**
 * Prepare the runtime environment for the reverse scan.
 *
 * Saves the ts-buffer cursor into pStatus, flips the scan order of the ts buffer and of the query,
 * sets the query window to the swapped pStatus->curWindow, opens a fresh secondary query handle for
 * that window and switches the function contexts to the reverse order.
 * Does a longjmp to pRuntimeEnv->env with terrno if the query handle cannot be created.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pStatus     status snapshot of the master scan; receives the saved ts-buffer cursor
 */
static void setEnvBeforeReverseScan(SQueryRuntimeEnv *pRuntimeEnv, SQueryStatusInfo *pStatus) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  SQInfo *pQInfo = GET_QINFO_ADDR(pRuntimeEnv);

  // save the cursor
  pStatus->cur = tsBufGetCursor(pRuntimeEnv->pTSBuf);
  if (pRuntimeEnv->pTSBuf != NULL) {
    SWITCH_ORDER(pRuntimeEnv->pTSBuf->cur.order);
    tsBufNextPos(pRuntimeEnv->pTSBuf);
  }

  // the reverse scan covers the same range with start and end keys exchanged
  pQuery->window = pStatus->curWindow;
  TSKEY oldSKey = pQuery->window.skey;
  pQuery->window.skey = pQuery->window.ekey;
  pQuery->window.ekey = oldSKey;

  SWITCH_ORDER(pQuery->order.order);

  assert(QUERY_IS_ASC_QUERY(pQuery) ? (pQuery->window.skey <= pQuery->window.ekey)
                                    : (pQuery->window.skey >= pQuery->window.ekey));

  SET_REVERSE_SCAN_FLAG(pRuntimeEnv);

  STsdbQueryCond cond = {
      .twindow   = pQuery->window,
      .order     = pQuery->order.order,
      .colList   = pQuery->colList,
      .numOfCols = pQuery->numOfCols,
  };

  // release the handle of a previous secondary scan, if there is one
  if (pRuntimeEnv->pSecQueryHandle != NULL) {
    tsdbCleanupQueryHandle(pRuntimeEnv->pSecQueryHandle);
  }

  pRuntimeEnv->pSecQueryHandle = tsdbQueryTables(pQInfo->tsdb, &cond, &pQInfo->tableGroupInfo, pQInfo);
  if (pRuntimeEnv->pSecQueryHandle == NULL) {
    longjmp(pRuntimeEnv->env, terrno);
  }

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
  switchCtxOrder(pRuntimeEnv);
  disableFuncInReverseScan(pQInfo);
}

3284 3285
/**
 * Undo the changes made for the reverse scan.
 *
 * Flips the query order and the function contexts back, restores the saved ts-buffer cursor, marks the
 * scan as master scan again and restores lastKey, status and the time windows from the snapshot.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pStatus     snapshot taken before the reverse scan
 */
static void clearEnvAfterReverseScan(SQueryRuntimeEnv *pRuntimeEnv, SQueryStatusInfo *pStatus) {
  SQuery          *pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo *pTableQueryInfo = pQuery->current;

  SWITCH_ORDER(pQuery->order.order);
  switchCtxOrder(pRuntimeEnv);

  tsBufSetCursor(pRuntimeEnv->pTSBuf, &pStatus->cur);
  if (pRuntimeEnv->pTSBuf != NULL) {
    pRuntimeEnv->pTSBuf->cur.order = pQuery->order.order;
  }

  SET_MASTER_SCAN_FLAG(pRuntimeEnv);

  // restore the values recorded in the snapshot
  pQuery->status = pStatus->status;
  pTableQueryInfo->lastKey = pStatus->lastKey;
  pTableQueryInfo->win = pStatus->w;
  pQuery->window = pTableQueryInfo->win;
}

3306
/**
 * Scan the data blocks of the current table: one master scan, repeated scans for as long as
 * needScanDataBlocksAgain() asks for them, and finally a reverse scan if needReverseScan() requires it.
 *
 * A longjmp to pRuntimeEnv->env is performed when a secondary query handle cannot be created (terrno)
 * or when the query has been killed (TSDB_CODE_TSC_QUERY_CANCELLED).
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param start       key at which the scan starts
 */
void scanOneTableDataBlocks(SQueryRuntimeEnv *pRuntimeEnv, TSKEY start) {
  SQInfo          *pQInfo = GET_QINFO_ADDR(pRuntimeEnv);
  SQuery          *pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo *pTableQueryInfo = pQuery->current;

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);

  // remember where the scan starts
  SQueryStatusInfo qstatus = getQueryStatusInfo(pRuntimeEnv, start);

  SET_MASTER_SCAN_FLAG(pRuntimeEnv);
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  for (;;) {
    doScanAllDataBlocks(pRuntimeEnv);

    if (IS_MASTER_SCAN(pRuntimeEnv)) {
      qstatus.status = pQuery->status;

      // lastKey is unchanged when the scan found no qualified data block; keep the window end in that case
      if (qstatus.lastKey != pTableQueryInfo->lastKey) {
        qstatus.curWindow.ekey = pTableQueryInfo->lastKey - step;
      }

      qstatus.lastKey = pTableQueryInfo->lastKey;
    }

    if (!needScanDataBlocksAgain(pRuntimeEnv)) {
      // leaving the loop after a repeated scan: put back the status recorded by the master scan
      if (pRuntimeEnv->scanFlag == REPEAT_SCAN) {
        pQuery->status = qstatus.status;
      }

      break;
    }

    STsdbQueryCond cond = {
        .twindow   = qstatus.curWindow,
        .order     = pQuery->order.order,
        .colList   = pQuery->colList,
        .numOfCols = pQuery->numOfCols,
    };

    if (pRuntimeEnv->pSecQueryHandle != NULL) {
      tsdbCleanupQueryHandle(pRuntimeEnv->pSecQueryHandle);
    }

    pRuntimeEnv->pSecQueryHandle = tsdbQueryTables(pQInfo->tsdb, &cond, &pQInfo->tableGroupInfo, pQInfo);
    if (pRuntimeEnv->pSecQueryHandle == NULL) {
      longjmp(pRuntimeEnv->env, terrno);
    }

    pRuntimeEnv->windowResInfo.curIndex = qstatus.windowIndex;
    setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
    pRuntimeEnv->scanFlag = REPEAT_SCAN;

    qDebug("QInfo:%p start to repeat scan data blocks due to query func required, qrange:%"PRId64"-%"PRId64, pQInfo,
        cond.twindow.skey, cond.twindow.ekey);

    // stop right away if the query has been killed in the meantime
    if (IS_QUERY_KILLED(pQInfo)) {
      finalizeQueryResult(pRuntimeEnv);  // clean up allocated resource during query
      longjmp(pRuntimeEnv->env, TSDB_CODE_TSC_QUERY_CANCELLED);
    }
  }

  if (needReverseScan(pQuery)) {
    setEnvBeforeReverseScan(pRuntimeEnv, &qstatus);

    // reverse scan from current position
    qDebug("QInfo:%p start to reverse scan", pQInfo);
    doScanAllDataBlocks(pRuntimeEnv);

    clearEnvAfterReverseScan(pRuntimeEnv, &qstatus);
  }
}

H
hjxilinx 已提交
3385
/**
 * Run the xFinalize callback of every output function.
 *
 * For group-by-normal-column and interval queries this is done once per closed window result and the
 * row count of that window is recorded; for group-by-normal-column queries all windows are closed first.
 * For any other query the callbacks run once on the current function contexts.
 *
 * @param pRuntimeEnv runtime environment of the query
 */
void finalizeQueryResult(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  if (!(pRuntimeEnv->groupbyNormalCol || QUERY_IS_INTERVAL_QUERY(pQuery))) {
    for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
      aAggs[pQuery->pSelectExpr[j].base.functionId].xFinalize(&pRuntimeEnv->pCtx[j]);
    }
    return;
  }

  SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;
  if (pRuntimeEnv->groupbyNormalCol) {
    closeAllTimeWindow(pWindowResInfo);
  }

  for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
    if (!isWindowResClosed(pWindowResInfo, i)) {
      continue;
    }

    SWindowResult *pResult = &pWindowResInfo->pResult[i];
    setWindowResOutputBuf(pRuntimeEnv, pResult);

    for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
      aAggs[pQuery->pSelectExpr[j].base.functionId].xFinalize(&pRuntimeEnv->pCtx[j]);
    }

    /*
     * set the number of output results for group by normal columns, the number of output rows usually is 1 except
     * the top and bottom query
     */
    pResult->numOfRows = getNumOfResult(pRuntimeEnv);
  }
}

/**
 * Check whether the select list contains at least one function other than
 * TSDB_FUNC_TS, TSDB_FUNC_TAG and TSDB_FUNC_TAGPRJ.
 *
 * @param pQuery query to inspect
 * @return true if such a function exists, false otherwise
 */
static bool hasMainOutput(SQuery *pQuery) {
  for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    int32_t functionId = pQuery->pSelectExpr[i].base.functionId;

    bool auxiliary =
        (functionId == TSDB_FUNC_TS) || (functionId == TSDB_FUNC_TAG) || (functionId == TSDB_FUNC_TAGPRJ);
    if (!auxiliary) {
      return true;
    }
  }

  return false;
}

H
Haojun Liao 已提交
3433
/**
 * Initialize a table query info object inside caller-provided memory.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pTable      table handle stored in the object
 * @param win         time window of the table; lastKey starts at win.skey
 * @param buf         memory that is used as the STableQueryInfo object
 * @return buf as STableQueryInfo, or NULL if the window result info could not be initialized
 */
static STableQueryInfo *createTableQueryInfo(SQueryRuntimeEnv *pRuntimeEnv, void* pTable, STimeWindow win, void* buf) {
  SQuery          *pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo *pInfo = buf;

  pInfo->win = win;
  pInfo->lastKey = win.skey;

  pInfo->pTable = pTable;
  pInfo->cur.vgroupIndex = -1;

  // only interval/group-by queries get a window result info; other aggregate queries leave it untouched
  if (QUERY_IS_INTERVAL_QUERY(pQuery) || pRuntimeEnv->groupbyNormalCol) {
    int32_t initialSize = 16;
    int32_t initialThreshold = 100;

    int32_t code = initWindowResInfo(&pInfo->windowResInfo, pRuntimeEnv, initialSize, initialThreshold, TSDB_DATA_TYPE_INT);
    if (code != TSDB_CODE_SUCCESS) {
      return NULL;
    }
  }

  return pInfo;
}

H
Haojun Liao 已提交
3458
/**
 * Release the window result info owned by a table query info object. A NULL argument is ignored.
 *
 * @param pTableQueryInfo object to clean up, may be NULL
 */
void destroyTableQueryInfo(STableQueryInfo *pTableQueryInfo) {
  if (pTableQueryInfo != NULL) {
    cleanupTimeWindowInfo(&pTableQueryInfo->windowResInfo);
  }
}

H
Haojun Liao 已提交
3466 3467 3468 3469
/* Assert that lastKey of a table has not moved in front of win.skey with respect to the scan direction. */
#define CHECK_QUERY_TIME_RANGE(_q, _tableInfo)                                \
  do {                                                                        \
    assert(QUERY_IS_ASC_QUERY(_q)                                             \
               ? ((_tableInfo)->lastKey >= (_tableInfo)->win.skey)            \
               : ((_tableInfo)->lastKey <= (_tableInfo)->win.skey));          \
  } while (0)
3471 3472 3473 3474

/**
 * set output buffer for different group
 * @param pRuntimeEnv
3475
 * @param pDataBlockInfo
3476
 */
H
Haojun Liao 已提交
3477
void setExecutionContext(SQInfo *pQInfo, int32_t groupIndex, TSKEY nextKey) {
3478
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
H
Haojun Liao 已提交
3479 3480 3481
  STableQueryInfo  *pTableQueryInfo = pRuntimeEnv->pQuery->current;
  SWindowResInfo   *pWindowResInfo = &pRuntimeEnv->windowResInfo;

H
Haojun Liao 已提交
3482 3483
  // lastKey needs to be updated
  pTableQueryInfo->lastKey = nextKey;
H
Haojun Liao 已提交
3484 3485 3486 3487

  if (pRuntimeEnv->hasTagResults || pRuntimeEnv->pTSBuf != NULL) {
    setAdditionalInfo(pQInfo, pTableQueryInfo->pTable, pTableQueryInfo);
  }
H
Haojun Liao 已提交
3488

H
Haojun Liao 已提交
3489 3490 3491
  if (pRuntimeEnv->prevGroupId != INT32_MIN && pRuntimeEnv->prevGroupId == groupIndex) {
    return;
  }
3492

3493 3494
  SWindowResult *pWindowRes = doSetTimeWindowFromKey(pRuntimeEnv, pWindowResInfo, (char *)&groupIndex,
      sizeof(groupIndex), true);
3495 3496 3497
  if (pWindowRes == NULL) {
    return;
  }
3498

3499 3500 3501 3502 3503
  /*
   * not assign result buffer yet, add new result buffer
   * all group belong to one result set, and each group result has different group id so set the id to be one
   */
  if (pWindowRes->pos.pageId == -1) {
3504
    if (addNewWindowResultBuf(pWindowRes, pRuntimeEnv->pResultBuf, groupIndex, pRuntimeEnv->numOfRowsPerPage) !=
3505 3506 3507 3508
        TSDB_CODE_SUCCESS) {
      return;
    }
  }
3509

H
Haojun Liao 已提交
3510 3511
  // record the current active group id
  pRuntimeEnv->prevGroupId = groupIndex;
3512 3513 3514 3515
  setWindowResOutputBuf(pRuntimeEnv, pWindowRes);
  initCtxOutputBuf(pRuntimeEnv);
}

H
Haojun Liao 已提交
3516
/**
 * Point every function context at the output position and the result info that belong to one window result.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pResult     window result that becomes the output target
 */
void setWindowResOutputBuf(SQueryRuntimeEnv *pRuntimeEnv, SWindowResult *pResult) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // Note: pResult->pos[i]->num == 0, there is only fixed number of results for each group
  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];
    int32_t         functionId = pQuery->pSelectExpr[col].base.functionId;

    pCtx->aOutputBuf = getPosInResultPage(pRuntimeEnv, col, pResult);

    // top/bottom/diff take their timestamp output position from the first output column
    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_DIFF) {
      pCtx->ptsOutputBuf = pRuntimeEnv->pCtx[0].aOutputBuf;
    }

    /*
     * set the output buffer information and intermediate buffer
     * not all queries require the interResultBuf, such as COUNT
     */
    pCtx->resultInfo = &pResult->resultInfo[col];

    // set super table query flag
    GET_RES_INFO(pCtx)->superTableQ = pRuntimeEnv->stableQuery;
  }
}

H
Haojun Liao 已提交
3541 3542
/**
 * Bind the function contexts to one window result and initialize the functions that are not initialized yet.
 *
 * The result info of every context is always switched to the given window result. Contexts whose result
 * info is both initialized and complete are left alone otherwise; for the rest the output position is set,
 * currentStage is reset to 0 and the init callback runs when the result info is not initialized.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pResult     window result that becomes the output target
 */
void setWindowResOutputBufInitCtx(SQueryRuntimeEnv *pRuntimeEnv, SWindowResult *pResult) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // Note: pResult->pos[i]->num == 0, there is only fixed number of results for each group
  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    SQLFunctionCtx *pCtx = &pRuntimeEnv->pCtx[col];
    pCtx->resultInfo = &pResult->resultInfo[col];

    bool finished = pCtx->resultInfo->initialized && pCtx->resultInfo->complete;
    if (finished) {
      continue;
    }

    pCtx->aOutputBuf = getPosInResultPage(pRuntimeEnv, col, pResult);
    pCtx->currentStage = 0;

    int32_t functionId = pCtx->functionId;
    if (functionId == TSDB_FUNC_TOP || functionId == TSDB_FUNC_BOTTOM || functionId == TSDB_FUNC_DIFF) {
      // these functions take their timestamp output position from the first output column
      pCtx->ptsOutputBuf = pRuntimeEnv->pCtx[0].aOutputBuf;
    }

    // set super table query flag
    pCtx->resultInfo->superTableQ = pRuntimeEnv->stableQuery;

    if (!pCtx->resultInfo->initialized) {
      aAggs[functionId].init(pCtx);
    }
  }
}

3573
/**
 * Set the tag values of a table and, when a ts buffer exists, position the ts buffer for that table.
 *
 * On the first call for a table (cur.vgroupIndex == -1) the tag of the first context is stored, the
 * start position of that tag is looked up in the ts buffer and the resulting cursor is remembered.
 * Later calls restore the remembered cursor.
 *
 * @param pQInfo          query info object
 * @param pTable          table handle passed to setTagVal
 * @param pTableQueryInfo per-table query info that keeps the tag and the cursor
 * @return always 0
 */
int32_t setAdditionalInfo(SQInfo *pQInfo, void* pTable, STableQueryInfo *pTableQueryInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;

  setTagVal(pRuntimeEnv, pTable, pQInfo->tsdb);

  // both the master and supplement scan needs to set the correct ts comp start position
  if (pRuntimeEnv->pTSBuf == NULL) {
    return 0;
  }

  if (pTableQueryInfo->cur.vgroupIndex != -1) {
    tsBufSetCursor(pRuntimeEnv->pTSBuf, &pTableQueryInfo->cur);
    return 0;
  }

  pTableQueryInfo->tag = pRuntimeEnv->pCtx[0].tag.i64Key;
  tsBufGetElemStartPos(pRuntimeEnv->pTSBuf, 0, pTableQueryInfo->tag);

  // keep the cursor info of the current table
  pTableQueryInfo->cur = pRuntimeEnv->pTSBuf->cur;
  return 0;
}

/*
 * There are two cases to handle:
 *
 * 1. Query range is not set yet (queryRangeSet = 0). we need to set the query range info, including pQuery->lastKey,
 *    pQuery->window.skey, and pQuery->eKey.
 * 2. Query range is set and query is in progress. There may be another result with the same query ranges to be
 *    merged during merge stage. In this case, we need the pTableQueryInfo->lastResRows to decide if there
 *    is a previous result generated or not.
 */
H
hjxilinx 已提交
3604
void setIntervalQueryRange(SQInfo *pQInfo, TSKEY key) {
3605 3606
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;
H
hjxilinx 已提交
3607 3608
  STableQueryInfo *pTableQueryInfo = pQuery->current;
  
3609 3610 3611
  if (pTableQueryInfo->queryRangeSet) {
    pTableQueryInfo->lastKey = key;
  } else {
3612
    pTableQueryInfo->win.skey = key;
3613
    STimeWindow win = {.skey = key, .ekey = pQuery->window.ekey};
3614

3615 3616 3617 3618 3619
    // for too small query range, no data in this interval.
    if ((QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.ekey < pQuery->window.skey)) ||
        (!QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.skey < pQuery->window.ekey))) {
      return;
    }
3620

3621 3622 3623 3624 3625 3626
    /**
     * In handling the both ascending and descending order super table query, we need to find the first qualified
     * timestamp of this table, and then set the first qualified start timestamp.
     * In ascending query, key is the first qualified timestamp. However, in the descending order query, additional
     * operations involve.
     */
H
Haojun Liao 已提交
3627
    STimeWindow     w = TSWINDOW_INITIALIZER;
3628
    SWindowResInfo *pWindowResInfo = &pTableQueryInfo->windowResInfo;
3629

H
Haojun Liao 已提交
3630 3631
    TSKEY sk = MIN(win.skey, win.ekey);
    TSKEY ek = MAX(win.skey, win.ekey);
H
Haojun Liao 已提交
3632
    getAlignQueryTimeWindow(pQuery, win.skey, sk, ek, &w);
3633
    pWindowResInfo->startTime = pTableQueryInfo->win.skey;  // windowSKey may be 0 in case of 1970 timestamp
3634

3635 3636
    if (pWindowResInfo->prevSKey == TSKEY_INITIAL_VAL) {
      if (!QUERY_IS_ASC_QUERY(pQuery)) {
H
Haojun Liao 已提交
3637
        assert(win.ekey == pQuery->window.ekey);
3638
      }
3639 3640
      
      pWindowResInfo->prevSKey = w.skey;
3641
    }
3642

3643
    pTableQueryInfo->queryRangeSet = 1;
3644
    pTableQueryInfo->lastKey = pTableQueryInfo->win.skey;
3645 3646 3647 3648
  }
}

/**
 * Tell whether any output function of the query carries the TSDB_FUNCSTATE_NEED_TS flag.
 *
 * @param pQuery query to inspect
 * @return true if at least one function has the flag set
 */
bool requireTimestamp(SQuery *pQuery) {
  bool needTs = false;

  for (int32_t i = 0; i < pQuery->numOfOutput && !needTs; i++) {
    int32_t functionId = pQuery->pSelectExpr[i].base.functionId;
    needTs = (aAggs[functionId].nStatus & TSDB_FUNCSTATE_NEED_TS) != 0;
  }

  return needTs;
}

/**
 * Decide whether the primary timestamp column of a data block has to be loaded.
 *
 * It is required when
 * 1. lastKey of the current table or the end key of the query window lies inside the block's time range, or
 * 2. one of the output functions needs the timestamp (see requireTimestamp).
 *
 * @param pQuery         query being executed
 * @param pDataBlockInfo description of the data block
 * @return true if the timestamp column must be loaded
 */
bool needPrimaryTimestampCol(SQuery *pQuery, SDataBlockInfo *pDataBlockInfo) {
  STimeWindow     *pBlockWin = &pDataBlockInfo->window;
  STableQueryInfo *pTableQueryInfo = pQuery->current;

  if (pTableQueryInfo->lastKey >= pBlockWin->skey && pTableQueryInfo->lastKey <= pBlockWin->ekey) {
    return true;
  }

  if (pQuery->window.ekey >= pBlockWin->skey && pQuery->window.ekey <= pBlockWin->ekey) {
    return true;
  }

  return requireTimestamp(pQuery);
}

3672
/**
 * Copy rows of closed window results into the output buffers of the query.
 *
 * Copying resumes at pQInfo->groupIndex / pQInfo->offset and both are advanced so that a later call
 * continues where this one stopped. It ends when all closed windows have been visited or the output
 * capacity (pQuery->rec.capacity) is reached.
 *
 * @param pQInfo      query info object
 * @param pResultInfo window result info to copy from
 * @param orderType   TSDB_ORDER_ASC walks the windows forwards, any other value walks them backwards
 * @return number of rows copied
 */
static int32_t doCopyToSData(SQInfo *pQInfo, SWindowResInfo *pResultInfo, int32_t orderType) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery           *pQuery = pRuntimeEnv->pQuery;

  qDebug("QInfo:%p start to copy data from windowResInfo to query buf", pQInfo);

  int32_t        totalSet = numOfClosedTimeWindow(pResultInfo);
  SWindowResult *result = pResultInfo->pResult;

  int32_t step = (orderType == TSDB_ORDER_ASC) ? 1 : -1;
  int32_t startIdx = (orderType == TSDB_ORDER_ASC) ? pQInfo->groupIndex : (totalSet - pQInfo->groupIndex - 1);

  int32_t numOfResult = 0;
  for (int32_t i = startIdx; (i < totalSet) && (i >= 0); i += step) {
    // a window without rows is skipped and counted as handled
    if (result[i].numOfRows == 0) {
      pQInfo->offset = 0;
      pQInfo->groupIndex += 1;
      continue;
    }

    assert(pQInfo->offset <= 1);

    int32_t numOfRowsToCopy = result[i].numOfRows - pQInfo->offset;
    int32_t oldOffset = pQInfo->offset;

    if (numOfRowsToCopy > pQuery->rec.capacity - numOfResult) {
      // not enough room for the whole window: copy what fits and remember how far we got
      numOfRowsToCopy = pQuery->rec.capacity - numOfResult;
      pQInfo->offset += numOfRowsToCopy;
    } else {
      // the window is copied completely, continue with the next one
      pQInfo->offset = 0;
      pQInfo->groupIndex += 1;
    }

    for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
      int32_t size = pRuntimeEnv->pCtx[j].outputBytes;

      char *in = getPosInResultPage(pRuntimeEnv, j, &result[i]);
      char *out = pQuery->sdata[j]->data + numOfResult * size;
      memcpy(out, in + oldOffset * size, size * numOfRowsToCopy);
    }

    numOfResult += numOfRowsToCopy;
    if (numOfResult == pQuery->rec.capacity) {
      break;
    }
  }

  qDebug("QInfo:%p copy data to query buf completed", pQInfo);

#ifdef _DEBUG_VIEW
  displayInterResult(pQuery->sdata, pRuntimeEnv, numOfResult);
#endif
  return numOfResult;
}

/**
 * copyFromWindowResToSData support copy data in ascending/descending order
 * For interval query of both super table and table, copy the data in ascending order, since the output results are
 * ordered in SWindowResutl already. While handling the group by query for both table and super table,
 * all group result are completed already.
 *
 * @param pQInfo
 * @param result
 */
3747
void copyFromWindowResToSData(SQInfo *pQInfo, SWindowResInfo *pResultInfo) {
3748
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;
3749

3750
  int32_t orderType = (pQuery->pGroupbyExpr != NULL) ? pQuery->pGroupbyExpr->orderType : TSDB_ORDER_ASC;
3751
  int32_t numOfResult = doCopyToSData(pQInfo, pResultInfo, orderType);
3752

3753
  pQuery->rec.rows += numOfResult;
3754

3755
  assert(pQuery->rec.rows <= pQuery->rec.capacity);
3756 3757
}

H
Haojun Liao 已提交
3758
/**
 * Raise numOfRows of every window result to the largest numOfRes reported by its output functions.
 * TSDB_FUNC_TS, TSDB_FUNC_TAG and TSDB_FUNC_TAGPRJ are not considered. Interval queries are left untouched.
 *
 * @param pRuntimeEnv runtime environment of the query
 */
static void updateWindowResNumOfRes(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
    return;
  }

  SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;

  for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
    SWindowResult *pResult = &pWindowResInfo->pResult[i];

    for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
      int32_t functionId = pRuntimeEnv->pCtx[j].functionId;

      bool counted = (functionId != TSDB_FUNC_TS) && (functionId != TSDB_FUNC_TAG) && (functionId != TSDB_FUNC_TAGPRJ);
      if (counted) {
        pResult->numOfRows = MAX(pResult->numOfRows, pResult->resultInfo[j].numOfRes);
      }
    }
  }
}

H
Haojun Liao 已提交
3780
static void stableApplyFunctionsOnBlock(SQueryRuntimeEnv *pRuntimeEnv, SDataBlockInfo *pDataBlockInfo, SDataStatis *pStatis,
3781
    SArray *pDataBlock, __block_search_fn_t searchFn) {
3782
  SQuery *         pQuery = pRuntimeEnv->pQuery;
3783 3784
  STableQueryInfo* pTableQueryInfo = pQuery->current;
  
3785
  SWindowResInfo * pWindowResInfo = &pTableQueryInfo->windowResInfo;
H
hjxilinx 已提交
3786
  pQuery->pos = QUERY_IS_ASC_QUERY(pQuery)? 0 : pDataBlockInfo->rows - 1;
3787

H
Haojun Liao 已提交
3788
  if (pQuery->numOfFilterCols > 0 || pRuntimeEnv->pTSBuf != NULL || pRuntimeEnv->groupbyNormalCol) {
3789
    rowwiseApplyFunctions(pRuntimeEnv, pStatis, pDataBlockInfo, pWindowResInfo, pDataBlock);
3790
  } else {
3791
    blockwiseApplyFunctions(pRuntimeEnv, pStatis, pDataBlockInfo, pWindowResInfo, searchFn, pDataBlock);
3792 3793 3794
  }
}

3795 3796 3797
/**
 * Tell whether the query still has results that have not been returned.
 *
 * - false as soon as a positive LIMIT has been reached.
 * - Fill queries (fill type set and not a point interpolation query): true when the fill info still holds
 *   rows, or when the query is completed and filling up to the end of the query window yields rows.
 * - Other queries: true when the query is completed, is a group-by-normal-column or interval query and
 *   the window result info is not empty.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @return true if more results are available
 */
bool queryHasRemainResults(SQueryRuntimeEnv* pRuntimeEnv) {
  SQuery    *pQuery = pRuntimeEnv->pQuery;
  SFillInfo *pFillInfo = pRuntimeEnv->pFillInfo;

  if (pQuery->limit.limit > 0 && pQuery->rec.total >= pQuery->limit.limit) {
    return false;
  }

  if (pQuery->fillType == TSDB_FILL_NONE || isPointInterpoQuery(pQuery)) {
    // there are results waiting for returned to client.
    return Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED) &&
           (pRuntimeEnv->groupbyNormalCol || QUERY_IS_INTERVAL_QUERY(pQuery)) &&
           (pRuntimeEnv->windowResInfo.size > 0);
  }

  // rows that are already available in the fill info have to be returned first
  int32_t remain = taosNumOfRemainRows(pFillInfo);
  if (remain > 0) {
    return true;
  }

  /*
   * While the code reaches here, there are no results remains now.
   * If query is not completed yet, the gaps between two results blocks need to be handled after next data block
   * is retrieved from TSDB.
   *
   * NOTE: If the result set is not the first block, the gap in front of the result set will be filled. If the result
   * set is the FIRST result block, the gap between the start time of query time window and the timestamp of the
   * first result row in the actual result set will fill nothing.
   */
  if (!Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
    return false;
  }

  int32_t numOfTotal = getFilledNumOfRes(pFillInfo, pQuery->window.ekey, pQuery->rec.capacity);
  return numOfTotal > 0;
}

/**
 * Serialize the query result into a response buffer.
 *
 * Layout written to data: the output columns one after another (bytes * numOfRows each), then the number
 * of table id records as a 32-bit integer in network byte order, then the table id records with uid, tid
 * and key converted to big-endian. Afterwards the query is marked QUERY_OVER when it is completed and
 * nothing is left to return.
 *
 * The position behind the column data has no alignment guarantee, so the count and the records are
 * assembled in aligned locals and copied with memcpy instead of being stored through a cast pointer.
 *
 * @param pQInfo    query info object
 * @param numOfRows number of rows to copy from each output column
 * @param data      destination buffer; the visible code does not check its size, the caller has to provide
 *                  enough room
 */
static void doCopyQueryResultToMsg(SQInfo *pQInfo, int32_t numOfRows, char *data) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
    int32_t bytes = pQuery->pSelectExpr[col].bytes;

    memmove(data, pQuery->sdata[col]->data, bytes * numOfRows);
    data += bytes * numOfRows;
  }

  int32_t numOfTables = (int32_t)taosArrayGetSize(pQInfo->arrTableIdInfo);
  int32_t netNumOfTables = htonl(numOfTables);
  memcpy(data, &netNumOfTables, sizeof(netNumOfTables));
  data += sizeof(int32_t);

  for (int32_t i = 0; i < numOfTables; i++) {
    STableIdInfo *pSrc = taosArrayGet(pQInfo->arrTableIdInfo, i);

    // NOTE(review): assumes uid, tid and key are the only fields the receiver reads - confirm against STableIdInfo
    STableIdInfo dst = *pSrc;
    dst.uid = htobe64(pSrc->uid);
    dst.tid = htonl(pSrc->tid);
    dst.key = htobe64(pSrc->key);

    memcpy(data, &dst, sizeof(dst));
    data += sizeof(STableIdInfo);
  }

  // all data returned, set query over
  if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
    if (pQInfo->runtimeEnv.stableQuery) {
      if (pQInfo->tableIndex >= pQInfo->tableqinfoGroupInfo.numOfTables) {
        setQueryStatus(pQuery, QUERY_OVER);
      }
    } else {
      if (!queryHasRemainResults(&pQInfo->runtimeEnv)) {
        setQueryStatus(pQuery, QUERY_OVER);
      }
    }
  }
}

H
Haojun Liao 已提交
3873
/**
 * Generate filled result rows and apply the pending OFFSET to them.
 *
 * Blocks are generated until rows survive the offset or queryHasRemainResults() reports that nothing
 * is left.
 *
 * @param pRuntimeEnv runtime environment of the query
 * @param pDst        output pages; rows that survive a partially consumed offset are moved to their front
 * @param numOfFilled not used by this function
 * @return number of rows available after the offset has been applied
 */
int32_t doFillGapsInResults(SQueryRuntimeEnv* pRuntimeEnv, tFilePage **pDst, int32_t *numOfFilled) {
  SQInfo    *pQInfo = GET_QINFO_ADDR(pRuntimeEnv);
  SQuery    *pQuery = pRuntimeEnv->pQuery;
  SFillInfo *pFillInfo = pRuntimeEnv->pFillInfo;

  for (;;) {
    int32_t ret = taosGenerateDataBlock(pFillInfo, (tFilePage**) pQuery->sdata, pQuery->rec.capacity);

    // todo apply limit output function
    /* reached the start position of according to offset value, return immediately */
    if (pQuery->limit.offset == 0) {
      qDebug("QInfo:%p initial numOfRows:%d, generate filled result:%d rows", pQInfo, pFillInfo->numOfRows, ret);
      return ret;
    }

    if (pQuery->limit.offset < ret) {
      // the offset ends inside this block: drop the leading rows and return the rest
      qDebug("QInfo:%p initial numOfRows:%d, generate filled result:%d rows, offset:%" PRId64 ". Discard due to offset, remain:%" PRId64 ", new offset:%d",
             pQInfo, pFillInfo->numOfRows, ret, pQuery->limit.offset, ret - pQuery->limit.offset, 0);

      ret -= pQuery->limit.offset;
      // todo !!!!there exactly number of interpo is not valid.
      // todo refactor move to the beginning of buffer
      for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
        memmove(pDst[col]->data, pDst[col]->data + pQuery->pSelectExpr[col].bytes * pQuery->limit.offset,
                ret * pQuery->pSelectExpr[col].bytes);
      }

      pQuery->limit.offset = 0;
      return ret;
    }

    // the whole block is consumed by the offset
    qDebug("QInfo:%p initial numOfRows:%d, generate filled result:%d rows, offset:%" PRId64 ". Discard due to offset, "
           "remain:%d, new offset:%" PRId64, pQInfo, pFillInfo->numOfRows, ret, pQuery->limit.offset, 0,
        pQuery->limit.offset - ret);

    pQuery->limit.offset -= ret;
    pQuery->rec.rows = 0;

    if (!queryHasRemainResults(pRuntimeEnv)) {
      return 0;
    }
  }
}

3918
/**
 * Write the cost summary of a query (elapsed time, block and row counters) to the debug log.
 *
 * @param pQInfo query info object
 */
static void queryCostStatis(SQInfo *pQInfo) {
  SQueryCostInfo *pSummary = &pQInfo->runtimeEnv.summary;

  qDebug("QInfo:%p :cost summary: elapsed time:%"PRId64" us, total blocks:%d, load block statis:%d,"
         " load data block:%d, total rows:%"PRId64 ", check rows:%"PRId64,
         pQInfo, pSummary->elapsedTime, pSummary->totalBlocks, pSummary->loadBlockStatis,
         pSummary->loadBlocks, pSummary->totalRows, pSummary->totalCheckedRows);
}

3928 3929
/**
 * Consume the remaining OFFSET inside the current data block.
 *
 * If the offset equals the number of rows in the block, the whole block is
 * skipped: lastKey of the current table is moved one step past the block
 * boundary (ekey for ascending, skey for descending order) and the function
 * returns. Otherwise pQuery->pos is set to the first row that survives the
 * offset, lastKey is set to the timestamp of that row, and the query functions
 * are applied on the block.
 *
 * In both cases pQuery->limit.offset is reset to 0.
 *
 * @param pRuntimeEnv  runtime environment of the running query
 * @param pBlockInfo   description (row count, time window) of the current block
 */
static void updateOffsetVal(SQueryRuntimeEnv *pRuntimeEnv, SDataBlockInfo *pBlockInfo) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  STableQueryInfo* pTableQueryInfo = pQuery->current;

  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  // the offset swallows the current block completely: nothing in it is returned
  if (pQuery->limit.offset == pBlockInfo->rows) {
    pTableQueryInfo->lastKey = QUERY_IS_ASC_QUERY(pQuery) ? pBlockInfo->window.ekey + step : pBlockInfo->window.skey + step;
    pQuery->limit.offset = 0;
    return;
  }

  // locate the first row that is not skipped, counted from the scan direction
  if (QUERY_IS_ASC_QUERY(pQuery)) {
    pQuery->pos = pQuery->limit.offset;
  } else {
    pQuery->pos = pBlockInfo->rows - pQuery->limit.offset - 1;
  }

  assert(pQuery->pos >= 0 && pQuery->pos <= pBlockInfo->rows - 1);

  SArray *         pDataBlock = tsdbRetrieveDataBlock(pRuntimeEnv->pQueryHandle, NULL);
  SColumnInfoData *pColInfoData = taosArrayGet(pDataBlock, 0);

  // column 0 of the block is read as the timestamp (TSKEY) column
  TSKEY *keys = (TSKEY *) pColInfoData->pData;

  // restart from the first surviving row and clear the offset
  pTableQueryInfo->lastKey = keys[pQuery->pos];
  pQuery->limit.offset = 0;

  int32_t numOfRes = tableApplyFunctionsOnBlock(pRuntimeEnv, pBlockInfo, NULL, binarySearchForKey, pDataBlock);

  qDebug("QInfo:%p check data block, brange:%" PRId64 "-%" PRId64 ", numOfRows:%d, numOfRes:%d, lastKey:%"PRId64, GET_QINFO_ADDR(pRuntimeEnv),
         pBlockInfo->window.skey, pBlockInfo->window.ekey, pBlockInfo->rows, numOfRes, pQuery->current->lastKey);
}
3963

3964 3965 3966 3967 3968
/**
 * Skip whole data blocks to satisfy the OFFSET clause.
 *
 * Does nothing when there is no positive offset or when the query has filter
 * columns (numOfFilterCols > 0). Otherwise blocks are fetched one by one:
 * a block with fewer rows than the remaining offset is dropped entirely
 * (offset decreased, lastKey moved one step past the block), and the first
 * block that can satisfy the remaining offset is handed to updateOffsetVal().
 *
 * Leaves via longjmp(pRuntimeEnv->env, ...) when fetching the next block
 * fails with terrno set, or when the query has been killed.
 *
 * @param pRuntimeEnv  runtime environment of the running query
 */
void skipBlocks(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  if (pQuery->limit.offset <= 0 || pQuery->numOfFilterCols > 0) {
    return;
  }

  pQuery->pos = 0;
  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  STableQueryInfo* pTableQueryInfo = pQuery->current;
  TsdbQueryHandleT pQueryHandle = pRuntimeEnv->pQueryHandle;

  SDataBlockInfo blockInfo = SDATA_BLOCK_INITIALIZER;
  while (true) {
    if (!tsdbNextDataBlock(pQueryHandle)) {
      // distinguish "no more blocks" from a failure reported through terrno
      if (terrno != TSDB_CODE_SUCCESS) {
        longjmp(pRuntimeEnv->env, terrno);
      }
      break;
    }

    if (IS_QUERY_KILLED(GET_QINFO_ADDR(pRuntimeEnv))) {
      finalizeQueryResult(pRuntimeEnv); // clean up allocated resource during query
      longjmp(pRuntimeEnv->env, TSDB_CODE_TSC_QUERY_CANCELLED);
    }

    tsdbRetrieveDataBlockInfo(pQueryHandle, &blockInfo);

    if (pQuery->limit.offset > blockInfo.rows) {
      // the whole block is consumed by the offset
      pQuery->limit.offset -= blockInfo.rows;
      pTableQueryInfo->lastKey = (QUERY_IS_ASC_QUERY(pQuery)) ? blockInfo.window.ekey : blockInfo.window.skey;
      pTableQueryInfo->lastKey += step;

      qDebug("QInfo:%p skip rows:%d, offset:%" PRId64, GET_QINFO_ADDR(pRuntimeEnv), blockInfo.rows,
             pQuery->limit.offset);
    } else {  // find the appropriate start position in the current block
      updateOffsetVal(pRuntimeEnv, &blockInfo);
      break;
    }
  }
}
4006

H
Haojun Liao 已提交
4007
/**
 * Apply the OFFSET clause of an interval query by skipping time windows.
 *
 * The offset is left untouched (and the function returns immediately) when it
 * is not positive, when the query has filter columns, when a TS buffer is
 * attached, or when fill is enabled. Otherwise data blocks are walked and
 * limit.offset is decreased by one for every time window that ends inside the
 * visited blocks. Once the offset reaches 0 the query start (window.skey,
 * *start, prevSKey) is moved to the next window; if that window begins in the
 * current block, the block is loaded and the query functions are applied to it.
 *
 * Leaves via longjmp(pRuntimeEnv->env, terrno) when fetching the next block
 * fails with terrno set.
 *
 * @param pRuntimeEnv  runtime environment of the running query
 * @param start        out: timestamp the query should start from; initialised
 *                     to lastKey of the current table
 * @return always true in the visible code paths
 */
static bool skipTimeInterval(SQueryRuntimeEnv *pRuntimeEnv, TSKEY* start) {
  SQuery *pQuery = pRuntimeEnv->pQuery;
  *start = pQuery->current->lastKey;

  // if queried with value filter, do NOT forward query start position
  if (pQuery->limit.offset <= 0 || pQuery->numOfFilterCols > 0 || pRuntimeEnv->pTSBuf != NULL || pRuntimeEnv->pFillInfo != NULL) {
    return true;
  }

  /*
   * 1. for interval without interpolation query we forward pQuery->intervalTime at a time for
   *    pQuery->limit.offset times. Since hole exists, pQuery->intervalTime*pQuery->limit.offset value is
   *    not valid. otherwise, we only forward pQuery->limit.offset number of points
   */
  assert(pRuntimeEnv->windowResInfo.prevSKey == TSKEY_INITIAL_VAL);

  STimeWindow w = TSWINDOW_INITIALIZER;

  SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;
  STableQueryInfo *pTableQueryInfo = pQuery->current;

  SDataBlockInfo blockInfo = SDATA_BLOCK_INITIALIZER;
  while (true) {
    if (!tsdbNextDataBlock(pRuntimeEnv->pQueryHandle)) {
      // distinguish "no more blocks" from a failure reported through terrno
      if (terrno != TSDB_CODE_SUCCESS) {
        longjmp(pRuntimeEnv->env, terrno);
      }
      break;
    }

    tsdbRetrieveDataBlockInfo(pRuntimeEnv->pQueryHandle, &blockInfo);

    if (QUERY_IS_ASC_QUERY(pQuery)) {
      // ascending: align the window only once, on the first block
      if (pWindowResInfo->prevSKey == TSKEY_INITIAL_VAL) {
        getAlignQueryTimeWindow(pQuery, blockInfo.window.skey, blockInfo.window.skey, pQuery->window.ekey, &w);
        pWindowResInfo->startTime = w.skey;
        pWindowResInfo->prevSKey = w.skey;
      }
    } else {
      // descending: re-align against the end key of every block
      getAlignQueryTimeWindow(pQuery, blockInfo.window.ekey, pQuery->window.ekey, blockInfo.window.ekey, &w);

      pWindowResInfo->startTime = pQuery->window.skey;
      pWindowResInfo->prevSKey = w.skey;
    }

    // the first time window
    STimeWindow win = getActiveTimeWindow(pWindowResInfo, pWindowResInfo->prevSKey, pQuery);

    while (pQuery->limit.offset > 0) {
      // the current window is counted against the offset when this condition holds
      if ((win.ekey <= blockInfo.window.ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
          (win.ekey >= blockInfo.window.skey && !QUERY_IS_ASC_QUERY(pQuery))) {
        pQuery->limit.offset -= 1;
        pWindowResInfo->prevSKey = win.skey;
      }

      STimeWindow tw = win;
      GET_NEXT_TIMEWINDOW(pQuery, &tw);

      if (pQuery->limit.offset == 0) {
        if ((tw.skey <= blockInfo.window.ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
            (tw.ekey >= blockInfo.window.skey && !QUERY_IS_ASC_QUERY(pQuery))) {
          // load the data block and check data remaining in current data block
          // TODO optimize performance
          SArray *         pDataBlock = tsdbRetrieveDataBlock(pRuntimeEnv->pQueryHandle, NULL);
          SColumnInfoData *pColInfoData = taosArrayGet(pDataBlock, 0);

          tw = win;
          int32_t startPos =
              getNextQualifiedWindow(pRuntimeEnv, &tw, &blockInfo, pColInfoData->pData, binarySearchForKey, -1);
          assert(startPos >= 0);

          // set the abort info
          pQuery->pos = startPos;

          // reset the query start timestamp
          pTableQueryInfo->win.skey = ((TSKEY *)pColInfoData->pData)[startPos];
          pQuery->window.skey = pTableQueryInfo->win.skey;
          *start = pTableQueryInfo->win.skey;

          pWindowResInfo->prevSKey = tw.skey;
          int32_t index = pRuntimeEnv->windowResInfo.curIndex;

          int32_t numOfRes = tableApplyFunctionsOnBlock(pRuntimeEnv, &blockInfo, NULL, binarySearchForKey, pDataBlock);
          pRuntimeEnv->windowResInfo.curIndex = index;  // restore the window index

          qDebug("QInfo:%p check data block, brange:%" PRId64 "-%" PRId64 ", numOfRows:%d, numOfRes:%d, lastKey:%"PRId64,
                 GET_QINFO_ADDR(pRuntimeEnv), blockInfo.window.skey, blockInfo.window.ekey, blockInfo.rows, numOfRes, pQuery->current->lastKey);

          return true;
        } else { // next window is outside this block: only move the start keys
          *start = tw.skey;
          pQuery->window.skey = tw.skey;
          pWindowResInfo->prevSKey = tw.skey;
          return true;
        }
      }

      /*
       * If the next time window still starts from current data block,
       * load the primary timestamp column first, and then find the start position for the next queried time window.
       * Note that only the primary timestamp column is required.
       * TODO: Optimize for this cases. All data blocks are not needed to be loaded, only if the first actually required
       * time window resides in current data block.
       */
      if ((tw.skey <= blockInfo.window.ekey && QUERY_IS_ASC_QUERY(pQuery)) ||
          (tw.ekey >= blockInfo.window.skey && !QUERY_IS_ASC_QUERY(pQuery))) {
        SArray *         pDataBlock = tsdbRetrieveDataBlock(pRuntimeEnv->pQueryHandle, NULL);
        SColumnInfoData *pColInfoData = taosArrayGet(pDataBlock, 0);

        tw = win;
        int32_t startPos =
            getNextQualifiedWindow(pRuntimeEnv, &tw, &blockInfo, pColInfoData->pData, binarySearchForKey, -1);
        assert(startPos >= 0);

        // set the abort info
        pQuery->pos = startPos;
        pTableQueryInfo->lastKey = ((TSKEY *)pColInfoData->pData)[startPos];
        pWindowResInfo->prevSKey = tw.skey;
        win = tw;
      } else {
        break;  // offset is not 0, and next time window begins or ends in the next block.
      }
    }
  }

  return true;
}

B
Bomin Zhang 已提交
4135
/**
 * Create the TSDB query handle for a query and store it in the runtime
 * environment.
 *
 * No handle is created (and TSDB_CODE_SUCCESS is returned) for tag-only
 * queries, and for super table queries that are neither interval queries nor
 * fixed-output queries.
 *
 * @param tsdb           TSDB instance passed through to the tsdbQuery* calls
 * @param pQInfo         query to set up; runtimeEnv.pQueryHandle is written
 * @param isSTableQuery  true when the query runs on a super table
 * @return the value of terrno after the handle was requested
 *         (terrno is reset to TSDB_CODE_SUCCESS right before the request)
 */
static int32_t setupQueryHandle(void* tsdb, SQInfo* pQInfo, bool isSTableQuery) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  if (onlyQueryTags(pQuery)) {
    return TSDB_CODE_SUCCESS;
  }

  if (isSTableQuery && (!QUERY_IS_INTERVAL_QUERY(pQuery)) && (!isFixedOutputQuery(pRuntimeEnv))) {
    return TSDB_CODE_SUCCESS;
  }

  STsdbQueryCond cond = {
    .twindow = pQuery->window,
    .order   = pQuery->order.order,
    .colList = pQuery->colList,
    .numOfCols = pQuery->numOfCols,
  };

  // for this single-table ascending case the time window kept in the table's
  // own query info replaces the query-wide window
  if (!isSTableQuery
    && (pQInfo->tableqinfoGroupInfo.numOfTables == 1)
    && (cond.order == TSDB_ORDER_ASC)
    && (!QUERY_IS_INTERVAL_QUERY(pQuery))
    && (!isGroupbyNormalCol(pQuery->pGroupbyExpr))
    && (!isFixedOutputQuery(pRuntimeEnv))
  ) {
    SArray* pa = GET_TABLEGROUP(pQInfo, 0);
    STableQueryInfo* pCheckInfo = taosArrayGetP(pa, 0);
    cond.twindow = pCheckInfo->win;
  }

  // clear any stale error so the returned terrno reflects only the call below
  terrno = TSDB_CODE_SUCCESS;
  if (isFirstLastRowQuery(pQuery)) {
    pRuntimeEnv->pQueryHandle = tsdbQueryLastRow(tsdb, &cond, &pQInfo->tableGroupInfo, pQInfo);
  } else if (isPointInterpoQuery(pQuery)) {
    pRuntimeEnv->pQueryHandle = tsdbQueryRowsInExternalWindow(tsdb, &cond, &pQInfo->tableGroupInfo, pQInfo);
  } else {
    pRuntimeEnv->pQueryHandle = tsdbQueryTables(tsdb, &cond, &pQInfo->tableGroupInfo, pQInfo);
  }

  return terrno;
}

4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190
/**
 * Build the per-column fill description for a query.
 *
 * One SFillColInfo is produced for every output expression of pQuery. Each
 * entry records the byte width, type and function id of the expression, its
 * byte offset within an output row (running sum of the preceding widths) and
 * the fill value taken from pQuery->fillVal.
 *
 * @param pQuery  query whose pSelectExpr / fillVal arrays are read
 * @return a calloc'ed array of pQuery->numOfOutput entries, or NULL when the
 *         allocation fails; NOTE(review): who frees the array is not visible
 *         in this block
 */
static SFillColInfo* taosCreateFillColInfo(SQuery* pQuery) {
  int32_t numOfCols = pQuery->numOfOutput;
  int32_t offset = 0;

  SFillColInfo* pFillCol = calloc(numOfCols, sizeof(SFillColInfo));
  if (pFillCol == NULL) {
    // out of memory: report it to the caller instead of writing through NULL
    return NULL;
  }

  for(int32_t i = 0; i < numOfCols; ++i) {
    SExprInfo* pExprInfo = &pQuery->pSelectExpr[i];

    pFillCol[i].col.bytes  = pExprInfo->bytes;
    pFillCol[i].col.type   = pExprInfo->type;
    pFillCol[i].col.offset = offset;
    pFillCol[i].flag       = TSDB_COL_NORMAL;    // always a normal column for table query
    pFillCol[i].functionId = pExprInfo->base.functionId;
    pFillCol[i].fillVal.i = pQuery->fillVal[i];

    offset += pExprInfo->bytes;
  }

  return pFillCol;
}

4199
/**
 * Initialise the runtime state of a query before execution.
 *
 * In order, this: reads the timestamp precision from the TSDB config, caches
 * query traits in the runtime environment, creates the TSDB query handle,
 * binds the optional TS buffer, sets up the runtime environment, creates the
 * disk-based result buffer and window result info where the query kind needs
 * them, creates the fill info when fill is enabled, and finally marks the
 * query as QUERY_NOT_COMPLETED.
 *
 * @param pQInfo         query to initialise
 * @param pTsBuf         optional TS buffer; may be NULL
 * @param tsdb           TSDB instance the query runs against
 * @param vgId           vgroup id stored into pQInfo
 * @param isSTableQuery  true when the query runs on a super table
 * @return TSDB_CODE_SUCCESS, or the first error code returned by one of the
 *         setup steps
 */
int32_t doInitQInfo(SQInfo *pQInfo, STSBuf *pTsBuf, void *tsdb, int32_t vgId, bool isSTableQuery) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;

  int32_t code = TSDB_CODE_SUCCESS;
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  pQuery->precision = tsdbGetCfg(tsdb)->precision;
  pRuntimeEnv->topBotQuery = isTopBottomQuery(pQuery);
  pRuntimeEnv->hasTagResults = hasTagValOutput(pQuery);

  setScanLimitationByResultBuffer(pQuery);
  changeExecuteScanOrder(pQInfo, false);

  code = setupQueryHandle(tsdb, pQInfo, isSTableQuery);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  pQInfo->tsdb = tsdb;
  pQInfo->vgId = vgId;

  pRuntimeEnv->pQuery = pQuery;
  pRuntimeEnv->pTSBuf = pTsBuf;
  pRuntimeEnv->cur.vgroupIndex = -1;
  pRuntimeEnv->stableQuery = isSTableQuery;
  pRuntimeEnv->prevGroupId = INT32_MIN;
  pRuntimeEnv->groupbyNormalCol = isGroupbyNormalCol(pQuery->pGroupbyExpr);

  if (pTsBuf != NULL) {
    // traverse ascending when the query order matches the buffer's order, descending otherwise
    int16_t order = (pQuery->order.order == pRuntimeEnv->pTSBuf->tsOrder) ? TSDB_ORDER_ASC : TSDB_ORDER_DESC;
    tsBufSetTraverseOrder(pRuntimeEnv->pTSBuf, order);
  }

  // create runtime environment
  code = setupQueryRuntimeEnv(pRuntimeEnv, pQuery->order.order);
  if (code != TSDB_CODE_SUCCESS) {
    return code;
  }

  int32_t ps = DEFAULT_PAGE_SIZE;
  int32_t rowsize = 0;
  getIntermediateBufInfo(pRuntimeEnv, &ps, &rowsize);

  if (isSTableQuery && !onlyQueryTags(pRuntimeEnv->pQuery)) {
    int32_t numOfPages = getInitialPageNum(pQInfo);
    code = createDiskbasedResultBuffer(&pRuntimeEnv->pResultBuf, numOfPages, rowsize, ps, numOfPages, pQInfo);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    if (!QUERY_IS_INTERVAL_QUERY(pQuery)) {
      int16_t type = TSDB_DATA_TYPE_NULL;
      int32_t threshold = 0;

      if (pRuntimeEnv->groupbyNormalCol) {  // group by columns not tags;
        type = getGroupbyColumnType(pQuery, pQuery->pGroupbyExpr);
        threshold = 4000;
      } else {
        type = TSDB_DATA_TYPE_INT;  // group id
        threshold = GET_NUM_OF_TABLEGROUP(pQInfo);
        if (threshold < 8) {
          threshold = 8;
        }
      }

      code = initWindowResInfo(&pRuntimeEnv->windowResInfo, pRuntimeEnv, 8, threshold, type);
      if (code != TSDB_CODE_SUCCESS) {
        return code;
      }
    }
  } else if (pRuntimeEnv->groupbyNormalCol || QUERY_IS_INTERVAL_QUERY(pQuery)) {
    int32_t numOfResultRows = getInitialPageNum(pQInfo);
    getIntermediateBufInfo(pRuntimeEnv, &ps, &rowsize);

    code = createDiskbasedResultBuffer(&pRuntimeEnv->pResultBuf, numOfResultRows, rowsize, ps, numOfResultRows, pQInfo);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }

    int16_t type = TSDB_DATA_TYPE_NULL;
    if (pRuntimeEnv->groupbyNormalCol) {
      type = getGroupbyColumnType(pQuery, pQuery->pGroupbyExpr);
    } else {
      type = TSDB_DATA_TYPE_TIMESTAMP;
    }

    code = initWindowResInfo(&pRuntimeEnv->windowResInfo, pRuntimeEnv, numOfResultRows, 4096, type);
    if (code != TSDB_CODE_SUCCESS) {
      return code;
    }
  }

  if (pQuery->fillType != TSDB_FILL_NONE && !isPointInterpoQuery(pQuery)) {
    // NOTE(review): pColInfo is handed to taosInitFillInfo() without a NULL
    // check here -- confirm how allocation failure is expected to be handled
    SFillColInfo* pColInfo = taosCreateFillColInfo(pQuery);
    STimeWindow w = TSWINDOW_INITIALIZER;

    // window.skey/ekey may be in either order; normalise before aligning
    TSKEY sk = MIN(pQuery->window.skey, pQuery->window.ekey);
    TSKEY ek = MAX(pQuery->window.skey, pQuery->window.ekey);
    getAlignQueryTimeWindow(pQuery, pQuery->window.skey, sk, ek, &w);

    pRuntimeEnv->pFillInfo = taosInitFillInfo(pQuery->order.order, w.skey, 0, pQuery->rec.capacity, pQuery->numOfOutput,
                                              pQuery->slidingTime, pQuery->slidingTimeUnit, pQuery->precision,
                                              pQuery->fillType, pColInfo);
  }

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
  return TSDB_CODE_SUCCESS;
}

4308
/**
 * Clear the "complete" flag of every output function's result info so the
 * functions can run again on the next table.
 *
 * @param pRuntimeEnv  runtime environment whose pCtx array is visited
 */
static void enableExecutionForNextTable(SQueryRuntimeEnv *pRuntimeEnv) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
    SResultInfo *pResInfo = GET_RES_INFO(&pRuntimeEnv->pCtx[i]);
    if (pResInfo != NULL) {
      pResInfo->complete = false;
    }
  }
}

H
Haojun Liao 已提交
4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335
/**
 * Prepare the execution environment for one data block of one table.
 *
 * Interval queries get their query range set from the start key of the block
 * and, when tag results or a TS buffer are involved, the additional per-table
 * info. All other queries get their execution context set for the table's
 * group, with the key one step past the end key of the block.
 *
 * @param pQInfo           query being executed
 * @param pTableQueryInfo  per-table query state of the table owning the block
 * @param pBlockInfo       description of the block about to be processed
 */
static FORCE_INLINE void setEnvForEachBlock(SQInfo* pQInfo, STableQueryInfo* pTableQueryInfo, SDataBlockInfo* pBlockInfo) {
  SQueryRuntimeEnv* pEnv = &pQInfo->runtimeEnv;
  SQuery* pQuery = pQInfo->runtimeEnv.pQuery;

  if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
    // interval query: the range starts at the first key of the block
    TSKEY rangeStart = pBlockInfo->window.skey;
    setIntervalQueryRange(pQInfo, rangeStart);

    bool needAdditionalInfo = pEnv->hasTagResults || pEnv->pTSBuf != NULL;
    if (needAdditionalInfo) {
      setAdditionalInfo(pQInfo, pTableQueryInfo->pTable, pTableQueryInfo);
    }

    return;
  }

  // non-interval query: continue one step past the end key of the block
  int32_t direction = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);
  setExecutionContext(pQInfo, pTableQueryInfo->groupIndex, pBlockInfo->window.ekey + direction);
}

H
Haojun Liao 已提交
4336
/**
 * Scan every data block delivered by the query handle of the current scan
 * (primary handle during the master scan, secondary handle otherwise) and
 * apply the query functions to it.
 *
 * For each block the owning table is looked up by tid, made the current table
 * and, unless the query groups by a normal column, its execution environment
 * is set. Blocks that loadDataBlockOnDemand() reports as BLK_DATA_DISCARD only
 * move lastKey past the block.
 *
 * Leaves via longjmp(pRuntimeEnv->env, ...) when fetching a block fails with
 * terrno set, or when the query has been killed.
 *
 * @param pQInfo  query being executed
 * @return difference of taosGetTimestampMs() taken at exit and at entry
 */
static int64_t scanMultiTableDataBlocks(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery*           pQuery = pRuntimeEnv->pQuery;
  SQueryCostInfo*   summary  = &pRuntimeEnv->summary;

  int64_t st = taosGetTimestampMs();

  TsdbQueryHandleT pQueryHandle = IS_MASTER_SCAN(pRuntimeEnv)? pRuntimeEnv->pQueryHandle : pRuntimeEnv->pSecQueryHandle;
  SDataBlockInfo blockInfo = SDATA_BLOCK_INITIALIZER;

  int32_t step = GET_FORWARD_DIRECTION_FACTOR(pQuery->order.order);

  while (true) {
    if (!tsdbNextDataBlock(pQueryHandle)) {
      // distinguish "no more blocks" from a failure reported through terrno
      if (terrno != TSDB_CODE_SUCCESS) {
        longjmp(pRuntimeEnv->env, terrno);
      }
      break;
    }

    summary->totalBlocks += 1;

    if (IS_QUERY_KILLED(pQInfo)) {
      longjmp(pRuntimeEnv->env, TSDB_CODE_TSC_QUERY_CANCELLED);
    }

    tsdbRetrieveDataBlockInfo(pQueryHandle, &blockInfo);
    STableQueryInfo **pTableQueryInfo = (STableQueryInfo**) taosHashGet(pQInfo->tableqinfoGroupInfo.map, &blockInfo.tid, sizeof(blockInfo.tid));
    if(pTableQueryInfo == NULL) {
      // the block belongs to a table unknown to this query: stop scanning
      break;
    }

    pQuery->current = *pTableQueryInfo;
    CHECK_QUERY_TIME_RANGE(pQuery, *pTableQueryInfo);

    if (!pRuntimeEnv->groupbyNormalCol) {
      setEnvForEachBlock(pQInfo, *pTableQueryInfo, &blockInfo);
    }

    SDataStatis *pStatis = NULL;
    SArray *pDataBlock = NULL;
    if (loadDataBlockOnDemand(pRuntimeEnv, pQueryHandle, &blockInfo, &pStatis, &pDataBlock) == BLK_DATA_DISCARD) {
      // block discarded: just move lastKey one step past it
      pQuery->current->lastKey = QUERY_IS_ASC_QUERY(pQuery)? blockInfo.window.ekey + step:blockInfo.window.skey + step;
      continue;
    }

    summary->totalRows += blockInfo.rows;
    stableApplyFunctionsOnBlock(pRuntimeEnv, &blockInfo, pStatis, pDataBlock, binarySearchForKey);

    qDebug("QInfo:%p check data block, uid:%"PRId64", tid:%d, brange:%" PRId64 "-%" PRId64 ", numOfRows:%d, lastKey:%" PRId64,
           pQInfo, blockInfo.uid, blockInfo.tid, blockInfo.window.skey, blockInfo.window.ekey, blockInfo.rows, pQuery->current->lastKey);
  }

  updateWindowResNumOfRes(pRuntimeEnv);

  int64_t et = taosGetTimestampMs();
  return et - st;
}

4395 4396
/**
 * Prepare the runtime environment for querying a single table of group 0.
 *
 * Marks the query as not completed, sets the tag values when required,
 * replaces the current query handle by one that covers only the selected
 * table in the range [lastKey, win.ekey], positions the TS buffer (if any)
 * and initialises the output buffers of the function contexts.
 *
 * Leaves via longjmp(pRuntimeEnv->env, terrno) when the new query handle
 * cannot be created.
 *
 * @param pQInfo  query being executed
 * @param index   index of the table inside table group 0
 * @return false when a TS buffer is attached, no cursor has been set yet and
 *         the buffer holds no data for the table's tag value; true otherwise
 */
static bool multiTableMultioutputHelper(SQInfo *pQInfo, int32_t index) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
  SArray *group = GET_TABLEGROUP(pQInfo, 0);
  STableQueryInfo* pCheckInfo = taosArrayGetP(group, index);

  if (pRuntimeEnv->hasTagResults || pRuntimeEnv->pTSBuf != NULL) {
    setTagVal(pRuntimeEnv, pCheckInfo->pTable, pQInfo->tsdb);
  }

  STableId* id = TSDB_TABLEID(pCheckInfo->pTable);
  qDebug("QInfo:%p query on (%d): uid:%" PRIu64 ", tid:%d, qrange:%" PRId64 "-%" PRId64, pQInfo, index,
         id->uid, id->tid, pCheckInfo->lastKey, pCheckInfo->win.ekey);

  // resume from where the previous scan of this table stopped
  STsdbQueryCond cond = {
      .twindow   = {pCheckInfo->lastKey, pCheckInfo->win.ekey},
      .order     = pQuery->order.order,
      .colList   = pQuery->colList,
      .numOfCols = pQuery->numOfCols,
  };

  // todo refactor
  // build a temporary one-group / one-table group list for the handle
  SArray *g1 = taosArrayInit(1, POINTER_BYTES);
  SArray *tx = taosArrayInit(1, POINTER_BYTES);

  taosArrayPush(tx, &pCheckInfo->pTable);
  taosArrayPush(g1, &tx);
  STableGroupInfo gp = {.numOfTables = 1, .pGroupList = g1};

  // include only current table
  if (pRuntimeEnv->pQueryHandle != NULL) {
    tsdbCleanupQueryHandle(pRuntimeEnv->pQueryHandle);
    pRuntimeEnv->pQueryHandle = NULL;
  }

  pRuntimeEnv->pQueryHandle = tsdbQueryTables(pQInfo->tsdb, &cond, &gp, pQInfo);
  taosArrayDestroy(tx);
  taosArrayDestroy(g1);
  if (pRuntimeEnv->pQueryHandle == NULL) {
    longjmp(pRuntimeEnv->env, terrno);
  }

  if (pRuntimeEnv->pTSBuf != NULL) {
    if (pRuntimeEnv->cur.vgroupIndex == -1) {
      // no cursor yet: locate the start position for this table's tag value
      int64_t tag = pRuntimeEnv->pCtx[0].tag.i64Key;
      STSElem elem = tsBufGetElemStartPos(pRuntimeEnv->pTSBuf, 0, tag);

      // failed to find data with the specified tag value
      if (elem.vnode < 0) {
        return false;
      }
    } else {
      tsBufSetCursor(pRuntimeEnv->pTSBuf, &pRuntimeEnv->cur);
    }
  }

  initCtxOutputBuf(pRuntimeEnv);
  return true;
}

/**
 * super table query handler
 * 1. super table projection query, group-by on normal columns query, ts-comp query
 * 2. point interpolation query, last row query
 *
 * @param pQInfo
 */
4464
static void sequentialTableProcess(SQInfo *pQInfo) {
4465
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
4466
  SQuery *          pQuery = pRuntimeEnv->pQuery;
4467
  setQueryStatus(pQuery, QUERY_COMPLETED);
4468

H
Haojun Liao 已提交
4469
  size_t numOfGroups = GET_NUM_OF_TABLEGROUP(pQInfo);
4470

H
Haojun Liao 已提交
4471
  if (isPointInterpoQuery(pQuery) || isFirstLastRowQuery(pQuery)) {
4472 4473
    resetCtxOutputBuf(pRuntimeEnv);
    assert(pQuery->limit.offset == 0 && pQuery->limit.limit != 0);
4474

4475
    while (pQInfo->groupIndex < numOfGroups) {
H
Haojun Liao 已提交
4476
      SArray* group = taosArrayGetP(pQInfo->tableGroupInfo.pGroupList, pQInfo->groupIndex);
4477

4478
      qDebug("QInfo:%p last_row query on group:%d, total group:%zu, current group:%p", pQInfo, pQInfo->groupIndex,
dengyihao's avatar
dengyihao 已提交
4479
             numOfGroups, group);
H
Haojun Liao 已提交
4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499

      STsdbQueryCond cond = {
          .twindow = pQuery->window,
          .colList = pQuery->colList,
          .order   = pQuery->order.order,
          .numOfCols = pQuery->numOfCols,
      };

      SArray *g1 = taosArrayInit(1, POINTER_BYTES);
      SArray *tx = taosArrayClone(group);
      taosArrayPush(g1, &tx);
      
      STableGroupInfo gp = {.numOfTables = taosArrayGetSize(tx), .pGroupList = g1};

      // include only current table
      if (pRuntimeEnv->pQueryHandle != NULL) {
        tsdbCleanupQueryHandle(pRuntimeEnv->pQueryHandle);
        pRuntimeEnv->pQueryHandle = NULL;
      }
      
4500
      if (isFirstLastRowQuery(pQuery)) {
H
Haojun Liao 已提交
4501
        pRuntimeEnv->pQueryHandle = tsdbQueryLastRow(pQInfo->tsdb, &cond, &gp, pQInfo);
H
Haojun Liao 已提交
4502
      } else {
H
Haojun Liao 已提交
4503
        pRuntimeEnv->pQueryHandle = tsdbQueryRowsInExternalWindow(pQInfo->tsdb, &cond, &gp, pQInfo);
4504
      }
B
Bomin Zhang 已提交
4505 4506 4507 4508 4509 4510

      taosArrayDestroy(tx);
      taosArrayDestroy(g1);
      if (pRuntimeEnv->pQueryHandle == NULL) {
        longjmp(pRuntimeEnv->env, terrno);
      }
H
Haojun Liao 已提交
4511

H
Haojun Liao 已提交
4512
      initCtxOutputBuf(pRuntimeEnv);
4513
      
4514
      SArray* s = tsdbGetQueriedTableList(pRuntimeEnv->pQueryHandle);
4515 4516
      assert(taosArrayGetSize(s) >= 1);
      
4517
      setTagVal(pRuntimeEnv, taosArrayGetP(s, 0), pQInfo->tsdb);
4518 4519 4520
      if (isFirstLastRowQuery(pQuery)) {
        assert(taosArrayGetSize(s) == 1);
      }
H
Haojun Liao 已提交
4521

dengyihao's avatar
dengyihao 已提交
4522
      taosArrayDestroy(s);
H
Haojun Liao 已提交
4523

H
Haojun Liao 已提交
4524
      // here we simply set the first table as current table
4525 4526 4527
      SArray* first = GET_TABLEGROUP(pQInfo, pQInfo->groupIndex);
      pQuery->current = taosArrayGetP(first, 0);

4528
      scanOneTableDataBlocks(pRuntimeEnv, pQuery->current->lastKey);
H
Haojun Liao 已提交
4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540
      
      int64_t numOfRes = getNumOfResult(pRuntimeEnv);
      if (numOfRes > 0) {
        pQuery->rec.rows += numOfRes;
        forwardCtxOutputBuf(pRuntimeEnv, numOfRes);
      }
      
      skipResults(pRuntimeEnv);
      pQInfo->groupIndex += 1;

      // enable execution for next table, when handling the projection query
      enableExecutionForNextTable(pRuntimeEnv);
4541 4542 4543 4544 4545 4546

      if (pQuery->rec.rows >= pQuery->rec.capacity) {
        setQueryStatus(pQuery, QUERY_RESBUF_FULL);
        break;
      }
    }
H
Haojun Liao 已提交
4547
  } else if (pRuntimeEnv->groupbyNormalCol) { // group-by on normal columns query
4548
    while (pQInfo->groupIndex < numOfGroups) {
H
Haojun Liao 已提交
4549
      SArray* group = taosArrayGetP(pQInfo->tableGroupInfo.pGroupList, pQInfo->groupIndex);
4550

4551
      qDebug("QInfo:%p group by normal columns group:%d, total group:%zu", pQInfo, pQInfo->groupIndex, numOfGroups);
4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572

      STsdbQueryCond cond = {
          .twindow = pQuery->window,
          .colList = pQuery->colList,
          .order   = pQuery->order.order,
          .numOfCols = pQuery->numOfCols,
      };

      SArray *g1 = taosArrayInit(1, POINTER_BYTES);
      SArray *tx = taosArrayClone(group);
      taosArrayPush(g1, &tx);

      STableGroupInfo gp = {.numOfTables = taosArrayGetSize(tx), .pGroupList = g1};

      // include only current table
      if (pRuntimeEnv->pQueryHandle != NULL) {
        tsdbCleanupQueryHandle(pRuntimeEnv->pQueryHandle);
        pRuntimeEnv->pQueryHandle = NULL;
      }

      pRuntimeEnv->pQueryHandle = tsdbQueryTables(pQInfo->tsdb, &cond, &gp, pQInfo);
B
Bomin Zhang 已提交
4573 4574
      taosArrayDestroy(g1);
      taosArrayDestroy(tx);
B
Bomin Zhang 已提交
4575 4576 4577
      if (pRuntimeEnv->pQueryHandle == NULL) {
        longjmp(pRuntimeEnv->env, terrno);
      }
4578

4579
      SArray* s = tsdbGetQueriedTableList(pRuntimeEnv->pQueryHandle);
4580 4581
      assert(taosArrayGetSize(s) >= 1);

4582
      setTagVal(pRuntimeEnv, taosArrayGetP(s, 0), pQInfo->tsdb);
4583 4584 4585 4586 4587 4588 4589 4590

      // here we simply set the first table as current table
      scanMultiTableDataBlocks(pQInfo);
      pQInfo->groupIndex += 1;

      SWindowResInfo *pWindowResInfo = &pRuntimeEnv->windowResInfo;

        // no results generated for current group, continue to try the next group
dengyihao's avatar
dengyihao 已提交
4591
      taosArrayDestroy(s); 
4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605
      if (pWindowResInfo->size <= 0) {
        continue;
      }

      for (int32_t i = 0; i < pWindowResInfo->size; ++i) {
        SWindowStatus *pStatus = &pWindowResInfo->pResult[i].status;
        pStatus->closed = true;  // enable return all results for group by normal columns

        SWindowResult *pResult = &pWindowResInfo->pResult[i];
        for (int32_t j = 0; j < pQuery->numOfOutput; ++j) {
          pResult->numOfRows = MAX(pResult->numOfRows, pResult->resultInfo[j].numOfRes);
        }
      }

4606
      qDebug("QInfo:%p generated groupby columns results %d rows for group %d completed", pQInfo, pWindowResInfo->size,
4607 4608 4609 4610 4611 4612 4613
          pQInfo->groupIndex);
      int32_t currentGroupIndex = pQInfo->groupIndex;

      pQuery->rec.rows = 0;
      pQInfo->groupIndex = 0;

      ensureOutputBufferSimple(pRuntimeEnv, pWindowResInfo->size);
4614
      copyFromWindowResToSData(pQInfo, pWindowResInfo);
4615 4616 4617 4618 4619 4620

      pQInfo->groupIndex = currentGroupIndex;  //restore the group index
      assert(pQuery->rec.rows == pWindowResInfo->size);

      clearClosedTimeWindow(pRuntimeEnv);
      break;
4621 4622 4623
    }
  } else {
    /*
4624
     * 1. super table projection query, 2. ts-comp query
4625 4626 4627
     * if the subgroup index is larger than 0, results generated by group by tbname,k is existed.
     * we need to return it to client in the first place.
     */
4628
    if (pQInfo->groupIndex > 0) {
4629
      copyFromWindowResToSData(pQInfo, &pRuntimeEnv->windowResInfo);
4630
      pQuery->rec.total += pQuery->rec.rows;
4631

4632
      if (pQuery->rec.rows > 0) {
4633 4634 4635
        return;
      }
    }
4636

4637
    // all data have returned already
4638
    if (pQInfo->tableIndex >= pQInfo->tableqinfoGroupInfo.numOfTables) {
4639 4640
      return;
    }
4641

4642 4643
    resetCtxOutputBuf(pRuntimeEnv);
    resetTimeWindowInfo(pRuntimeEnv, &pRuntimeEnv->windowResInfo);
4644

H
Haojun Liao 已提交
4645
    SArray *group = GET_TABLEGROUP(pQInfo, 0);
4646 4647
    assert(taosArrayGetSize(group) == pQInfo->tableqinfoGroupInfo.numOfTables &&
           1 == taosArrayGetSize(pQInfo->tableqinfoGroupInfo.pGroupList));
4648

4649
    while (pQInfo->tableIndex < pQInfo->tableqinfoGroupInfo.numOfTables) {
H
Haojun Liao 已提交
4650
      if (IS_QUERY_KILLED(pQInfo)) {
H
Haojun Liao 已提交
4651
        longjmp(pRuntimeEnv->env, TSDB_CODE_TSC_QUERY_CANCELLED);
4652
      }
4653

4654
      pQuery->current = taosArrayGetP(group, pQInfo->tableIndex);
4655
      if (!multiTableMultioutputHelper(pQInfo, pQInfo->tableIndex)) {
4656
        pQInfo->tableIndex++;
4657 4658
        continue;
      }
4659

H
hjxilinx 已提交
4660
      // TODO handle the limit offset problem
4661
      if (pQuery->numOfFilterCols == 0 && pQuery->limit.offset > 0) {
4662 4663
        if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
          pQInfo->tableIndex++;
4664 4665 4666
          continue;
        }
      }
4667

4668
      scanOneTableDataBlocks(pRuntimeEnv, pQuery->current->lastKey);
4669
      skipResults(pRuntimeEnv);
4670

4671
      // the limitation of output result is reached, set the query completed
4672
      if (limitResults(pRuntimeEnv)) {
4673
        pQInfo->tableIndex = pQInfo->tableqinfoGroupInfo.numOfTables;
4674 4675
        break;
      }
4676

4677 4678
      // enable execution for next table, when handling the projection query
      enableExecutionForNextTable(pRuntimeEnv);
4679

4680
      if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
4681 4682 4683 4684 4685 4686
        /*
         * query range is identical in terms of all meters involved in query,
         * so we need to restore them at the *beginning* of query on each meter,
         * not the consecutive query on meter on which is aborted due to buffer limitation
         * to ensure that, we can reset the query range once query on a meter is completed.
         */
4687
        pQInfo->tableIndex++;
weixin_48148422's avatar
weixin_48148422 已提交
4688

H
Haojun Liao 已提交
4689
        STableIdInfo tidInfo = {0};
4690

H
Haojun Liao 已提交
4691 4692 4693
        STableId* id = TSDB_TABLEID(pQuery->current->pTable);
        tidInfo.uid = id->uid;
        tidInfo.tid = id->tid;
weixin_48148422's avatar
weixin_48148422 已提交
4694
        tidInfo.key = pQuery->current->lastKey;
weixin_48148422's avatar
weixin_48148422 已提交
4695 4696
        taosArrayPush(pQInfo->arrTableIdInfo, &tidInfo);

4697
        // if the buffer is full or group by each table, we need to jump out of the loop
4698 4699
        if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL) /*||
            isGroupbyEachTable(pQuery->pGroupbyExpr, pSupporter->pSidSet)*/) {
4700 4701
          break;
        }
4702

4703
      } else {
4704
        // all data in the result buffer are skipped due to the offset, continue to retrieve data from current meter
4705 4706
        if (pQuery->rec.rows == 0) {
          assert(!Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL));
4707 4708
          continue;
        } else {
4709 4710 4711
          // buffer is full, wait for the next round to retrieve data from current meter
          assert(Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL));
          break;
4712 4713 4714
        }
      }
    }
H
Haojun Liao 已提交
4715

4716
    if (pQInfo->tableIndex >= pQInfo->tableqinfoGroupInfo.numOfTables) {
H
Haojun Liao 已提交
4717 4718
      setQueryStatus(pQuery, QUERY_COMPLETED);
    }
4719
  }
4720

4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732
  /*
   * 1. super table projection query, group-by on normal columns query, ts-comp query
   * 2. point interpolation query, last row query
   *
   * group-by on normal columns query and last_row query do NOT invoke the finalizer here,
   * since the finalize stage will be done at the client side.
   *
   * projection query, point interpolation query do not need the finalizer.
   *
   * Only the ts-comp query requires the finalizer function to be executed here.
   */
  if (isTSCompQuery(pQuery)) {
H
hjxilinx 已提交
4733
    finalizeQueryResult(pRuntimeEnv);
4734
  }
4735

4736 4737 4738
  if (pRuntimeEnv->pTSBuf != NULL) {
    pRuntimeEnv->cur = pRuntimeEnv->pTSBuf->cur;
  }
4739

4740
  qDebug(
B
Bomin Zhang 已提交
4741
      "QInfo %p numOfTables:%"PRIu64", index:%d, numOfGroups:%zu, %"PRId64" points returned, total:%"PRId64", offset:%" PRId64,
4742
      pQInfo, pQInfo->tableqinfoGroupInfo.numOfTables, pQInfo->tableIndex, numOfGroups, pQuery->rec.rows, pQuery->rec.total,
4743
      pQuery->limit.offset);
4744 4745
}

4746 4747 4748 4749
/*
 * Switch the runtime environment into reverse-scan mode.
 *
 * The scan flag is set to REVERSE_SCAN, the start/end keys of the query window are swapped and
 * the scan order is flipped (the new order is mirrored into the ts buffer cursor when a ts buffer
 * exists). A secondary tsdb query handle is then opened with the flipped condition; a secondary
 * handle left over from an earlier call is cleaned up first.
 *
 * This function does not return when the handle cannot be opened: it longjmp()s to
 * pRuntimeEnv->env with terrno.
 */
static void doSaveContext(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery           *pQuery = pRuntimeEnv->pQuery;

  // flip the scan direction of the query itself
  SET_REVERSE_SCAN_FLAG(pRuntimeEnv);
  SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);
  SWITCH_ORDER(pQuery->order.order);

  // keep the ts buffer cursor in step with the flipped order
  if (pRuntimeEnv->pTSBuf != NULL) {
    pRuntimeEnv->pTSBuf->cur.order = pQuery->order.order;
  }

  // the condition is built after the swap, so it already describes the reversed scan
  STsdbQueryCond cond = {
      .twindow   = pQuery->window,
      .order     = pQuery->order.order,
      .colList   = pQuery->colList,
      .numOfCols = pQuery->numOfCols,
  };

  // release the handle of a previous reverse scan before opening a new one
  if (pRuntimeEnv->pSecQueryHandle != NULL) {
    tsdbCleanupQueryHandle(pRuntimeEnv->pSecQueryHandle);
  }

  pRuntimeEnv->prevGroupId = INT32_MIN;

  pRuntimeEnv->pSecQueryHandle = tsdbQueryTables(pQInfo->tsdb, &cond, &pQInfo->tableGroupInfo, pQInfo);
  if (pRuntimeEnv->pSecQueryHandle == NULL) {
    longjmp(pRuntimeEnv->env, terrno);
  }

  setQueryStatus(pQuery, QUERY_NOT_COMPLETED);
  switchCtxOrder(pRuntimeEnv);
  disableFuncInReverseScan(pQInfo);
}

4781 4782 4783 4784
/*
 * Undo the window/order changes made for the reverse scan: swap the window keys back, flip the
 * ts buffer cursor order (if a ts buffer exists), switch the context order again and mark the
 * runtime as being in the master scan.
 */
static void doRestoreContext(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery           *pQuery = pRuntimeEnv->pQuery;

  // put the original start/end keys back in place
  SWAP(pQuery->window.skey, pQuery->window.ekey, TSKEY);

  if (NULL != pRuntimeEnv->pTSBuf) {
    SWITCH_ORDER(pRuntimeEnv->pTSBuf->cur.order);
  }

  switchCtxOrder(pRuntimeEnv);
  SET_MASTER_SCAN_FLAG(pRuntimeEnv);
}

4795 4796 4797
/*
 * Close every time window once a scan has finished.
 *
 * For an interval query the window result info of each table in each table group is closed;
 * otherwise only the window result info held by the runtime environment (the group result)
 * is closed.
 */
static void doCloseAllTimeWindowAfterScan(SQInfo *pQInfo) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  // not an interval query: only the group result needs to be closed
  if (!QUERY_IS_INTERVAL_QUERY(pQuery)) {
    closeAllTimeWindow(&pQInfo->runtimeEnv.windowResInfo);
    return;
  }

  size_t groupCount = GET_NUM_OF_TABLEGROUP(pQInfo);
  for (int32_t g = 0; g < groupCount; ++g) {
    SArray *tables = GET_TABLEGROUP(pQInfo, g);
    size_t  tableCount = taosArrayGetSize(tables);

    for (int32_t t = 0; t < tableCount; ++t) {
      STableQueryInfo *tableInfo = taosArrayGetP(tables, t);
      closeAllTimeWindow(&tableInfo->windowResInfo);
    }
  }
}

/*
 * Abort the multi-table query when an error code has been recorded or the query has been killed.
 * The query result is finalized to clean up resources allocated during the query, then control
 * leaves through longjmp() with TSDB_CODE_TSC_QUERY_CANCELLED. Returns normally otherwise.
 */
static void abortMultiTableQueryOnFailure(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;

  if (pQInfo->code != TSDB_CODE_SUCCESS || IS_QUERY_KILLED(pQInfo)) {
    qDebug("QInfo:%p query killed or error occurred, code:%s, abort", pQInfo, tstrerror(pQInfo->code));
    finalizeQueryResult(pRuntimeEnv); // clean up allocated resource during query
    longjmp(pRuntimeEnv->env, TSDB_CODE_TSC_QUERY_CANCELLED);
  }
}

/*
 * Execute a query over multiple tables.
 *
 * When pQInfo->groupIndex > 0 no scan is performed; the already generated results are only
 * copied into the output buffer. Otherwise the master scan is executed, all time windows are
 * closed, an optional reverse scan follows, the query is marked completed and the results are
 * merged/copied into the output buffer.
 *
 * The function leaves through longjmp() if an error code is set or the query is killed after
 * either scan.
 */
static void multiTableQueryProcess(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery           *pQuery = pRuntimeEnv->pQuery;

  // results of an earlier round are still pending: just hand them out
  if (pQInfo->groupIndex > 0) {
    if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
      copyResToQueryResultBuf(pQInfo, pQuery);
#ifdef _DEBUG_VIEW
      displayInterResult(pQuery->sdata, pRuntimeEnv, pQuery->sdata[0]->num);
#endif
    } else {
      copyFromWindowResToSData(pQInfo, &pRuntimeEnv->windowResInfo);
    }

    qDebug("QInfo:%p current:%"PRId64", total:%"PRId64"", pQInfo, pQuery->rec.rows, pQuery->rec.total);
    return;
  }

  qDebug("QInfo:%p query start, qrange:%" PRId64 "-%" PRId64 ", order:%d, forward scan start", pQInfo,
         pQuery->window.skey, pQuery->window.ekey, pQuery->order.order);

  // master scan: check all qualified data blocks
  int64_t elapsed = scanMultiTableDataBlocks(pQInfo);
  qDebug("QInfo:%p master scan completed, elapsed time: %" PRId64 "ms, reverse scan start", pQInfo, elapsed);

  // query error occurred or query is killed, abort current execution
  abortMultiTableQueryOnFailure(pQInfo);

  // close all time window results
  doCloseAllTimeWindowAfterScan(pQInfo);

  if (!needReverseScan(pQuery)) {
    qDebug("QInfo:%p no need to do reversed scan, query completed", pQInfo);
  } else {
    doSaveContext(pQInfo);

    elapsed = scanMultiTableDataBlocks(pQInfo);
    qDebug("QInfo:%p reversed scan completed, elapsed time: %" PRId64 "ms", pQInfo, elapsed);

    doRestoreContext(pQInfo);
  }

  setQueryStatus(pQuery, QUERY_COMPLETED);

  // the reverse scan may have failed or the query may have been killed in the meantime
  abortMultiTableQueryOnFailure(pQInfo);

  if (QUERY_IS_INTERVAL_QUERY(pQuery) || isSumAvgRateQuery(pQuery)) {
    // results are copied only when the merge succeeded
    if (mergeIntoGroupResult(pQInfo) == TSDB_CODE_SUCCESS) {
      copyResToQueryResultBuf(pQInfo, pQuery);

#ifdef _DEBUG_VIEW
      displayInterResult(pQuery->sdata, pRuntimeEnv, pQuery->sdata[0]->num);
#endif
    }
  } else {  // not a interval query
    copyFromWindowResToSData(pQInfo, &pRuntimeEnv->windowResInfo);
  }

  // handle the limitation of output buffer
  qDebug("QInfo:%p points returned:%" PRId64 ", total:%" PRId64, pQInfo, pQuery->rec.rows, pQuery->rec.total + pQuery->rec.rows);
}

/*
 * in each query, this function will be called only once, no retry for further result.
 *
 * select count(*)/top(field,k)/avg(field name) from table_name [where ts>now-1a];
 * select count(*) from table_name group by status_column;
 */
H
hjxilinx 已提交
4895
static void tableFixedOutputProcess(SQInfo *pQInfo, STableQueryInfo* pTableInfo) {
4896
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
H
hjxilinx 已提交
4897 4898
  
  SQuery *pQuery = pRuntimeEnv->pQuery;
H
Haojun Liao 已提交
4899
  if (!pRuntimeEnv->topBotQuery && pQuery->limit.offset > 0) {  // no need to execute, since the output will be ignore.
H
Haojun Liao 已提交
4900 4901 4902
    return;
  }
  
H
hjxilinx 已提交
4903 4904
  pQuery->current = pTableInfo;  // set current query table info
  
4905
  scanOneTableDataBlocks(pRuntimeEnv, pTableInfo->lastKey);
H
hjxilinx 已提交
4906
  finalizeQueryResult(pRuntimeEnv);
4907

H
Haojun Liao 已提交
4908
  if (IS_QUERY_KILLED(pQInfo)) {
H
Haojun Liao 已提交
4909 4910
    finalizeQueryResult(pRuntimeEnv); // clean up allocated resource during query
    longjmp(pRuntimeEnv->env, TSDB_CODE_TSC_QUERY_CANCELLED);
4911
  }
4912

H
Haojun Liao 已提交
4913
  // since the numOfRows must be identical for all sql functions that are allowed to be executed simutaneously.
4914
  pQuery->rec.rows = getNumOfResult(pRuntimeEnv);
4915

4916
  skipResults(pRuntimeEnv);
4917
  limitResults(pRuntimeEnv);
4918 4919
}

H
hjxilinx 已提交
4920
/*
 * Process a multi-output query on one table.
 *
 * The table is scanned repeatedly until either some rows are produced or the query status
 * becomes QUERY_COMPLETED. When the query is completed, the table id together with its last
 * key is appended to pQInfo->arrTableIdInfo.
 *
 * @param pQInfo      query info that owns the runtime environment
 * @param pTableInfo  table to be queried; becomes pQuery->current
 */
static void tableMultiOutputProcess(SQInfo *pQInfo, STableQueryInfo* pTableInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;

  SQuery *pQuery = pRuntimeEnv->pQuery;
  pQuery->current = pTableInfo;

  // for ts_comp query, re-initialized is not allowed
  if (!isTSCompQuery(pQuery)) {
    resetCtxOutputBuf(pRuntimeEnv);
  }

  // skip blocks without load the actual data block from file if no filter condition present
  skipBlocks(&pQInfo->runtimeEnv);
  if (pQuery->limit.offset > 0 && pQuery->numOfFilterCols == 0) {
    // offset still pending after skipping blocks and no filter involved: nothing to return
    setQueryStatus(pQuery, QUERY_COMPLETED);
    return;
  }

  while (1) {
    scanOneTableDataBlocks(pRuntimeEnv, pQuery->current->lastKey);
    finalizeQueryResult(pRuntimeEnv);

    pQuery->rec.rows = getNumOfResult(pRuntimeEnv);
    if (pQuery->limit.offset > 0 && pQuery->numOfFilterCols > 0 && pQuery->rec.rows > 0) {
      skipResults(pRuntimeEnv);
    }

    /*
     * Stop as soon as rows are available or the query is completed. Otherwise no row is left
     * in the output for this round and the scan has to continue.
     */
    if (pQuery->rec.rows > 0 || Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
      break;
    }

    qDebug("QInfo:%p skip current result, offset:%" PRId64 ", next qrange:%" PRId64 "-%" PRId64,
           pQInfo, pQuery->limit.offset, pQuery->current->lastKey, pQuery->current->win.ekey);

    resetCtxOutputBuf(pRuntimeEnv);
  }

  limitResults(pRuntimeEnv);
  if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL)) {
    qDebug("QInfo:%p query paused due to output limitation, next qrange:%" PRId64 "-%" PRId64, pQInfo,
        pQuery->current->lastKey, pQuery->window.ekey);
  } else if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
    // zero-initialize so that nothing uninitialized is copied into the array by taosArrayPush
    STableIdInfo tidInfo = {0};
    STableId* id = TSDB_TABLEID(pQuery->current->pTable);

    tidInfo.uid = id->uid;
    tidInfo.tid = id->tid;
    tidInfo.key = pQuery->current->lastKey;
    taosArrayPush(pQInfo->arrTableIdInfo, &tidInfo);
  }

  if (!isTSCompQuery(pQuery)) {
    assert(pQuery->rec.rows <= pQuery->rec.capacity);
  }
}

H
Haojun Liao 已提交
4980
/*
 * Scan the data blocks of the current table starting at `start` and finalize the result,
 * repeating until the query status is QUERY_COMPLETED or QUERY_RESBUF_FULL.
 *
 * When a filter or a ts buffer is involved, an offset is pending and no fill is requested,
 * closed time windows are dropped from the front of the window results and the pending
 * offset is reduced by the number of dropped windows.
 */
static void tableIntervalProcessImpl(SQueryRuntimeEnv *pRuntimeEnv, TSKEY start) {
  SQuery *pQuery = pRuntimeEnv->pQuery;

  do {
    scanOneTableDataBlocks(pRuntimeEnv, start);

    assert(!Q_STATUS_EQUAL(pQuery->status, QUERY_NOT_COMPLETED));
    finalizeQueryResult(pRuntimeEnv);

    // here we can ignore the records in case of no interpolation
    // todo handle offset, in case of top/bottom interval query
    bool hasFilterOrTsBuf = (pQuery->numOfFilterCols > 0) || (pRuntimeEnv->pTSBuf != NULL);
    if (hasFilterOrTsBuf && pQuery->limit.offset > 0 && pQuery->fillType == TSDB_FILL_NONE) {
      int32_t closedWindows = numOfClosedTimeWindow(&pRuntimeEnv->windowResInfo);

      // never drop more windows than the remaining offset
      int32_t dropped = MIN(closedWindows, pQuery->limit.offset);
      clearFirstNTimeWindow(pRuntimeEnv, dropped);
      pQuery->limit.offset -= dropped;
    }
  } while (!Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED | QUERY_RESBUF_FULL));
}

5007
// handle time interval query on table
H
hjxilinx 已提交
5008
static void tableIntervalProcess(SQInfo *pQInfo, STableQueryInfo* pTableInfo) {
5009 5010
  SQueryRuntimeEnv *pRuntimeEnv = &(pQInfo->runtimeEnv);

H
hjxilinx 已提交
5011 5012
  SQuery *pQuery = pRuntimeEnv->pQuery;
  pQuery->current = pTableInfo;
5013

H
Haojun Liao 已提交
5014
  int32_t numOfFilled = 0;
H
Haojun Liao 已提交
5015 5016
  TSKEY newStartKey = TSKEY_INITIAL_VAL;
  
5017
  // skip blocks without load the actual data block from file if no filter condition present
H
Haojun Liao 已提交
5018
  skipTimeInterval(pRuntimeEnv, &newStartKey);
5019
  if (pQuery->limit.offset > 0 && pQuery->numOfFilterCols == 0 && pRuntimeEnv->pFillInfo == NULL) {
5020 5021 5022 5023
    setQueryStatus(pQuery, QUERY_COMPLETED);
    return;
  }

5024
  while (1) {
H
Haojun Liao 已提交
5025
    tableIntervalProcessImpl(pRuntimeEnv, newStartKey);
5026

H
Haojun Liao 已提交
5027
    if (QUERY_IS_INTERVAL_QUERY(pQuery)) {
5028
      pQInfo->groupIndex = 0;  // always start from 0
5029
      pQuery->rec.rows = 0;
5030
      copyFromWindowResToSData(pQInfo, &pRuntimeEnv->windowResInfo);
5031

5032
      clearFirstNTimeWindow(pRuntimeEnv, pQInfo->groupIndex);
5033
    }
5034

5035
    // the offset is handled at prepare stage if no interpolation involved
5036
    if (pQuery->fillType == TSDB_FILL_NONE || pQuery->rec.rows == 0) {
5037
      limitResults(pRuntimeEnv);
5038 5039
      break;
    } else {
H
Haojun Liao 已提交
5040
      taosFillSetStartInfo(pRuntimeEnv->pFillInfo, pQuery->rec.rows, pQuery->window.ekey);
5041
      taosFillCopyInputDataFromFilePage(pRuntimeEnv->pFillInfo, (tFilePage**) pQuery->sdata);
H
Haojun Liao 已提交
5042
      numOfFilled = 0;
5043
      
H
Haojun Liao 已提交
5044
      pQuery->rec.rows = doFillGapsInResults(pRuntimeEnv, (tFilePage **)pQuery->sdata, &numOfFilled);
5045
      if (pQuery->rec.rows > 0 || Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
5046
        limitResults(pRuntimeEnv);
5047 5048
        break;
      }
5049

5050
      // no result generated yet, continue retrieve data
5051
      pQuery->rec.rows = 0;
5052 5053
    }
  }
5054

5055
  // all data scanned, the group by normal column can return
H
Haojun Liao 已提交
5056
  if (pRuntimeEnv->groupbyNormalCol) {  // todo refactor with merge interval time result
5057
    pQInfo->groupIndex = 0;
5058
    pQuery->rec.rows = 0;
5059
    copyFromWindowResToSData(pQInfo, &pRuntimeEnv->windowResInfo);
5060
    clearFirstNTimeWindow(pRuntimeEnv, pQInfo->groupIndex);
5061
  }
5062

H
Haojun Liao 已提交
5063
  pQInfo->pointsInterpo += numOfFilled;
5064 5065
}

5066 5067 5068 5069
/*
 * Entry point for a query on exactly one table.
 *
 * Results left over from a previous round are returned first: either by continuing the fill of
 * gaps (when a fill type is set) or by copying the remaining window results. Only when nothing
 * is returned that way is the table processed by the interval, fixed-output or multi-output
 * routine. The elapsed time of that processing is accumulated in the runtime summary.
 */
static void tableQueryImpl(SQInfo *pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery           *pQuery = pRuntimeEnv->pQuery;

  if (queryHasRemainResults(pRuntimeEnv)) {
    if (pQuery->fillType != TSDB_FILL_NONE) {
      /*
       * Some results were not returned yet because of the result interpolation, so the fill
       * is continued here instead of launching the retrieve procedure for the next results.
       */
      int32_t filledPoints = 0;
      pQuery->rec.rows = doFillGapsInResults(pRuntimeEnv, (tFilePage **)pQuery->sdata, &filledPoints);

      if (pQuery->rec.rows > 0) {
        limitResults(pRuntimeEnv);
      }

      qDebug("QInfo:%p current:%" PRId64 " returned, total:%" PRId64, pQInfo, pQuery->rec.rows, pQuery->rec.total);
      return;
    }

    pQuery->rec.rows = 0;
    pQInfo->groupIndex = 0;  // always start from 0

    if (pRuntimeEnv->windowResInfo.size > 0) {
      copyFromWindowResToSData(pQInfo, &pRuntimeEnv->windowResInfo);
      clearFirstNTimeWindow(pRuntimeEnv, pQInfo->groupIndex);

      if (pQuery->rec.rows > 0) {
        qDebug("QInfo:%p %"PRId64" rows returned from group results, total:%"PRId64"", pQInfo, pQuery->rec.rows, pQuery->rec.total);

        // there are not data remains
        if (pRuntimeEnv->windowResInfo.size <= 0) {
          qDebug("QInfo:%p query over, %"PRId64" rows are returned", pQInfo, pQuery->rec.total);
        }

        return;
      }
    }
  }

  // number of points returned during this query
  pQuery->rec.rows = 0;
  int64_t startUs = taosGetTimestampUs();

  assert(pQInfo->tableqinfoGroupInfo.numOfTables == 1);
  SArray          *firstGroup = GET_TABLEGROUP(pQInfo, 0);
  STableQueryInfo *tableInfo = taosArrayGetP(firstGroup, 0);

  // group by normal column, sliding window query, interval query are handled by interval query processor
  if (QUERY_IS_INTERVAL_QUERY(pQuery) || pRuntimeEnv->groupbyNormalCol) {  // interval (down sampling operation)
    tableIntervalProcess(pQInfo, tableInfo);
  } else if (isFixedOutputQuery(pRuntimeEnv)) {
    tableFixedOutputProcess(pQInfo, tableInfo);
  } else {  // diff/add/multiply/subtract/division
    assert(pQuery->checkBuffer == 1);
    tableMultiOutputProcess(pQInfo, tableInfo);
  }

  // record the total elapsed time
  pRuntimeEnv->summary.elapsedTime += (taosGetTimestampUs() - startUs);
  assert(pQInfo->tableqinfoGroupInfo.numOfTables == 1);
}

5131
/*
 * Entry point for a query on a super table.
 *
 * Interval queries, and fixed-output queries that are neither point-interpolation,
 * group-by-normal-column nor first/last-row queries, are handled by multiTableQueryProcess.
 * Every other query is handled by sequentialTableProcess. The elapsed time is accumulated in
 * the runtime summary.
 */
static void stableQueryImpl(SQInfo *pQInfo) {
  SQueryRuntimeEnv* pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery           *pQuery = pRuntimeEnv->pQuery;

  pQuery->rec.rows = 0;

  int64_t startUs = taosGetTimestampUs();

  bool useMultiTableProcess =
      QUERY_IS_INTERVAL_QUERY(pQuery) ||
      (isFixedOutputQuery(pRuntimeEnv) && (!isPointInterpoQuery(pQuery)) && !pRuntimeEnv->groupbyNormalCol &&
       !isFirstLastRowQuery(pQuery));

  if (useMultiTableProcess) {
    multiTableQueryProcess(pQInfo);
  } else {
    assert((pQuery->checkBuffer == 1 && pQuery->intervalTime == 0) || isPointInterpoQuery(pQuery) ||
            isFirstLastRowQuery(pQuery) || pRuntimeEnv->groupbyNormalCol);

    sequentialTableProcess(pQInfo);
  }

  // record the total elapsed time
  pQInfo->runtimeEnv.summary.elapsedTime += (taosGetTimestampUs() - startUs);
}

5154
/*
 * Locate the source column referenced by an expression.
 *
 * @param pQueryMsg  query message providing numOfTags, numOfCols and colList
 * @param pExprMsg   expression whose colInfo (flag, colId) is looked up
 * @param pTagCols   tag column list; searched when the expression refers to a tag
 * @return  -1 when the expression is a tag whose colId equals TSDB_TBNAME_COLUMN_INDEX;
 *          otherwise the index of the matching column in pTagCols (tag) or in
 *          pQueryMsg->colList (normal column). If no column matches, the assertion fires in
 *          debug builds; when assertions are compiled out INT32_MIN is returned, which is
 *          distinct from every valid index and from the -1 above.
 */
static int32_t getColumnIndexInSource(SQueryTableMsg *pQueryMsg, SSqlFuncMsg *pExprMsg, SColumnInfo* pTagCols) {
  if (TSDB_COL_IS_TAG(pExprMsg->colInfo.flag)) {
    if (pExprMsg->colInfo.colId == TSDB_TBNAME_COLUMN_INDEX) {
      return -1;
    }

    for (int32_t j = 0; j < pQueryMsg->numOfTags; ++j) {
      if (pExprMsg->colInfo.colId == pTagCols[j].colId) {
        return j;
      }
    }
  } else {
    for (int32_t j = 0; j < pQueryMsg->numOfCols; ++j) {
      if (pExprMsg->colInfo.colId == pQueryMsg->colList[j].colId) {
        return j;
      }
    }
  }

  // the column must exist; never fall off the end of a non-void function when NDEBUG is defined
  assert(0);
  return INT32_MIN;
}

5183 5184 5185
/*
 * Check the column referenced by an expression against the source columns: the index returned
 * by getColumnIndexInSource must be smaller than the number of columns or the number of tags.
 */
bool validateExprColumnInfo(SQueryTableMsg *pQueryMsg, SSqlFuncMsg *pExprMsg, SColumnInfo* pTagCols) {
  int32_t index = getColumnIndexInSource(pQueryMsg, pExprMsg, pTagCols);

  if (index < pQueryMsg->numOfCols) {
    return true;
  }

  return index < pQueryMsg->numOfTags;
}

5188
/*
 * Sanity check of the numeric fields of a query message.
 *
 * The message is rejected (and the offending value is logged) when the interval time is
 * negative, the number of tables is not positive, the number of group-by columns is negative,
 * or the number of output columns is outside (0, TSDB_MAX_COLUMNS].
 *
 * @return true if all checked fields are legal, false otherwise
 */
static bool validateQueryMsg(SQueryTableMsg *pQueryMsg) {
  if (pQueryMsg->intervalTime < 0) {
    qError("qmsg:%p illegal value of interval time %" PRId64, pQueryMsg, pQueryMsg->intervalTime);
    return false;
  }

  // at least one table must be involved
  if (pQueryMsg->numOfTables <= 0) {
    qError("qmsg:%p illegal value of numOfTables %d", pQueryMsg, pQueryMsg->numOfTables);
    return false;
  }

  if (pQueryMsg->numOfGroupCols < 0) {
    qError("qmsg:%p illegal value of numOfGroupbyCols %d", pQueryMsg, pQueryMsg->numOfGroupCols);
    return false;
  }

  // at least one output column, and no more than the column limit
  if (pQueryMsg->numOfOutput <= 0 || pQueryMsg->numOfOutput > TSDB_MAX_COLUMNS) {
    qError("qmsg:%p illegal value of output columns %d", pQueryMsg, pQueryMsg->numOfOutput);
    return false;
  }

  return true;
}

/*
 * Validate the number of source columns/tags carried by a query message.
 *
 * @param pQueryMsg  query message providing numOfCols, numOfTags and numOfOutput
 * @param pExprMsg   array of pQueryMsg->numOfOutput expression messages
 * @return false when either count is negative or their sum exceeds TSDB_MAX_COLUMNS. When
 *         there is no source column and no tag at all, every output expression must be one of:
 *         TSDB_FUNC_TAGPRJ, TSDB_FUNC_TID_TAG on the tbname column, or TSDB_FUNC_COUNT on the
 *         tbname column; otherwise false is returned. true in all other cases.
 */
static bool validateQuerySourceCols(SQueryTableMsg *pQueryMsg, SSqlFuncMsg** pExprMsg) {
  // Sum in 64 bits: the counts come from the wire and have not been range-checked yet, so a
  // narrower sum could overflow and slip past the upper-bound check below.
  int64_t numOfTotal = (int64_t)pQueryMsg->numOfCols + (int64_t)pQueryMsg->numOfTags;

  if (pQueryMsg->numOfCols < 0 || pQueryMsg->numOfTags < 0 || numOfTotal > TSDB_MAX_COLUMNS) {
    qError("qmsg:%p illegal value of numOfCols %d numOfTags:%d", pQueryMsg, pQueryMsg->numOfCols, pQueryMsg->numOfTags);
    return false;
  }

  if (numOfTotal != 0) {
    return true;
  }

  // no source column at all: only a few expressions are acceptable
  for (int32_t i = 0; i < pQueryMsg->numOfOutput; ++i) {
    SSqlFuncMsg* pFuncMsg = pExprMsg[i];

    bool onTbname = (pFuncMsg->colInfo.colId == TSDB_TBNAME_COLUMN_INDEX);
    bool allowed  = (pFuncMsg->functionId == TSDB_FUNC_TAGPRJ) ||
                    (pFuncMsg->functionId == TSDB_FUNC_TID_TAG && onTbname) ||
                    (pFuncMsg->functionId == TSDB_FUNC_COUNT && onTbname);
    if (!allowed) {
      return false;
    }
  }

  return true;
}

5234
/*
 * Build the table id list from the serialized query message.
 *
 * pMsg points at pQueryMsg->numOfTables consecutive STableIdInfo records. Each record is
 * byte-swapped in place (tid: 32 bit, uid/key: 64 bit) and then pushed into a newly created
 * array that is stored in *pTableIdList.
 *
 * @return the position in the message right behind the last table id record
 */
static char *createTableIdList(SQueryTableMsg *pQueryMsg, char *pMsg, SArray **pTableIdList) {
  assert(pQueryMsg->numOfTables > 0);

  *pTableIdList = taosArrayInit(pQueryMsg->numOfTables, sizeof(STableIdInfo));

  STableIdInfo *pRecord = (STableIdInfo *)pMsg;
  for (int32_t n = 0; n < pQueryMsg->numOfTables; ++n, ++pRecord) {
    // convert the record in the message buffer itself before it is copied into the array
    pRecord->tid = htonl(pRecord->tid);
    pRecord->uid = htobe64(pRecord->uid);
    pRecord->key = htobe64(pRecord->key);

    taosArrayPush(*pTableIdList, pRecord);
  }

  return (char *)pRecord;
}
5252

5253
/**
H
hjxilinx 已提交
5254
 * pQueryMsg->head has been converted before this function is called.
5255
 *
H
hjxilinx 已提交
5256
 * @param pQueryMsg
5257 5258 5259 5260
 * @param pTableIdList
 * @param pExpr
 * @return
 */
5261
static int32_t convertQueryMsg(SQueryTableMsg *pQueryMsg, SArray **pTableIdList, SSqlFuncMsg ***pExpr,
weixin_48148422's avatar
weixin_48148422 已提交
5262
                               char **tagCond, char** tbnameCond, SColIndex **groupbyCols, SColumnInfo** tagCols) {
5263 5264
  int32_t code = TSDB_CODE_SUCCESS;

5265 5266 5267 5268 5269 5270 5271 5272
  pQueryMsg->numOfTables = htonl(pQueryMsg->numOfTables);

  pQueryMsg->window.skey = htobe64(pQueryMsg->window.skey);
  pQueryMsg->window.ekey = htobe64(pQueryMsg->window.ekey);
  pQueryMsg->intervalTime = htobe64(pQueryMsg->intervalTime);
  pQueryMsg->slidingTime = htobe64(pQueryMsg->slidingTime);
  pQueryMsg->limit = htobe64(pQueryMsg->limit);
  pQueryMsg->offset = htobe64(pQueryMsg->offset);
H
hjxilinx 已提交
5273

5274 5275
  pQueryMsg->order = htons(pQueryMsg->order);
  pQueryMsg->orderColId = htons(pQueryMsg->orderColId);
H
Haojun Liao 已提交
5276
  pQueryMsg->queryType = htonl(pQueryMsg->queryType);
weixin_48148422's avatar
weixin_48148422 已提交
5277
  pQueryMsg->tagNameRelType = htons(pQueryMsg->tagNameRelType);
5278 5279

  pQueryMsg->numOfCols = htons(pQueryMsg->numOfCols);
5280
  pQueryMsg->numOfOutput = htons(pQueryMsg->numOfOutput);
H
hjxilinx 已提交
5281
  pQueryMsg->numOfGroupCols = htons(pQueryMsg->numOfGroupCols);
5282 5283 5284
  pQueryMsg->tagCondLen = htons(pQueryMsg->tagCondLen);
  pQueryMsg->tsOffset = htonl(pQueryMsg->tsOffset);
  pQueryMsg->tsLen = htonl(pQueryMsg->tsLen);
H
hjxilinx 已提交
5285
  pQueryMsg->tsNumOfBlocks = htonl(pQueryMsg->tsNumOfBlocks);
5286
  pQueryMsg->tsOrder = htonl(pQueryMsg->tsOrder);
5287
  pQueryMsg->numOfTags = htonl(pQueryMsg->numOfTags);
5288

5289
  // query msg safety check
5290
  if (!validateQueryMsg(pQueryMsg)) {
5291 5292
    code = TSDB_CODE_QRY_INVALID_MSG;
    goto _cleanup;
5293 5294
  }

H
hjxilinx 已提交
5295 5296
  char *pMsg = (char *)(pQueryMsg->colList) + sizeof(SColumnInfo) * pQueryMsg->numOfCols;
  for (int32_t col = 0; col < pQueryMsg->numOfCols; ++col) {
5297 5298
    SColumnInfo *pColInfo = &pQueryMsg->colList[col];

H
hjxilinx 已提交
5299
    pColInfo->colId = htons(pColInfo->colId);
5300
    pColInfo->type = htons(pColInfo->type);
H
hjxilinx 已提交
5301 5302
    pColInfo->bytes = htons(pColInfo->bytes);
    pColInfo->numOfFilters = htons(pColInfo->numOfFilters);
5303

H
hjxilinx 已提交
5304
    assert(pColInfo->type >= TSDB_DATA_TYPE_BOOL && pColInfo->type <= TSDB_DATA_TYPE_NCHAR);
5305

H
hjxilinx 已提交
5306
    int32_t numOfFilters = pColInfo->numOfFilters;
5307
    if (numOfFilters > 0) {
H
hjxilinx 已提交
5308
      pColInfo->filters = calloc(numOfFilters, sizeof(SColumnFilterInfo));
5309 5310 5311
    }

    for (int32_t f = 0; f < numOfFilters; ++f) {
5312 5313 5314 5315
      SColumnFilterInfo *pFilterMsg = (SColumnFilterInfo *)pMsg;
      
      SColumnFilterInfo *pColFilter = &pColInfo->filters[f];
      pColFilter->filterstr = htons(pFilterMsg->filterstr);
5316 5317 5318

      pMsg += sizeof(SColumnFilterInfo);

5319 5320
      if (pColFilter->filterstr) {
        pColFilter->len = htobe64(pFilterMsg->len);
5321

5322
        pColFilter->pz = (int64_t) calloc(1, pColFilter->len + 1 * TSDB_NCHAR_SIZE); // note: null-terminator
5323 5324
        memcpy((void *)pColFilter->pz, pMsg, pColFilter->len);
        pMsg += (pColFilter->len + 1);
5325
      } else {
5326 5327
        pColFilter->lowerBndi = htobe64(pFilterMsg->lowerBndi);
        pColFilter->upperBndi = htobe64(pFilterMsg->upperBndi);
5328 5329
      }

5330 5331
      pColFilter->lowerRelOptr = htons(pFilterMsg->lowerRelOptr);
      pColFilter->upperRelOptr = htons(pFilterMsg->upperRelOptr);
5332 5333 5334
    }
  }

5335 5336
  *pExpr = calloc(pQueryMsg->numOfOutput, POINTER_BYTES);
  SSqlFuncMsg *pExprMsg = (SSqlFuncMsg *)pMsg;
5337

5338
  for (int32_t i = 0; i < pQueryMsg->numOfOutput; ++i) {
5339
    (*pExpr)[i] = pExprMsg;
5340

5341
    pExprMsg->colInfo.colIndex = htons(pExprMsg->colInfo.colIndex);
5342 5343 5344 5345
    pExprMsg->colInfo.colId = htons(pExprMsg->colInfo.colId);
    pExprMsg->colInfo.flag = htons(pExprMsg->colInfo.flag);
    pExprMsg->functionId = htons(pExprMsg->functionId);
    pExprMsg->numOfParams = htons(pExprMsg->numOfParams);
5346

5347
    pMsg += sizeof(SSqlFuncMsg);
5348 5349

    for (int32_t j = 0; j < pExprMsg->numOfParams; ++j) {
5350
      pExprMsg->arg[j].argType = htons(pExprMsg->arg[j].argType);
5351 5352 5353 5354
      pExprMsg->arg[j].argBytes = htons(pExprMsg->arg[j].argBytes);

      if (pExprMsg->arg[j].argType == TSDB_DATA_TYPE_BINARY) {
        pExprMsg->arg[j].argValue.pz = pMsg;
5355
        pMsg += pExprMsg->arg[j].argBytes;  // one more for the string terminated char.
5356 5357 5358 5359 5360
      } else {
        pExprMsg->arg[j].argValue.i64 = htobe64(pExprMsg->arg[j].argValue.i64);
      }
    }

H
Haojun Liao 已提交
5361 5362
    int16_t functionId = pExprMsg->functionId;
    if (functionId == TSDB_FUNC_TAG || functionId == TSDB_FUNC_TAGPRJ || functionId == TSDB_FUNC_TAG_DUMMY) {
5363
      if (pExprMsg->colInfo.flag != TSDB_COL_TAG) {  // ignore the column  index check for arithmetic expression.
5364 5365
        code = TSDB_CODE_QRY_INVALID_MSG;
        goto _cleanup;
5366 5367
      }
    } else {
5368
//      if (!validateExprColumnInfo(pQueryMsg, pExprMsg)) {
5369
//        return TSDB_CODE_QRY_INVALID_MSG;
5370
//      }
5371 5372
    }

5373
    pExprMsg = (SSqlFuncMsg *)pMsg;
5374
  }
5375

5376
  if (!validateQuerySourceCols(pQueryMsg, *pExpr)) {
5377
    code = TSDB_CODE_QRY_INVALID_MSG;
dengyihao's avatar
dengyihao 已提交
5378
    goto _cleanup;
5379
  }
5380

H
hjxilinx 已提交
5381
  pMsg = createTableIdList(pQueryMsg, pMsg, pTableIdList);
5382

H
hjxilinx 已提交
5383
  if (pQueryMsg->numOfGroupCols > 0) {  // group by tag columns
5384
    *groupbyCols = malloc(pQueryMsg->numOfGroupCols * sizeof(SColIndex));
5385 5386 5387 5388
    if (*groupbyCols == NULL) {
      code = TSDB_CODE_QRY_OUT_OF_MEMORY;
      goto _cleanup;
    }
5389 5390 5391

    for (int32_t i = 0; i < pQueryMsg->numOfGroupCols; ++i) {
      (*groupbyCols)[i].colId = *(int16_t *)pMsg;
5392
      pMsg += sizeof((*groupbyCols)[i].colId);
5393 5394

      (*groupbyCols)[i].colIndex = *(int16_t *)pMsg;
5395 5396
      pMsg += sizeof((*groupbyCols)[i].colIndex);

5397
      (*groupbyCols)[i].flag = *(int16_t *)pMsg;
5398 5399 5400 5401 5402
      pMsg += sizeof((*groupbyCols)[i].flag);

      memcpy((*groupbyCols)[i].name, pMsg, tListLen(groupbyCols[i]->name));
      pMsg += tListLen((*groupbyCols)[i].name);
    }
5403

H
hjxilinx 已提交
5404 5405
    pQueryMsg->orderByIdx = htons(pQueryMsg->orderByIdx);
    pQueryMsg->orderType = htons(pQueryMsg->orderType);
5406 5407
  }

5408 5409
  pQueryMsg->fillType = htons(pQueryMsg->fillType);
  if (pQueryMsg->fillType != TSDB_FILL_NONE) {
5410
    pQueryMsg->fillVal = (uint64_t)(pMsg);
5411 5412

    int64_t *v = (int64_t *)pMsg;
5413
    for (int32_t i = 0; i < pQueryMsg->numOfOutput; ++i) {
5414 5415
      v[i] = htobe64(v[i]);
    }
5416

5417
    pMsg += sizeof(int64_t) * pQueryMsg->numOfOutput;
5418
  }
5419

5420 5421 5422 5423
  if (pQueryMsg->numOfTags > 0) {
    (*tagCols) = calloc(1, sizeof(SColumnInfo) * pQueryMsg->numOfTags);
    for (int32_t i = 0; i < pQueryMsg->numOfTags; ++i) {
      SColumnInfo* pTagCol = (SColumnInfo*) pMsg;
5424

5425 5426 5427 5428
      pTagCol->colId = htons(pTagCol->colId);
      pTagCol->bytes = htons(pTagCol->bytes);
      pTagCol->type  = htons(pTagCol->type);
      pTagCol->numOfFilters = 0;
5429

5430
      (*tagCols)[i] = *pTagCol;
5431
      pMsg += sizeof(SColumnInfo);
5432
    }
H
hjxilinx 已提交
5433
  }
5434

5435 5436 5437 5438 5439 5440
  // the tag query condition expression string is located at the end of query msg
  if (pQueryMsg->tagCondLen > 0) {
    *tagCond = calloc(1, pQueryMsg->tagCondLen);
    memcpy(*tagCond, pMsg, pQueryMsg->tagCondLen);
    pMsg += pQueryMsg->tagCondLen;
  }
5441

weixin_48148422's avatar
weixin_48148422 已提交
5442
  if (*pMsg != 0) {
5443
    size_t len = strlen(pMsg) + 1;
5444

5445
    *tbnameCond = malloc(len);
5446 5447 5448 5449 5450
    if (*tbnameCond == NULL) {
      code = TSDB_CODE_QRY_OUT_OF_MEMORY;
      goto _cleanup;
    }

weixin_48148422's avatar
weixin_48148422 已提交
5451
    strcpy(*tbnameCond, pMsg);
5452
    pMsg += len;
weixin_48148422's avatar
weixin_48148422 已提交
5453
  }
5454

5455
  qDebug("qmsg:%p query %d tables, type:%d, qrange:%" PRId64 "-%" PRId64 ", numOfGroupbyTagCols:%d, order:%d, "
H
Haojun Liao 已提交
5456 5457
         "outputCols:%d, numOfCols:%d, interval:%" PRId64 ", fillType:%d, comptsLen:%d, compNumOfBlocks:%d, limit:%" PRId64 ", offset:%" PRId64,
         pQueryMsg, pQueryMsg->numOfTables, pQueryMsg->queryType, pQueryMsg->window.skey, pQueryMsg->window.ekey, pQueryMsg->numOfGroupCols,
5458
         pQueryMsg->order, pQueryMsg->numOfOutput, pQueryMsg->numOfCols, pQueryMsg->intervalTime,
H
Haojun Liao 已提交
5459
         pQueryMsg->fillType, pQueryMsg->tsLen, pQueryMsg->tsNumOfBlocks, pQueryMsg->limit, pQueryMsg->offset);
5460 5461

  return TSDB_CODE_SUCCESS;
dengyihao's avatar
dengyihao 已提交
5462 5463 5464 5465 5466 5467 5468 5469 5470

_cleanup:
  tfree(*pExpr);
  taosArrayDestroy(*pTableIdList);
  *pTableIdList = NULL;
  tfree(*tbnameCond);
  tfree(*groupbyCols);
  tfree(*tagCols);
  tfree(*tagCond);
5471 5472

  return code;
5473 5474
}

H
hjxilinx 已提交
5475
/*
 * Decode the serialized arithmetic expression stored in the first argument of
 * the function message (arg[0].argValue.pz, arg[0].argBytes long) and attach
 * the resulting expression tree to pArithExprInfo->pExpr.
 *
 * Returns TSDB_CODE_SUCCESS when a tree was produced, the code handed to the
 * CATCH handler when decoding raised one, or TSDB_CODE_QRY_APP_ERROR when the
 * decoder returned no tree.
 */
static int32_t buildAirthmeticExprFromMsg(SExprInfo *pArithExprInfo, SQueryTableMsg *pQueryMsg) {
  qDebug("qmsg:%p create arithmetic expr from binary string: %s", pQueryMsg, pArithExprInfo->base.arg[0].argValue.pz);

  tExprNode* pRoot = NULL;
  TRY(TSDB_MAX_TAGS) {
    pRoot = exprTreeFromBinary(pArithExprInfo->base.arg[0].argValue.pz, pArithExprInfo->base.arg[0].argBytes);
  } CATCH( code ) {
    CLEANUP_EXECUTE();
    qError("qmsg:%p failed to create arithmetic expression string from:%s, reason: %s", pQueryMsg, pArithExprInfo->base.arg[0].argValue.pz, tstrerror(code));
    return code;
  } END_TRY

  if (pRoot != NULL) {
    // hand the tree over to the expression descriptor
    pArithExprInfo->pExpr = pRoot;
    return TSDB_CODE_SUCCESS;
  }

  qError("qmsg:%p failed to create arithmetic expression string from:%s", pQueryMsg, pArithExprInfo->base.arg[0].argValue.pz);
  return TSDB_CODE_QRY_APP_ERROR;
}

H
Haojun Liao 已提交
5496
static int32_t createQFunctionExprFromMsg(SQueryTableMsg *pQueryMsg, SExprInfo **pExprInfo, SSqlFuncMsg **pExprMsg,
5497 5498
    SColumnInfo* pTagCols) {
  *pExprInfo = NULL;
H
hjxilinx 已提交
5499
  int32_t code = TSDB_CODE_SUCCESS;
5500

H
Haojun Liao 已提交
5501
  SExprInfo *pExprs = (SExprInfo *)calloc(pQueryMsg->numOfOutput, sizeof(SExprInfo));
5502
  if (pExprs == NULL) {
5503
    return TSDB_CODE_QRY_OUT_OF_MEMORY;
5504 5505 5506 5507 5508
  }

  bool    isSuperTable = QUERY_IS_STABLE_QUERY(pQueryMsg->queryType);
  int16_t tagLen = 0;

5509
  for (int32_t i = 0; i < pQueryMsg->numOfOutput; ++i) {
5510
    pExprs[i].base = *pExprMsg[i];
5511
    pExprs[i].bytes = 0;
5512 5513 5514 5515

    int16_t type = 0;
    int16_t bytes = 0;

5516
    // parse the arithmetic expression
5517
    if (pExprs[i].base.functionId == TSDB_FUNC_ARITHM) {
5518
      code = buildAirthmeticExprFromMsg(&pExprs[i], pQueryMsg);
5519

5520 5521 5522
      if (code != TSDB_CODE_SUCCESS) {
        tfree(pExprs);
        return code;
5523 5524
      }

5525
      type  = TSDB_DATA_TYPE_DOUBLE;
5526
      bytes = tDataTypeDesc[type].nSize;
H
Haojun Liao 已提交
5527
    } else if (pExprs[i].base.colInfo.colId == TSDB_TBNAME_COLUMN_INDEX && pExprs[i].base.functionId == TSDB_FUNC_TAGPRJ) {  // parse the normal column
H
Haojun Liao 已提交
5528 5529 5530
      SSchema s = tGetTableNameColumnSchema();
      type  = s.type;
      bytes = s.bytes;
B
Bomin Zhang 已提交
5531
    } else{
5532
      int32_t j = getColumnIndexInSource(pQueryMsg, &pExprs[i].base, pTagCols);
dengyihao's avatar
dengyihao 已提交
5533
      assert(j < pQueryMsg->numOfCols || j < pQueryMsg->numOfTags);
H
Haojun Liao 已提交
5534

dengyihao's avatar
dengyihao 已提交
5535
      if (pExprs[i].base.colInfo.colId != TSDB_TBNAME_COLUMN_INDEX && j >= 0) {
H
Haojun Liao 已提交
5536 5537 5538 5539
        SColumnInfo* pCol = (TSDB_COL_IS_TAG(pExprs[i].base.colInfo.flag))? &pTagCols[j]:&pQueryMsg->colList[j];
        type = pCol->type;
        bytes = pCol->bytes;
      } else {
H
Haojun Liao 已提交
5540
        SSchema s = tGetTableNameColumnSchema();
H
hjxilinx 已提交
5541

H
Haojun Liao 已提交
5542 5543 5544
        type  = s.type;
        bytes = s.bytes;
      }
5545 5546
    }

5547 5548
    int32_t param = pExprs[i].base.arg[0].argValue.i64;
    if (getResultDataInfo(type, bytes, pExprs[i].base.functionId, param, &pExprs[i].type, &pExprs[i].bytes,
5549
                          &pExprs[i].interBytes, 0, isSuperTable) != TSDB_CODE_SUCCESS) {
5550
      tfree(pExprs);
5551
      return TSDB_CODE_QRY_INVALID_MSG;
5552 5553
    }

5554
    if (pExprs[i].base.functionId == TSDB_FUNC_TAG_DUMMY || pExprs[i].base.functionId == TSDB_FUNC_TS_DUMMY) {
5555
      tagLen += pExprs[i].bytes;
5556
    }
5557
    assert(isValidDataType(pExprs[i].type));
5558 5559 5560
  }

  // TODO refactor
5561
  for (int32_t i = 0; i < pQueryMsg->numOfOutput; ++i) {
5562 5563
    pExprs[i].base = *pExprMsg[i];
    int16_t functId = pExprs[i].base.functionId;
5564

5565
    if (functId == TSDB_FUNC_TOP || functId == TSDB_FUNC_BOTTOM) {
5566
      int32_t j = getColumnIndexInSource(pQueryMsg, &pExprs[i].base, pTagCols);
5567 5568 5569 5570 5571
      assert(j < pQueryMsg->numOfCols);

      SColumnInfo *pCol = &pQueryMsg->colList[j];

      int32_t ret =
5572
          getResultDataInfo(pCol->type, pCol->bytes, functId, pExprs[i].base.arg[0].argValue.i64,
5573
                            &pExprs[i].type, &pExprs[i].bytes, &pExprs[i].interBytes, tagLen, isSuperTable);
5574 5575 5576
      assert(ret == TSDB_CODE_SUCCESS);
    }
  }
5577
  *pExprInfo = pExprs;
5578 5579 5580 5581

  return TSDB_CODE_SUCCESS;
}

5582
/*
 * Create the group-by descriptor for a query message.
 *
 * pQueryMsg  query message; numOfGroupCols, orderType and orderByIdx are read
 * pColIndex  array of numOfGroupCols column indexes, copied into the result
 * code       [out] set to TSDB_CODE_QRY_OUT_OF_MEMORY when an allocation fails;
 *            left untouched otherwise
 *
 * Returns NULL when the query has no group-by columns or when an allocation
 * fails, otherwise a heap-allocated descriptor.
 */
static SSqlGroupbyExpr *createGroupbyExprFromMsg(SQueryTableMsg *pQueryMsg, SColIndex *pColIndex, int32_t *code) {
  if (pQueryMsg->numOfGroupCols == 0) {
    return NULL;
  }

  // using group by tag columns
  SSqlGroupbyExpr *pGroupbyExpr = (SSqlGroupbyExpr *)calloc(1, sizeof(SSqlGroupbyExpr));
  if (pGroupbyExpr == NULL) {
    *code = TSDB_CODE_QRY_OUT_OF_MEMORY;
    return NULL;
  }

  pGroupbyExpr->numOfGroupCols = pQueryMsg->numOfGroupCols;
  pGroupbyExpr->orderType = pQueryMsg->orderType;
  pGroupbyExpr->orderIndex = pQueryMsg->orderByIdx;

  pGroupbyExpr->columnInfo = taosArrayInit(pQueryMsg->numOfGroupCols, sizeof(SColIndex));
  if (pGroupbyExpr->columnInfo == NULL) {
    // do not push into a missing array; report the failure like the allocation above
    tfree(pGroupbyExpr);
    *code = TSDB_CODE_QRY_OUT_OF_MEMORY;
    return NULL;
  }

  for(int32_t i = 0; i < pQueryMsg->numOfGroupCols; ++i) {
    taosArrayPush(pGroupbyExpr->columnInfo, &pColIndex[i]);
  }

  return pGroupbyExpr;
}

5606
/*
 * Build pQuery->pFilterInfo from the filter conditions attached to the columns
 * in pQuery->colList.
 *
 * One SSingleColumnFilterInfo is created for every column that has at least one
 * filter; for every filter the matching range or value comparison callback is
 * selected from the lower/upper relational operators.
 *
 * pQInfo  only used as an identifier in log messages
 * pQuery  query whose colList is read and whose numOfFilterCols/pFilterInfo are written
 *
 * Returns TSDB_CODE_SUCCESS, TSDB_CODE_QRY_OUT_OF_MEMORY when an allocation
 * fails, or TSDB_CODE_QRY_INVALID_MSG for an invalid filter description. On
 * failure the partially built pFilterInfo stays attached to pQuery.
 */
static int32_t createFilterInfo(void *pQInfo, SQuery *pQuery) {
  for (int32_t i = 0; i < pQuery->numOfCols; ++i) {
    if (pQuery->colList[i].numOfFilters > 0) {
      pQuery->numOfFilterCols++;
    }
  }

  if (pQuery->numOfFilterCols == 0) {
    return TSDB_CODE_SUCCESS;
  }

  pQuery->pFilterInfo = calloc(pQuery->numOfFilterCols, sizeof(SSingleColumnFilterInfo));
  if (pQuery->pFilterInfo == NULL) {
    // keep the counter consistent with the missing array so that later cleanup does not walk it
    pQuery->numOfFilterCols = 0;
    return TSDB_CODE_QRY_OUT_OF_MEMORY;
  }

  for (int32_t i = 0, j = 0; i < pQuery->numOfCols; ++i) {
    if (pQuery->colList[i].numOfFilters > 0) {
      SSingleColumnFilterInfo *pFilterInfo = &pQuery->pFilterInfo[j];

      pFilterInfo->info = pQuery->colList[i];

      pFilterInfo->numOfFilters = pQuery->colList[i].numOfFilters;
      pFilterInfo->pFilters = calloc(pFilterInfo->numOfFilters, sizeof(SColumnFilterElem));
      if (pFilterInfo->pFilters == NULL) {
        pFilterInfo->numOfFilters = 0;
        return TSDB_CODE_QRY_OUT_OF_MEMORY;
      }

      for (int32_t f = 0; f < pFilterInfo->numOfFilters; ++f) {
        SColumnFilterElem *pSingleColFilter = &pFilterInfo->pFilters[f];
        pSingleColFilter->filterInfo = pQuery->colList[i].filters[f];

        int32_t lower = pSingleColFilter->filterInfo.lowerRelOptr;
        int32_t upper = pSingleColFilter->filterInfo.upperRelOptr;

        if (lower == TSDB_RELATION_INVALID && upper == TSDB_RELATION_INVALID) {
          qError("QInfo:%p invalid filter info", pQInfo);
          return TSDB_CODE_QRY_INVALID_MSG;
        }

        int16_t type  = pQuery->colList[i].type;
        int16_t bytes = pQuery->colList[i].bytes;

        // todo refactor
        __filter_func_t *rangeFilterArray = getRangeFilterFuncArray(type);
        __filter_func_t *filterArray = getValueFilterFuncArray(type);

        if (rangeFilterArray == NULL && filterArray == NULL) {
          qError("QInfo:%p failed to get filter function, invalid data type:%d", pQInfo, type);
          return TSDB_CODE_QRY_INVALID_MSG;
        }

        if ((lower == TSDB_RELATION_GREATER_EQUAL || lower == TSDB_RELATION_GREATER) &&
            (upper == TSDB_RELATION_LESS_EQUAL || upper == TSDB_RELATION_LESS)) {
          // both bounds present: pick the range callback matching the open/closed ends
          assert(rangeFilterArray != NULL);
          if (lower == TSDB_RELATION_GREATER_EQUAL) {
            if (upper == TSDB_RELATION_LESS_EQUAL) {
              pSingleColFilter->fp = rangeFilterArray[4];
            } else {
              pSingleColFilter->fp = rangeFilterArray[2];
            }
          } else {
            if (upper == TSDB_RELATION_LESS_EQUAL) {
              pSingleColFilter->fp = rangeFilterArray[3];
            } else {
              pSingleColFilter->fp = rangeFilterArray[1];
            }
          }
        } else {  // set callback filter function
          assert(filterArray != NULL);
          if (lower != TSDB_RELATION_INVALID) {
            pSingleColFilter->fp = filterArray[lower];

            if (upper != TSDB_RELATION_INVALID) {
              qError("pQInfo:%p failed to get filter function, invalid filter condition: %d", pQInfo, type);
              return TSDB_CODE_QRY_INVALID_MSG;
            }
          } else {
            pSingleColFilter->fp = filterArray[upper];
          }
        }
        assert(pSingleColFilter->fp != NULL);
        pSingleColFilter->bytes = bytes;
      }

      j++;
    }
  }

  return TSDB_CODE_SUCCESS;
}

5693
/*
 * Rewrite colInfo.colIndex of every output expression so that it is the
 * position of the referenced column inside pQuery->colList (normal columns) or
 * pQuery->tagColList (tag columns). Arithmetic expressions are skipped.
 *
 * A column id that cannot be found trips an assertion, except for the table
 * name pseudo column (TSDB_TBNAME_COLUMN_INDEX) among the tags.
 */
static void doUpdateExprColumnIndex(SQuery *pQuery) {
  // test the pointer itself before the member that is read through it
  assert(pQuery != NULL && pQuery->pSelectExpr != NULL);

  for (int32_t k = 0; k < pQuery->numOfOutput; ++k) {
    SSqlFuncMsg *pSqlExprMsg = &pQuery->pSelectExpr[k].base;
    if (pSqlExprMsg->functionId == TSDB_FUNC_ARITHM) {
      continue;
    }

    // todo opt performance
    SColIndex *pColIndex = &pSqlExprMsg->colInfo;
    if (!TSDB_COL_IS_TAG(pColIndex->flag)) {
      int32_t f = 0;
      for (f = 0; f < pQuery->numOfCols; ++f) {
        if (pColIndex->colId == pQuery->colList[f].colId) {
          pColIndex->colIndex = f;
          break;
        }
      }

      assert (f < pQuery->numOfCols);
    } else {
      int32_t f = 0;
      for (f = 0; f < pQuery->numOfTags; ++f) {
        if (pColIndex->colId == pQuery->tagColList[f].colId) {
          pColIndex->colIndex = f;
          break;
        }
      }

      assert(f < pQuery->numOfTags || pColIndex->colId == TSDB_TBNAME_COLUMN_INDEX);
    }
  }
}

5728
/*
 * Three-way comparison of two STableIdInfo entries by uid: returns 1, -1 or 0
 * when the first uid is greater than, less than or equal to the second.
 */
static int compareTableIdInfo(const void* a, const void* b) {
  const STableIdInfo* lhs = (const STableIdInfo*)a;
  const STableIdInfo* rhs = (const STableIdInfo*)b;

  if (lhs->uid > rhs->uid) {
    return 1;
  }

  return (lhs->uid < rhs->uid) ? -1 : 0;
}

dengyihao's avatar
dengyihao 已提交
5736 5737
// forward declaration: createQInfoImpl() calls freeQInfo() on its failure path before the definition appears
static void freeQInfo(SQInfo *pQInfo);

H
Haojun Liao 已提交
5738 5739 5740 5741 5742
/*
 * Choose the capacity (in rows) of the result buffer and the row threshold
 * derived from it, and store both in pQuery->rec.
 *
 * Projection queries get enough rows to fill RESULT_MSG_MIN_SIZE bytes, but
 * never fewer than RESULT_MSG_MIN_ROWS; every other query uses 4096 rows.
 * The threshold is the capacity scaled by RESULT_THRESHOLD_RATIO.
 */
static void calResultBufSize(SQuery* pQuery) {
  const int32_t RESULT_MSG_MIN_SIZE  = 1024 * (1024 + 512);  // bytes
  const int32_t RESULT_MSG_MIN_ROWS  = 8192;
  const float RESULT_THRESHOLD_RATIO = 0.85;

  if (!isProjQuery(pQuery)) {
    // in case of non-prj query, a smaller output buffer will be used.
    pQuery->rec.capacity = 4096;
    pQuery->rec.threshold = pQuery->rec.capacity * RESULT_THRESHOLD_RATIO;
    return;
  }

  int32_t rows = RESULT_MSG_MIN_SIZE / pQuery->rowSize;
  rows = (rows < RESULT_MSG_MIN_ROWS) ? RESULT_MSG_MIN_ROWS : rows;

  pQuery->rec.capacity  = rows;
  pQuery->rec.threshold = rows * RESULT_THRESHOLD_RATIO;
}

weixin_48148422's avatar
weixin_48148422 已提交
5757
static SQInfo *createQInfoImpl(SQueryTableMsg *pQueryMsg, SArray* pTableIdList, SSqlGroupbyExpr *pGroupbyExpr, SExprInfo *pExprs,
5758
                               STableGroupInfo *pTableGroupInfo, SColumnInfo* pTagCols) {
B
Bomin Zhang 已提交
5759 5760 5761
  int16_t numOfCols = pQueryMsg->numOfCols;
  int16_t numOfOutput = pQueryMsg->numOfOutput;

5762 5763
  SQInfo *pQInfo = (SQInfo *)calloc(1, sizeof(SQInfo));
  if (pQInfo == NULL) {
B
Bomin Zhang 已提交
5764
    goto _cleanup_qinfo;
5765
  }
5766

B
Bomin Zhang 已提交
5767 5768 5769
  // to make sure third party won't overwrite this structure
  pQInfo->signature = pQInfo;
  pQInfo->tableGroupInfo = *pTableGroupInfo;
5770 5771

  SQuery *pQuery = calloc(1, sizeof(SQuery));
B
Bomin Zhang 已提交
5772 5773 5774
  if (pQuery == NULL) {
    goto _cleanup_query;
  }
5775 5776
  pQInfo->runtimeEnv.pQuery = pQuery;

5777
  pQuery->numOfCols       = numOfCols;
H
hjxilinx 已提交
5778
  pQuery->numOfOutput     = numOfOutput;
5779 5780 5781
  pQuery->limit.limit     = pQueryMsg->limit;
  pQuery->limit.offset    = pQueryMsg->offset;
  pQuery->order.order     = pQueryMsg->order;
5782
  pQuery->order.orderColId = pQueryMsg->orderColId;
5783 5784 5785 5786
  pQuery->pSelectExpr     = pExprs;
  pQuery->pGroupbyExpr    = pGroupbyExpr;
  pQuery->intervalTime    = pQueryMsg->intervalTime;
  pQuery->slidingTime     = pQueryMsg->slidingTime;
5787
  pQuery->slidingTimeUnit = pQueryMsg->slidingTimeUnit;
5788
  pQuery->fillType        = pQueryMsg->fillType;
5789
  pQuery->numOfTags       = pQueryMsg->numOfTags;
B
Bomin Zhang 已提交
5790
  pQuery->tagColList      = pTagCols;
H
Haojun Liao 已提交
5791

5792
  pQuery->colList = calloc(numOfCols, sizeof(SSingleColumnFilterInfo));
5793
  if (pQuery->colList == NULL) {
5794
    goto _cleanup;
5795
  }
5796

H
hjxilinx 已提交
5797
  for (int16_t i = 0; i < numOfCols; ++i) {
5798
    pQuery->colList[i] = pQueryMsg->colList[i];
5799
    pQuery->colList[i].filters = tscFilterInfoClone(pQueryMsg->colList[i].filters, pQuery->colList[i].numOfFilters);
H
hjxilinx 已提交
5800
  }
5801

5802
  // calculate the result row size
5803 5804 5805
  for (int16_t col = 0; col < numOfOutput; ++col) {
    assert(pExprs[col].bytes > 0);
    pQuery->rowSize += pExprs[col].bytes;
5806
  }
5807

5808
  doUpdateExprColumnIndex(pQuery);
5809

5810
  int32_t ret = createFilterInfo(pQInfo, pQuery);
5811
  if (ret != TSDB_CODE_SUCCESS) {
5812
    goto _cleanup;
5813 5814 5815
  }

  // prepare the result buffer
5816
  pQuery->sdata = (tFilePage **)calloc(pQuery->numOfOutput, POINTER_BYTES);
5817
  if (pQuery->sdata == NULL) {
5818
    goto _cleanup;
5819 5820
  }

H
Haojun Liao 已提交
5821
  calResultBufSize(pQuery);
5822

5823
  for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
5824
    assert(pExprs[col].interBytes >= pExprs[col].bytes);
5825 5826

    // allocate additional memory for interResults that are usually larger then final results
5827 5828
    size_t size = (pQuery->rec.capacity + 1) * pExprs[col].bytes + pExprs[col].interBytes + sizeof(tFilePage);
    pQuery->sdata[col] = (tFilePage *)calloc(1, size);
5829
    if (pQuery->sdata[col] == NULL) {
5830
      goto _cleanup;
5831 5832 5833
    }
  }

5834
  if (pQuery->fillType != TSDB_FILL_NONE) {
5835 5836
    pQuery->fillVal = malloc(sizeof(int64_t) * pQuery->numOfOutput);
    if (pQuery->fillVal == NULL) {
5837
      goto _cleanup;
5838 5839 5840
    }

    // the first column is the timestamp
5841
    memcpy(pQuery->fillVal, (char *)pQueryMsg->fillVal, pQuery->numOfOutput * sizeof(int64_t));
5842 5843
  }

dengyihao's avatar
dengyihao 已提交
5844 5845 5846 5847 5848 5849
  size_t numOfGroups = 0;
  if (pTableGroupInfo->pGroupList != NULL) {
    numOfGroups = taosArrayGetSize(pTableGroupInfo->pGroupList);

    pQInfo->tableqinfoGroupInfo.pGroupList = taosArrayInit(numOfGroups, POINTER_BYTES);
    pQInfo->tableqinfoGroupInfo.numOfTables = pTableGroupInfo->numOfTables;
H
Haojun Liao 已提交
5850 5851 5852
    pQInfo->tableqinfoGroupInfo.map = taosHashInit(pTableGroupInfo->numOfTables,
                                                   taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false);
  }
5853

weixin_48148422's avatar
weixin_48148422 已提交
5854 5855
  int tableIndex = 0;
  STimeWindow window = pQueryMsg->window;
5856
  taosArraySort(pTableIdList, compareTableIdInfo);
5857

H
Haojun Liao 已提交
5858
  pQInfo->runtimeEnv.interBufSize = getOutputInterResultBufSize(pQuery);
H
Haojun Liao 已提交
5859 5860 5861
  pQInfo->pBuf = calloc(pTableGroupInfo->numOfTables, sizeof(STableQueryInfo));
  int32_t index = 0;

H
hjxilinx 已提交
5862
  for(int32_t i = 0; i < numOfGroups; ++i) {
5863
    SArray* pa = taosArrayGetP(pTableGroupInfo->pGroupList, i);
5864

H
Haojun Liao 已提交
5865
    size_t s = taosArrayGetSize(pa);
5866
    SArray* p1 = taosArrayInit(s, POINTER_BYTES);
B
Bomin Zhang 已提交
5867 5868 5869
    if (p1 == NULL) {
      goto _cleanup;
    }
5870

H
hjxilinx 已提交
5871
    for(int32_t j = 0; j < s; ++j) {
5872
      void* pTable = taosArrayGetP(pa, j);
H
Haojun Liao 已提交
5873
      STableId* id = TSDB_TABLEID(pTable);
5874

H
Haojun Liao 已提交
5875
      STableIdInfo* pTableId = taosArraySearch(pTableIdList, id, compareTableIdInfo);
weixin_48148422's avatar
weixin_48148422 已提交
5876 5877 5878
      if (pTableId != NULL ) {
        window.skey = pTableId->key;
      } else {
B
Bomin Zhang 已提交
5879
        window.skey = pQueryMsg->window.skey;
weixin_48148422's avatar
weixin_48148422 已提交
5880
      }
5881

H
Haojun Liao 已提交
5882 5883
      void* buf = pQInfo->pBuf + index * sizeof(STableQueryInfo);
      STableQueryInfo* item = createTableQueryInfo(&pQInfo->runtimeEnv, pTable, window, buf);
B
Bomin Zhang 已提交
5884 5885 5886
      if (item == NULL) {
        goto _cleanup;
      }
5887
      item->groupIndex = i;
H
hjxilinx 已提交
5888
      taosArrayPush(p1, &item);
H
Haojun Liao 已提交
5889 5890
      taosHashPut(pQInfo->tableqinfoGroupInfo.map, &id->tid, sizeof(id->tid), &item, POINTER_BYTES);
      index += 1;
H
hjxilinx 已提交
5891
    }
5892

5893
    taosArrayPush(pQInfo->tableqinfoGroupInfo.pGroupList, &p1);
H
hjxilinx 已提交
5894
  }
5895

weixin_48148422's avatar
weixin_48148422 已提交
5896 5897
  pQInfo->arrTableIdInfo = taosArrayInit(tableIndex, sizeof(STableIdInfo));

5898
  pQuery->pos = -1;
5899
  pQuery->window = pQueryMsg->window;
5900

5901
  if (sem_init(&pQInfo->dataReady, 0, 0) != 0) {
5902 5903
    int32_t code = TAOS_SYSTEM_ERROR(errno);
    qError("QInfo:%p init dataReady sem failed, reason:%s", pQInfo, tstrerror(code));
5904
    goto _cleanup;
5905
  }
5906

5907
  colIdCheck(pQuery);
5908

5909
  qDebug("qmsg:%p QInfo:%p created", pQueryMsg, pQInfo);
5910 5911
  return pQInfo;

B
Bomin Zhang 已提交
5912
_cleanup_qinfo:
H
Haojun Liao 已提交
5913
  tsdbDestroyTableGroup(pTableGroupInfo);
B
Bomin Zhang 已提交
5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926

_cleanup_query:
  taosArrayDestroy(pGroupbyExpr->columnInfo);
  tfree(pGroupbyExpr);
  tfree(pTagCols);
  for (int32_t i = 0; i < numOfOutput; ++i) {
    SExprInfo* pExprInfo = &pExprs[i];
    if (pExprInfo->pExpr != NULL) {
      tExprTreeDestroy(&pExprInfo->pExpr, NULL);
    }
  }
  tfree(pExprs);

5927
_cleanup:
dengyihao's avatar
dengyihao 已提交
5928
  freeQInfo(pQInfo);
5929 5930 5931
  return NULL;
}

H
hjxilinx 已提交
5932
static bool isValidQInfo(void *param) {
H
hjxilinx 已提交
5933 5934 5935 5936
  SQInfo *pQInfo = (SQInfo *)param;
  if (pQInfo == NULL) {
    return false;
  }
5937

H
hjxilinx 已提交
5938 5939 5940 5941
  /*
   * pQInfo->signature may be changed by another thread, so we assign value of signature
   * into local variable, then compare by using local variable
   */
5942
  uint64_t sig = (uint64_t)pQInfo->signature;
H
hjxilinx 已提交
5943 5944 5945
  return (sig == (uint64_t)pQInfo);
}

H
Haojun Liao 已提交
5946
/*
 * Finish the initialization of a freshly created SQInfo.
 *
 * pQueryMsg  query message; the optional compressed timestamp blocks are read from it
 * tsdb       storage handle forwarded to doInitQInfo()
 * vgId       vnode group id forwarded to doInitQInfo()
 * pQInfo     query handle to initialize
 * isSTable   forwarded to doInitQInfo()
 * param      opaque value stored in pQInfo->param
 *
 * When the time window is empty for the scan order, or no table qualifies, the
 * query is marked completed, dataReady is posted and TSDB_CODE_SUCCESS is
 * returned without calling doInitQInfo(). If doInitQInfo() fails, pQInfo is
 * freed here and the error code is returned.
 */
static int32_t initQInfo(SQueryTableMsg *pQueryMsg, void *tsdb, int32_t vgId, SQInfo *pQInfo, bool isSTable, void* param) {
  int32_t code = TSDB_CODE_SUCCESS;
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  if ((QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.skey > pQuery->window.ekey)) ||
      (!QUERY_IS_ASC_QUERY(pQuery) && (pQuery->window.ekey > pQuery->window.skey))) {
    qDebug("QInfo:%p no result in time range %" PRId64 "-%" PRId64 ", order %d", pQInfo, pQuery->window.skey,
           pQuery->window.ekey, pQuery->order.order);
    setQueryStatus(pQuery, QUERY_COMPLETED);
    pQInfo->tableqinfoGroupInfo.numOfTables = 0;

    sem_post(&pQInfo->dataReady);
    return TSDB_CODE_SUCCESS;
  }

  pQInfo->param = param;

  if (pQInfo->tableqinfoGroupInfo.numOfTables == 0) {
    qDebug("QInfo:%p no table qualified for tag filter, abort query", pQInfo);
    setQueryStatus(pQuery, QUERY_COMPLETED);

    sem_post(&pQInfo->dataReady);
    return TSDB_CODE_SUCCESS;
  }

  // The timestamp buffer is only consumed by doInitQInfo(); it is created after the
  // early exits above so that those paths do not leave an unreferenced buffer behind.
  STSBuf *pTSBuf = NULL;
  if (pQueryMsg->tsLen > 0) {  // open new file to save the result
    char *tsBlock = (char *) pQueryMsg + pQueryMsg->tsOffset;
    pTSBuf = tsBufCreateFromCompBlocks(tsBlock, pQueryMsg->tsNumOfBlocks, pQueryMsg->tsLen, pQueryMsg->tsOrder);

    tsBufResetPos(pTSBuf);
    bool ret = tsBufNextPos(pTSBuf);
    UNUSED(ret);
  }

  // filter the qualified
  if ((code = doInitQInfo(pQInfo, pTSBuf, tsdb, vgId, isSTable)) != TSDB_CODE_SUCCESS) {
    goto _error;
  }

  return code;

_error:
  // table query ref will be decrease during error handling
  freeQInfo(pQInfo);
  return code;
}

B
Bomin Zhang 已提交
5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005
/*
 * Release an array of numOfFilters column filters: for every entry flagged as
 * a string filter (filterstr) the buffer referenced by pz is freed, then the
 * array itself. A NULL array is ignored.
 */
static void freeColumnFilterInfo(SColumnFilterInfo* pFilter, int32_t numOfFilters) {
  if (pFilter != NULL) {
    int32_t idx = 0;
    while (idx < numOfFilters) {
      if (pFilter[idx].filterstr) {
        free((void*)(pFilter[idx].pz));
      }
      ++idx;
    }

    free(pFilter);
  }
}

H
hjxilinx 已提交
6006 6007 6008 6009
/*
 * Release a query handle and everything reachable from it: result pages, the
 * runtime environment, filter descriptors, output expressions (including
 * arithmetic expression trees), fill values, per-table query info, the table
 * group, the group-by descriptor, tag/column lists and finally the SQuery and
 * SQInfo objects themselves.
 *
 * The call is a no-op when pQInfo is NULL or its signature does not match.
 * createQInfoImpl() calls this on partially constructed objects, so members
 * that may not have been allocated yet (the query object, its result page
 * table, its filter array) are checked before they are walked.
 */
static void freeQInfo(SQInfo *pQInfo) {
  if (!isValidQInfo(pQInfo)) {
    return;
  }

  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;
  setQueryKilled(pQInfo);

  qDebug("QInfo:%p start to free QInfo", pQInfo);
  if (pQuery != NULL && pQuery->sdata != NULL) {
    for (int32_t col = 0; col < pQuery->numOfOutput; ++col) {
      tfree(pQuery->sdata[col]);
    }
  }

  sem_destroy(&(pQInfo->dataReady));
  // NOTE(review): still invoked when pQuery is NULL - confirm teardownQueryRuntimeEnv() tolerates that
  teardownQueryRuntimeEnv(&pQInfo->runtimeEnv);

  if (pQuery != NULL) {
    if (pQuery->pFilterInfo != NULL) {
      for (int32_t i = 0; i < pQuery->numOfFilterCols; ++i) {
        SSingleColumnFilterInfo *pColFilter = &pQuery->pFilterInfo[i];
        if (pColFilter->numOfFilters > 0) {
          tfree(pColFilter->pFilters);
        }
      }
    }

    if (pQuery->pSelectExpr != NULL) {
      for (int32_t i = 0; i < pQuery->numOfOutput; ++i) {
        SExprInfo* pExprInfo = &pQuery->pSelectExpr[i];

        if (pExprInfo->pExpr != NULL) {
          tExprTreeDestroy(&pExprInfo->pExpr, NULL);
        }
      }

      tfree(pQuery->pSelectExpr);
    }

    if (pQuery->fillVal != NULL) {
      tfree(pQuery->fillVal);
    }
  }

  // todo refactor, extract method to destroytableDataInfo
  if (pQInfo->tableqinfoGroupInfo.pGroupList != NULL) {
    int32_t numOfGroups = GET_NUM_OF_TABLEGROUP(pQInfo);
    for (int32_t i = 0; i < numOfGroups; ++i) {
      SArray *p = GET_TABLEGROUP(pQInfo, i);

      size_t num = taosArrayGetSize(p);
      for(int32_t j = 0; j < num; ++j) {
        STableQueryInfo* item = taosArrayGetP(p, j);
        destroyTableQueryInfo(item);
      }

      taosArrayDestroy(p);
    }
  }

  tfree(pQInfo->pBuf);
  taosArrayDestroy(pQInfo->tableqinfoGroupInfo.pGroupList);
  taosHashCleanup(pQInfo->tableqinfoGroupInfo.map);
  tsdbDestroyTableGroup(&pQInfo->tableGroupInfo);
  taosArrayDestroy(pQInfo->arrTableIdInfo);

  if (pQuery != NULL) {
    if (pQuery->pGroupbyExpr != NULL) {
      taosArrayDestroy(pQuery->pGroupbyExpr->columnInfo);
      tfree(pQuery->pGroupbyExpr);
    }

    tfree(pQuery->tagColList);
    tfree(pQuery->pFilterInfo);

    if (pQuery->colList != NULL) {
      for (int32_t i = 0; i < pQuery->numOfCols; i++) {
        SColumnInfo* column = pQuery->colList + i;
        freeColumnFilterInfo(column->filters, column->numOfFilters);
      }
      tfree(pQuery->colList);
    }

    tfree(pQuery->sdata);
    tfree(pQuery);
  }

  // invalidate the handle so that a stale pointer no longer passes isValidQInfo()
  pQInfo->signature = 0;

  qDebug("QInfo:%p QInfo is freed", pQInfo);

  tfree(pQInfo);
}

H
hjxilinx 已提交
6092
/*
 * Compute the number of bytes of the result to send back.
 *
 * For a ts-comp query with a positive row count, the size of the file named by
 * pQuery->sdata[0]->data is returned and *numOfRows is overwritten with that
 * size; 0 is returned when the file cannot be inspected. For every other case
 * the size is rowSize * (*numOfRows).
 *
 * TODO handle the case that the file is too large to send back one time
 */
static size_t getResultSize(SQInfo *pQInfo, int64_t *numOfRows) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  if (!isTSCompQuery(pQuery) || (*numOfRows) <= 0) {
    return pQuery->rowSize * (*numOfRows);
  }

  struct stat fileStat;
  if (stat(pQuery->sdata[0]->data, &fileStat) != 0) {
    qError("QInfo:%p failed to get file info, path:%s, reason:%s", pQInfo, pQuery->sdata[0]->data, strerror(errno));
    return 0;
  }

  // the whole file is the payload, so the row count is replaced by the file size
  *numOfRows = fileStat.st_size;
  return fileStat.st_size;
}
6113

H
hjxilinx 已提交
6114 6115 6116
/*
 * Copy the current batch of query results into the response buffer `data`.
 *
 * - ts-comp query: the result is the content of the temporary file named by
 *   pQuery->sdata[0]->data. The file is read into `data` and then unlinked. If the
 *   query status contains QUERY_COMPLETED it is switched to QUERY_OVER.
 * - any other query: doCopyQueryResultToMsg() copies pQuery->rec.rows rows.
 *
 * Afterwards rec.total is advanced by rec.rows and the query is marked QUERY_OVER when a
 * positive limit has been reached exactly.
 *
 * I/O failures are logged; the function still returns TSDB_CODE_SUCCESS as before.
 * todo return the error code to client
 */
static int32_t doDumpQueryResult(SQInfo *pQInfo, char *data) {
  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  // load data from file to msg buffer
  if (isTSCompQuery(pQuery)) {
    int32_t fd = open(pQuery->sdata[0]->data, O_RDONLY, 0666);

    // make sure file exist
    if (FD_VALID(fd)) {
      int64_t s = (int64_t)lseek(fd, 0, SEEK_END);
      qDebug("QInfo:%p ts comp data return, file:%s, size:%" PRId64, pQInfo, pQuery->sdata[0]->data, s);

      if (s < 0 || lseek(fd, 0, SEEK_SET) < 0) {
        // never hand a negative length to read(): it would be converted to a huge size_t
        qError("QInfo:%p failed to seek tmp file of ts-comp data, path:%s, reason:%s", pQInfo,
               pQuery->sdata[0]->data, strerror(errno));
      } else {
        // read() may return fewer bytes than requested, keep reading until the whole file is loaded
        int64_t total = 0;
        while (total < s) {
          ssize_t n = read(fd, data + total, (size_t)(s - total));
          if (n < 0) {
            qError("QInfo:%p failed to read tmp file of ts-comp data, path:%s, reason:%s", pQInfo,
                   pQuery->sdata[0]->data, strerror(errno));
            break;
          }

          if (n == 0) {  // unexpected end of file
            break;
          }

          total += n;
        }
      }

      close(fd);
      unlink(pQuery->sdata[0]->data);
    } else {
      // todo return the error code to client and handle invalid fd
      qError("QInfo:%p failed to open tmp file to send ts-comp data to client, path:%s, reason:%s", pQInfo,
             pQuery->sdata[0]->data, strerror(errno));
      if (fd != -1) {
        close(fd);
      }
    }

    // all data returned, set query over
    if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
      setQueryStatus(pQuery, QUERY_OVER);
    }
  } else {
    doCopyQueryResultToMsg(pQInfo, pQuery->rec.rows, data);
  }

  pQuery->rec.total += pQuery->rec.rows;
  qDebug("QInfo:%p current numOfRes rows:%" PRId64 ", total:%" PRId64, pQInfo, pQuery->rec.rows, pQuery->rec.total);

  if (pQuery->limit.limit > 0 && pQuery->limit.limit == pQuery->rec.total) {
    qDebug("QInfo:%p results limitation reached, limitation:%"PRId64, pQInfo, pQuery->limit.limit);
    setQueryStatus(pQuery, QUERY_OVER);
  }

  return TSDB_CODE_SUCCESS;
}

6164 6165 6166 6167 6168 6169 6170
// Bookkeeping object returned by qOpenQueryMgmt(); it owns the cache of registered query handles.
typedef struct SQueryMgmt {
  SCacheObj      *qinfoPool;      // query handle pool
  int32_t         vgId;           // id printed in the "vgId:%d" log messages of the mgmt functions
  bool            closed;         // set by qQueryMgmtNotifyClosed(); once true, qRegisterQInfo() refuses new handles
  pthread_mutex_t lock;           // taken while `closed` is set and while a new handle is registered
} SQueryMgmt;

H
Haojun Liao 已提交
6171
/*
 * Build a query handle from a query message.
 *
 * @param tsdb       storage handle passed through to the tsdb* lookup functions, must not be NULL
 * @param vgId       id forwarded to initQInfo()
 * @param pQueryMsg  incoming query message, must not be NULL
 * @param param      opaque value forwarded to initQInfo()
 * @param pQInfo     [out] receives the created handle; set to NULL whenever an error code is returned
 * @return TSDB_CODE_SUCCESS, or the error code of the step that failed
 *
 * The intermediate objects extracted from the message are released at `_over`. The ones
 * handed to createQInfoImpl() (pExprs, pGroupbyExpr, pTagColumnInfo) are reset to NULL
 * right after that call so they are not released a second time here.
 */
int32_t qCreateQueryInfo(void* tsdb, int32_t vgId, SQueryTableMsg* pQueryMsg, void* param, qinfo_t* pQInfo) {
  assert(pQueryMsg != NULL && tsdb != NULL);

  int32_t code = TSDB_CODE_SUCCESS;

  char            *tagCond  = NULL;
  char            *tbnameCond = NULL;
  SArray          *pTableIdList = NULL;
  SSqlFuncMsg    **pExprMsg = NULL;
  SExprInfo       *pExprs   = NULL;
  SColIndex       *pGroupColIndex = NULL;
  SColumnInfo     *pTagColumnInfo = NULL;
  SSqlGroupbyExpr *pGroupbyExpr   = NULL;

  code = convertQueryMsg(pQueryMsg, &pTableIdList, &pExprMsg, &tagCond, &tbnameCond, &pGroupColIndex, &pTagColumnInfo);
  if (code != TSDB_CODE_SUCCESS) {
    goto _over;
  }

  if (pQueryMsg->numOfTables <= 0) {
    qError("Invalid number of tables to query, numOfTables:%d", pQueryMsg->numOfTables);
    code = TSDB_CODE_QRY_INVALID_MSG;
    goto _over;
  }

  if (pTableIdList == NULL || taosArrayGetSize(pTableIdList) == 0) {
    qError("qmsg:%p, SQueryTableMsg wrong format", pQueryMsg);
    code = TSDB_CODE_QRY_INVALID_MSG;
    goto _over;
  }

  if ((code = createQFunctionExprFromMsg(pQueryMsg, &pExprs, pExprMsg, pTagColumnInfo)) != TSDB_CODE_SUCCESS) {
    goto _over;
  }

  pGroupbyExpr = createGroupbyExprFromMsg(pQueryMsg, pGroupColIndex, &code);
  if ((pGroupbyExpr == NULL && pQueryMsg->numOfGroupCols != 0) || code != TSDB_CODE_SUCCESS) {
    // a missing group-by expression must never be reported as success, otherwise the caller
    // would receive TSDB_CODE_SUCCESS together with an unassigned *pQInfo
    if (code == TSDB_CODE_SUCCESS) {
      code = TSDB_CODE_QRY_OUT_OF_MEMORY;
    }
    goto _over;
  }

  bool isSTableQuery = false;
  STableGroupInfo tableGroupInfo = {0};
  int64_t st = taosGetTimestampUs();

  if (TSDB_QUERY_HAS_TYPE(pQueryMsg->queryType, TSDB_QUERY_TYPE_TABLE_QUERY)) {
    STableIdInfo *id = taosArrayGet(pTableIdList, 0);

    qDebug("qmsg:%p query normal table, uid:%"PRId64", tid:%d", pQueryMsg, id->uid, id->tid);
    if ((code = tsdbGetOneTableGroup(tsdb, id->uid, &tableGroupInfo)) != TSDB_CODE_SUCCESS) {
      goto _over;
    }
  } else if (TSDB_QUERY_HAS_TYPE(pQueryMsg->queryType, TSDB_QUERY_TYPE_MULTITABLE_QUERY|TSDB_QUERY_TYPE_STABLE_QUERY)) {
    isSTableQuery = true;

    // also note there's possibility that only one table in the super table
    if (!TSDB_QUERY_HAS_TYPE(pQueryMsg->queryType, TSDB_QUERY_TYPE_MULTITABLE_QUERY)) {
      STableIdInfo *id = taosArrayGet(pTableIdList, 0);

      // group by normal column, do not pass the group by condition to tsdb to group table into different group
      int32_t numOfGroupByCols = pQueryMsg->numOfGroupCols;
      if (pQueryMsg->numOfGroupCols == 1 && !TSDB_COL_IS_TAG(pGroupColIndex->flag)) {
        numOfGroupByCols = 0;
      }

      qDebug("qmsg:%p query stable, uid:%"PRId64", tid:%d", pQueryMsg, id->uid, id->tid);
      code = tsdbQuerySTableByTagCond(tsdb, id->uid, tagCond, pQueryMsg->tagCondLen, pQueryMsg->tagNameRelType, tbnameCond, &tableGroupInfo, pGroupColIndex,
                                          numOfGroupByCols);
      if (code != TSDB_CODE_SUCCESS) {
        qError("qmsg:%p failed to query stable, reason: %s", pQueryMsg, tstrerror(code));
        goto _over;
      }
    } else {
      code = tsdbGetTableGroupFromIdList(tsdb, pTableIdList, &tableGroupInfo);
      if (code != TSDB_CODE_SUCCESS) {
        goto _over;
      }

      qDebug("qmsg:%p query on %zu tables in one group from client", pQueryMsg, tableGroupInfo.numOfTables);
    }

    int64_t el = taosGetTimestampUs() - st;
    qDebug("qmsg:%p tag filter completed, numOfTables:%zu, elapsed time:%"PRId64"us", pQueryMsg, tableGroupInfo.numOfTables, el);
  } else {
    assert(0);
  }

  (*pQInfo) = createQInfoImpl(pQueryMsg, pTableIdList, pGroupbyExpr, pExprs, &tableGroupInfo, pTagColumnInfo);

  // these objects were handed to createQInfoImpl(), do not release them again at _over
  pExprs = NULL;
  pGroupbyExpr = NULL;
  pTagColumnInfo = NULL;

  if ((*pQInfo) == NULL) {
    code = TSDB_CODE_QRY_OUT_OF_MEMORY;
    goto _over;
  }

  code = initQInfo(pQueryMsg, tsdb, vgId, *pQInfo, isSTableQuery, param);

_over:
  free(tagCond);
  free(tbnameCond);
  free(pGroupColIndex);
  if (pGroupbyExpr != NULL) {
    taosArrayDestroy(pGroupbyExpr->columnInfo);
    free(pGroupbyExpr);
  }
  free(pTagColumnInfo);
  free(pExprs);
  free(pExprMsg);
  taosArrayDestroy(pTableIdList);

  for (int32_t i = 0; i < pQueryMsg->numOfCols; i++) {
    SColumnInfo* column = pQueryMsg->colList + i;
    freeColumnFilterInfo(column->filters, column->numOfFilters);
  }

  // pQInfo already freed in initQInfo, but *pQInfo may not point to null
  if (code != TSDB_CODE_SUCCESS) {
    *pQInfo = NULL;
  }

  // if failed to add ref for all tables in this query, abort current query
  return code;
}

H
Haojun Liao 已提交
6296
/*
 * Release a query handle: log completion, print the query cost summary and free the
 * SQInfo. Handles rejected by isValidQInfo() are ignored.
 */
void qDestroyQueryInfo(qinfo_t qHandle) {
  SQInfo* pInfo = (SQInfo*) qHandle;

  if (isValidQInfo(pInfo)) {
    qDebug("QInfo:%p query completed", pInfo);
    queryCostStatis(pInfo);   // print the query cost summary
    freeQInfo(pInfo);
  }
}

6307
/*
 * Execute one round of the query behind `qinfo`.
 *
 * Nothing is done for a NULL handle or one whose signature does not match. For every
 * other exit path (query killed, no table to query, error delivered through the jump
 * buffer, normal end of the round) the dataReady semaphore is posted exactly once.
 */
void qTableQuery(qinfo_t qinfo) {
  SQInfo *pQInfo = (SQInfo *)qinfo;

  if (pQInfo == NULL || pQInfo->signature != pQInfo) {
    qDebug("QInfo:%p has been freed, no need to execute", pQInfo);
    return;
  }

  if (IS_QUERY_KILLED(pQInfo)) {
    qDebug("QInfo:%p it is already killed, abort", pQInfo);
    sem_post(&pQInfo->dataReady);
    return;
  }

  if (pQInfo->tableqinfoGroupInfo.numOfTables == 0) {
    qDebug("QInfo:%p no table exists for query, abort", pQInfo);
    sem_post(&pQInfo->dataReady);
    return;
  }

  SQueryRuntimeEnv* pRuntimeEnv = &pQInfo->runtimeEnv;

  // error occurs, record the error code and return to client
  int32_t jmpCode = setjmp(pRuntimeEnv->env);
  if (jmpCode != TSDB_CODE_SUCCESS) {
    pQInfo->code = jmpCode;
    qDebug("QInfo:%p query abort due to error/cancel occurs, code:%s", pQInfo, tstrerror(pQInfo->code));
    sem_post(&pQInfo->dataReady);
    return;
  }

  qDebug("QInfo:%p query task is launched", pQInfo);

  // dispatch to the executor that matches the kind of query
  if (onlyQueryTags(pRuntimeEnv->pQuery)) {
    assert(pRuntimeEnv->pQueryHandle == NULL);
    buildTagQueryResult(pQInfo);
  } else if (pRuntimeEnv->stableQuery) {
    stableQueryImpl(pQInfo);
  } else {
    tableQueryImpl(pQInfo);
  }

  SQuery* pQuery = pRuntimeEnv->pQuery;
  if (IS_QUERY_KILLED(pQInfo)) {
    qDebug("QInfo:%p query is killed", pQInfo);
  } else if (pQuery->rec.rows == 0) {
    qDebug("QInfo:%p over, %zu tables queried, %"PRId64" rows are returned", pQInfo, pQInfo->tableqinfoGroupInfo.numOfTables, pQuery->rec.total);
  } else {
    qDebug("QInfo:%p query paused, %" PRId64 " rows returned, numOfTotal:%" PRId64 " rows",
           pQInfo, pQuery->rec.rows, pQuery->rec.total + pQuery->rec.rows);
  }

  sem_post(&pQInfo->dataReady);
}

H
hjxilinx 已提交
6361
/*
 * Wait until the current round of the query has produced its result and report the
 * query's status code.
 *
 * @return TSDB_CODE_QRY_INVALID_QHANDLE for a NULL/invalid handle, otherwise pQInfo->code.
 *         A killed query returns immediately without waiting on the dataReady semaphore.
 */
int32_t qRetrieveQueryResultInfo(qinfo_t qinfo) {
  SQInfo *pQInfo = (SQInfo *)qinfo;
  if (pQInfo == NULL || !isValidQInfo(pQInfo)) {
    return TSDB_CODE_QRY_INVALID_QHANDLE;
  }

  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  if (!IS_QUERY_KILLED(pQInfo)) {
    sem_wait(&pQInfo->dataReady);
    qDebug("QInfo:%p retrieve result info, rowsize:%d, rows:%"PRId64", code:%d", pQInfo, pQuery->rowSize, pQuery->rec.rows,
           pQInfo->code);
  } else {
    qDebug("QInfo:%p query is killed, code:%d", pQInfo, pQInfo->code);
  }

  return pQInfo->code;
}
6380

H
hjxilinx 已提交
6381
/*
 * Tell whether the client should issue another retrieve for this query.
 *
 * @return false for a NULL/invalid handle, for a query whose code is not
 *         TSDB_CODE_SUCCESS, or when the status contains QUERY_OVER; true when the status
 *         contains QUERY_RESBUF_FULL or QUERY_COMPLETED.
 */
bool qHasMoreResultsToRetrieve(qinfo_t qinfo) {
  SQInfo *pQInfo = (SQInfo *)qinfo;

  // the handle must be validated before any of its fields is read (including for logging)
  if (pQInfo == NULL || !isValidQInfo(pQInfo)) {
    qDebug("QInfo:%p invalid qhandle, abort query", pQInfo);
    return false;
  }

  if (pQInfo->code != TSDB_CODE_SUCCESS) {
    qDebug("QInfo:%p invalid qhandle or error occurs, abort query, code:%x", pQInfo, pQInfo->code);
    return false;
  }

  SQuery *pQuery = pQInfo->runtimeEnv.pQuery;

  bool ret = false;
  if (Q_STATUS_EQUAL(pQuery->status, QUERY_OVER)) {
    ret = false;
  } else if (Q_STATUS_EQUAL(pQuery->status, QUERY_RESBUF_FULL)) {
    ret = true;
  } else if (Q_STATUS_EQUAL(pQuery->status, QUERY_COMPLETED)) {
    ret = true;
  } else {
    assert(0);
  }

  if (ret) {
    qDebug("QInfo:%p has more results waits for client retrieve", pQInfo);
  }

  return ret;
}

6408 6409 6410
/*
 * Allocate the retrieve response and fill it with the current batch of results.
 *
 * @param qinfo    query handle
 * @param pRsp     [out] response allocated with rpcMallocCont()
 * @param contLen  [out] length requested for the response: header + result payload +
 *                 one int32_t + one STableIdInfo per entry of pQInfo->arrTableIdInfo
 * @return TSDB_CODE_QRY_INVALID_QHANDLE for a NULL/invalid handle,
 *         TSDB_CODE_QRY_OUT_OF_MEMORY if the response cannot be allocated, otherwise the
 *         result of doDumpQueryResult() or the query's own code.
 */
int32_t qDumpRetrieveResult(qinfo_t qinfo, SRetrieveTableRsp **pRsp, int32_t *contLen) {
  SQInfo *pQInfo = (SQInfo *)qinfo;
  if (pQInfo == NULL || !isValidQInfo(pQInfo)) {
    return TSDB_CODE_QRY_INVALID_QHANDLE;
  }

  SQueryRuntimeEnv* pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *pQuery = pRuntimeEnv->pQuery;

  // note: getResultSize() may rewrite rec.rows for a ts-comp query
  size_t payloadLen = getResultSize(pQInfo, &pQuery->rec.rows);
  payloadLen += sizeof(int32_t);
  payloadLen += sizeof(STableIdInfo) * taosArrayGetSize(pQInfo->arrTableIdInfo);
  *contLen = payloadLen + sizeof(SRetrieveTableRsp);

  // todo proper handle failed to allocate memory,
  // current solution only avoid crash, but cannot return error code to client
  *pRsp = (SRetrieveTableRsp *)rpcMallocCont(*contLen);
  if (*pRsp == NULL) {
    return TSDB_CODE_QRY_OUT_OF_MEMORY;
  }

  SRetrieveTableRsp *pRetrieveRsp = *pRsp;
  pRetrieveRsp->numOfRows = htonl(pQuery->rec.rows);

  int32_t code = pQInfo->code;
  bool    succ = (code == TSDB_CODE_SUCCESS);

  // offset and elapsed time are only reported for a query without error
  if (succ) {
    pRetrieveRsp->offset = htobe64(pQuery->limit.offset);
    pRetrieveRsp->useconds = htobe64(pRuntimeEnv->summary.elapsedTime);
  } else {
    pRetrieveRsp->offset = 0;
    pRetrieveRsp->useconds = 0;
  }

  pRetrieveRsp->precision = htons(pQuery->precision);

  if (pQuery->rec.rows > 0 && succ) {
    code = doDumpQueryResult(pQInfo, pRetrieveRsp->data);
  } else {
    setQueryStatus(pQuery, QUERY_OVER);
    code = pQInfo->code;
  }

  if (IS_QUERY_KILLED(pQInfo) || Q_STATUS_EQUAL(pQuery->status, QUERY_OVER)) {
    pRetrieveRsp->completed = 1;  // notify no more result to client
  }

  return code;
}
H
hjxilinx 已提交
6453

H
Haojun Liao 已提交
6454
/*
 * Mark a query as killed: the dataReady semaphore is posted first, then the killed
 * flag is set through setQueryKilled().
 *
 * @return TSDB_CODE_QRY_INVALID_QHANDLE for a NULL/invalid handle, else TSDB_CODE_SUCCESS.
 */
int32_t qKillQuery(qinfo_t qinfo) {
  SQInfo *pInfo = (SQInfo *)qinfo;

  bool valid = (pInfo != NULL) && isValidQInfo(pInfo);
  if (!valid) {
    return TSDB_CODE_QRY_INVALID_QHANDLE;
  }

  sem_post(&pInfo->dataReady);
  setQueryKilled(pInfo);

  return TSDB_CODE_SUCCESS;
}

6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481
/*
 * Write one tag value into the result buffer.
 *
 * @param output  destination in the result buffer
 * @param val     source value, NULL means "no value"
 * @param type    TSDB data type of the value
 * @param bytes   byte width used for the types other than BINARY/NCHAR
 *
 * A NULL value is stored as the type's null representation. A BINARY/NCHAR value is
 * copied with the length given by varDataTLen(val), any other value with `bytes` bytes.
 */
static void doSetTagValueToResultBuf(char* output, const char* val, int16_t type, int16_t bytes) {
  bool isVarType = (type == TSDB_DATA_TYPE_BINARY || type == TSDB_DATA_TYPE_NCHAR);

  if (val == NULL) {
    if (isVarType) {
      setVardataNull(output, type);
    } else {
      setNull(output, type, bytes);
    }
    return;
  }

  if (isVarType) {
    memcpy(output, val, varDataTLen(val));
  } else {  // todo here stop will cause client crash
    memcpy(output, val, bytes);
  }
}

H
hjxilinx 已提交
6482 6483 6484
/*
 * Produce the result of a query that only touches tags / table names, without reading
 * any time series data. Nothing is done when there is no table group.
 *
 * The first output expression selects one of three modes:
 *  - TSDB_FUNC_TID_TAG: one var-length cell per table holding uid, tid, vgId and then
 *    either the table name or the tag value;
 *  - TSDB_FUNC_COUNT:   a single row holding the number of tables ("count(tbname)");
 *  - otherwise:         one row per table with the requested tag / table name columns,
 *    honoring limit and offset.
 *
 * pQInfo->tableIndex remembers how many tables have been consumed so far. On return
 * rec.rows holds the number of rows of this batch and the status is QUERY_COMPLETED.
 */
static void buildTagQueryResult(SQInfo* pQInfo) {
  SQueryRuntimeEnv *pRuntimeEnv = &pQInfo->runtimeEnv;
  SQuery *          pQuery = pRuntimeEnv->pQuery;

  size_t numOfGroup = GET_NUM_OF_TABLEGROUP(pQInfo);
  assert(numOfGroup == 0 || numOfGroup == 1);

  if (numOfGroup == 0) {
    return;
  }

  SArray* pa = GET_TABLEGROUP(pQInfo, 0);

  size_t num = taosArrayGetSize(pa);
  assert(num == pQInfo->tableqinfoGroupInfo.numOfTables);

  int32_t count = 0;
  int32_t functionId = pQuery->pSelectExpr[0].base.functionId;
  if (functionId == TSDB_FUNC_TID_TAG) { // return the tags & table Id
    assert(pQuery->numOfOutput == 1);

    SExprInfo* pExprInfo = &pQuery->pSelectExpr[0];
    int32_t rsize = pExprInfo->bytes;
    count = 0;

    int16_t bytes = pExprInfo->bytes;
    int16_t type = pExprInfo->type;

    // prefer the type/width recorded in the tag column list when the column is found there
    for(int32_t i = 0; i < pQuery->numOfTags; ++i) {
      if (pQuery->tagColList[i].colId == pExprInfo->base.colInfo.colId) {
        bytes = pQuery->tagColList[i].bytes;
        type = pQuery->tagColList[i].type;
        break;
      }
    }

    while(pQInfo->tableIndex < num && count < pQuery->rec.capacity) {
      int32_t i = pQInfo->tableIndex++;
      STableQueryInfo *item = taosArrayGetP(pa, i);

      // rows of this batch start at the head of the result buffer: the slot is selected by
      // the batch-local row counter, not by the table index, which keeps growing across
      // batches and would run past the buffer
      char *output = pQuery->sdata[0]->data + count * rsize;
      varDataSetLen(output, rsize - VARSTR_HEADER_SIZE);

      output = varDataVal(output);
      STableId* id = TSDB_TABLEID(item->pTable);

      // the cell is not guaranteed to be aligned for 64/32-bit stores, copy byte-wise
      int64_t uid = id->uid;
      memcpy(output, &uid, sizeof(uid));
      output += sizeof(id->uid);

      int32_t tid = id->tid;
      memcpy(output, &tid, sizeof(tid));
      output += sizeof(id->tid);

      int32_t vgId = pQInfo->vgId;
      memcpy(output, &vgId, sizeof(vgId));
      output += sizeof(pQInfo->vgId);

      if (pExprInfo->base.colInfo.colId == TSDB_TBNAME_COLUMN_INDEX) {
        char* data = tsdbGetTableName(item->pTable);
        memcpy(output, data, varDataTLen(data));
      } else {
        char* data = tsdbGetTableTagVal(item->pTable, pExprInfo->base.colInfo.colId, type, bytes);
        doSetTagValueToResultBuf(output, data, type, bytes);
      }

      count += 1;
    }

    qDebug("QInfo:%p create (tableId, tag) info completed, rows:%d", pQInfo, count);

  } else if (functionId == TSDB_FUNC_COUNT) {// handle the "count(tbname)" query
    *(int64_t*) pQuery->sdata[0]->data = num;

    count = 1;
    pQInfo->tableIndex = num;  //set query completed
    qDebug("QInfo:%p create count(tbname) query, res:%d rows:1", pQInfo, count);
  } else {  // return only the tags|table name etc.
    count = 0;
    SSchema tbnameSchema = tGetTableNameColumnSchema();

    // never produce more rows than the limit, nor more than the buffer can hold
    int32_t maxNumOfTables = pQuery->rec.capacity;
    if (pQuery->limit.limit >= 0 && pQuery->limit.limit < pQuery->rec.capacity) {
      maxNumOfTables = pQuery->limit.limit;
    }

    while(pQInfo->tableIndex < num && count < maxNumOfTables) {
      int32_t i = pQInfo->tableIndex++;

      // discard current result due to offset
      if (pQuery->limit.offset > 0) {
        pQuery->limit.offset -= 1;
        continue;
      }

      SExprInfo* pExprInfo = pQuery->pSelectExpr;
      STableQueryInfo* item = taosArrayGetP(pa, i);

      char *data = NULL, *dst = NULL;
      int16_t type = 0, bytes = 0;
      for(int32_t j = 0; j < pQuery->numOfOutput; ++j) {

        if (pExprInfo[j].base.colInfo.colId == TSDB_TBNAME_COLUMN_INDEX) {
          bytes = tbnameSchema.bytes;
          type = tbnameSchema.type;

          data = tsdbGetTableName(item->pTable);
          dst = pQuery->sdata[j]->data + count * tbnameSchema.bytes;
        } else {
          type = pExprInfo[j].type;
          bytes = pExprInfo[j].bytes;

          data = tsdbGetTableTagVal(item->pTable, pExprInfo[j].base.colInfo.colId, type, bytes);
          dst = pQuery->sdata[j]->data + count * pExprInfo[j].bytes;

        }

        doSetTagValueToResultBuf(dst, data, type, bytes);
      }
      count += 1;
    }

    qDebug("QInfo:%p create tag values results completed, rows:%d", pQInfo, count);
  }

  pQuery->rec.rows = count;
  setQueryStatus(pQuery, QUERY_COMPLETED);
}

6608 6609 6610 6611 6612 6613 6614
/*
 * Release callback given to taosCacheInit() by qOpenQueryMgmt(). `qhandle` points to the
 * slot that stores the query handle; the query is killed first and then destroyed.
 * A NULL slot or a slot holding NULL is ignored.
 */
void freeqinfoFn(void *qhandle) {
  void** pSlot = qhandle;

  if (pSlot != NULL && *pSlot != NULL) {
    qKillQuery(*pSlot);
    qDestroyQueryInfo(*pSlot);
  }
}

void* qOpenQueryMgmt(int32_t vgId) {
  const int32_t REFRESH_HANDLE_INTERVAL = 2; // every 2 seconds, refresh handle pool

  char cacheName[128] = {0};
  sprintf(cacheName, "qhandle_%d", vgId);

  SQueryMgmt* pQueryHandle = calloc(1, sizeof(SQueryMgmt));

  pQueryHandle->qinfoPool = taosCacheInit(TSDB_DATA_TYPE_BIGINT, REFRESH_HANDLE_INTERVAL, true, freeqinfoFn, cacheName);
  pQueryHandle->closed    = false;
  pthread_mutex_init(&pQueryHandle->lock, NULL);

  qDebug("vgId:%d, open querymgmt success", vgId);
  return pQueryHandle;
}

H
Haojun Liao 已提交
6634
/*
 * Callback passed to taosCacheRefresh() when the manager is closed: `handle` points to
 * the slot storing a query handle, and that query is killed. A NULL slot or a slot
 * holding NULL is ignored, in line with freeqinfoFn().
 */
static void queryMgmtKillQueryFn(void* handle) {
  void** fp = (void**)handle;
  if (fp == NULL || *fp == NULL) {
    return;
  }

  qKillQuery(*fp);
}

void qQueryMgmtNotifyClosed(void* pQMgmt) {
6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650
  if (pQMgmt == NULL) {
    return;
  }

  SQueryMgmt* pQueryMgmt = pQMgmt;
  qDebug("vgId:%d, set querymgmt closed, wait for all queries cancelled", pQueryMgmt->vgId);

  pthread_mutex_lock(&pQueryMgmt->lock);
  pQueryMgmt->closed = true;
  pthread_mutex_unlock(&pQueryMgmt->lock);

H
Haojun Liao 已提交
6651
  taosCacheRefresh(pQueryMgmt->qinfoPool, queryMgmtKillQueryFn);
6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673
}

/*
 * Destroy a manager that has already been marked closed (asserted): the handle cache is
 * cleaned up, the mutex destroyed and the manager freed. A NULL manager is ignored.
 */
void qCleanupQueryMgmt(void* pQMgmt) {
  SQueryMgmt* pMgmtObj = pQMgmt;
  if (pMgmtObj == NULL) {
    return;
  }

  // keep the id for the final log line, the object is gone by then
  int32_t vgId = pMgmtObj->vgId;

  assert(pMgmtObj->closed);

  // detach the pool from the manager before cleaning it up
  SCacheObj* pPool = pMgmtObj->qinfoPool;
  pMgmtObj->qinfoPool = NULL;
  taosCacheCleanup(pPool);

  pthread_mutex_destroy(&pMgmtObj->lock);
  tfree(pMgmtObj);

  qDebug("vgId:%d querymgmt cleanup completed", vgId);
}

6674
void** qRegisterQInfo(void* pMgmt, uint64_t qInfo) {
6675 6676 6677 6678
  if (pMgmt == NULL) {
    return NULL;
  }

6679 6680
  const int32_t DEFAULT_QHANDLE_LIFE_SPAN = tsShellActivityTimer * 2;

6681 6682
  SQueryMgmt *pQueryMgmt = pMgmt;
  if (pQueryMgmt->qinfoPool == NULL) {
6683
    qError("QInfo:%p failed to add qhandle into qMgmt, since qMgmt is closed", (void *)qInfo);
6684 6685 6686 6687 6688 6689
    return NULL;
  }

  pthread_mutex_lock(&pQueryMgmt->lock);
  if (pQueryMgmt->closed) {
    pthread_mutex_unlock(&pQueryMgmt->lock);
6690
    qError("QInfo:%p failed to add qhandle into cache, since qMgmt is colsing", (void *)qInfo);
6691 6692
    return NULL;
  } else {
6693 6694 6695
    uint64_t handleVal = (uint64_t) qInfo;

    void** handle = taosCachePut(pQueryMgmt->qinfoPool, &handleVal, sizeof(int64_t), &qInfo, POINTER_BYTES, DEFAULT_QHANDLE_LIFE_SPAN);
6696 6697 6698 6699 6700 6701
    pthread_mutex_unlock(&pQueryMgmt->lock);

    return handle;
  }
}

6702
void** qAcquireQInfo(void* pMgmt, uint64_t key) {
6703 6704 6705 6706 6707 6708
  SQueryMgmt *pQueryMgmt = pMgmt;

  if (pQueryMgmt->qinfoPool == NULL || pQueryMgmt->closed) {
    return NULL;
  }

6709
  void** handle = taosCacheAcquireByKey(pQueryMgmt->qinfoPool, &key, sizeof(uint64_t));
6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727
  if (handle == NULL || *handle == NULL) {
    return NULL;
  } else {
    return handle;
  }
}

/*
 * Hand a handle obtained from qRegisterQInfo()/qAcquireQInfo() back to the cache.
 *
 * @param pMgmt     manager returned by qOpenQueryMgmt(), may be NULL
 * @param pQInfo    value passed to taosCacheRelease()
 * @param needFree  forwarded to taosCacheRelease()
 * @return always NULL
 */
void** qReleaseQInfo(void* pMgmt, void* pQInfo, bool needFree) {
  SQueryMgmt *pQueryMgmt = pMgmt;

  // tolerate a NULL manager in the same way as qRegisterQInfo()
  if (pQueryMgmt == NULL || pQueryMgmt->qinfoPool == NULL) {
    return NULL;
  }

  taosCacheRelease(pQueryMgmt->qinfoPool, pQInfo, needFree);
  return NULL;
}