schedulerInt.h 12.8 KB
Newer Older
H
refact  
Hongze Cheng 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef _TD_SCHEDULER_INT_H_
#define _TD_SCHEDULER_INT_H_

#ifdef __cplusplus
extern "C" {
#endif

23 24 25 26
#include "os.h"
#include "tarray.h"
#include "planner.h"
#include "scheduler.h"
27
#include "thash.h"
D
dapan1121 已提交
28
#include "trpc.h"
D
dapan1121 已提交
29
#include "command.h"
30

D
dapan1121 已提交
31 32
/* Default scheduler limits. */
#define SCHEDULE_DEFAULT_MAX_JOB_NUM        1000
#define SCHEDULE_DEFAULT_MAX_TASK_NUM       1000
#define SCHEDULE_DEFAULT_MAX_NODE_TABLE_NUM 200  // unit is TSDB_TABLE_NUM_UNIT

/* Candidate endpoint limit; defined as an alias of TSDB_MAX_REPLICA. */
#define SCH_MAX_CANDIDATE_EP_NUM TSDB_MAX_REPLICA
D
dapan 已提交
36

D
dapan1121 已提交
37 38 39 40 41
/* Lock modes; SCH_LOCK / SCH_UNLOCK compare their `type` argument against SCH_READ. */
enum {
  SCH_READ  = 1,
  SCH_WRITE = 2
};

D
dapan1121 已提交
42 43 44 45 46
/* Callback kind identifiers; numbering starts at 1. */
enum {
  SCH_EXEC_CB  = 1,
  SCH_FETCH_CB = 2
};

D
dapan1121 已提交
47
/*
 * Pair of opaque pointers describing a transport.
 * NOTE(review): ownership and lifetime of both pointers are not visible in
 * this header -- confirm against the code that fills them in.
 */
typedef struct SSchTrans {
  void *pTrans;   // opaque transport pointer
  void *pHandle;  // opaque handle pointer
} SSchTrans;

D
dapan1121 已提交
52 53
typedef struct SSchHbTrans {
  SRWLatch  lock;
D
dapan1121 已提交
54
  SRpcCtx   rpcCtx;
D
dapan1121 已提交
55 56 57
  SSchTrans trans;
} SSchHbTrans;

D
dapan1121 已提交
58 59
/* API statistics placeholder; no counters are declared yet. */
typedef struct SSchApiStat {
#ifdef WINDOWS
  size_t avoidCompilationErrors;  // dummy member so the struct is not empty on WINDOWS builds
#endif
} SSchApiStat;

/* Runtime statistics placeholder; no counters are declared yet. */
typedef struct SSchRuntimeStat {
#ifdef WINDOWS
  size_t avoidCompilationErrors;  // dummy member so the struct is not empty on WINDOWS builds
#endif
} SSchRuntimeStat;

/* Job statistics placeholder; no counters are declared yet. */
typedef struct SSchJobStat {
#ifdef WINDOWS
  size_t avoidCompilationErrors;  // dummy member so the struct is not empty on WINDOWS builds
#endif
} SSchJobStat;

D
dapan1121 已提交
82
typedef struct SSchStat {
D
dapan1121 已提交
83 84 85
  SSchApiStat      api;
  SSchRuntimeStat  runtime;
  SSchJobStat      job;
D
dapan1121 已提交
86
} SSchStat;
D
dapan1121 已提交
87

D
dapan1121 已提交
88
typedef struct SSchResInfo {
D
dapan1121 已提交
89 90 91 92 93
  SQueryResult*          queryRes;
  void**                 fetchRes;
  schedulerExecCallback  execFp; 
  schedulerFetchCallback fetchFp; 
  void*                  userParam;
D
dapan1121 已提交
94
} SSchResInfo;
D
dapan1121 已提交
95

96
typedef struct SSchedulerMgmt {
D
dapan1121 已提交
97 98 99
  uint64_t        taskId; // sequential taksId
  uint64_t        sId;    // schedulerId
  SSchedulerCfg   cfg;
D
dapan1121 已提交
100
  SRWLatch        lock;
D
dapan1121 已提交
101
  bool            exit;
D
dapan1121 已提交
102
  int32_t         jobRef;
D
dapan1121 已提交
103
  int32_t         jobNum;
D
dapan1121 已提交
104
  SSchStat        stat;
D
dapan1121 已提交
105
  SHashObj       *hbConnections;
106
} SSchedulerMgmt;
107

D
dapan1121 已提交
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
/*
 * Header placed as the first member (`head`) of both SSchTaskCallbackParam
 * and SSchHbCallbackParam.
 */
typedef struct SSchCallbackParamHeader {
  bool isHbParam;  // NOTE(review): presumably true when the enclosing param is the heartbeat variant -- confirm
} SSchCallbackParamHeader;

/* Callback parameter carrying the query, job reference and task identifiers of one task. */
typedef struct SSchTaskCallbackParam {
  SSchCallbackParamHeader head;       // common header, must stay first
  uint64_t                queryId;
  int64_t                 refId;      // same type as SSchJob.refId
  uint64_t                taskId;
  void                   *transport;  // opaque transport pointer
} SSchTaskCallbackParam;

/* Callback parameter for heartbeat messages: identifies the node endpoint and the transport used. */
typedef struct SSchHbCallbackParam {
  SSchCallbackParamHeader head;      // common header, must stay first
  SQueryNodeEpId          nodeEpId;
  void                   *pTrans;    // opaque transport pointer
} SSchHbCallbackParam;
D
dapan1121 已提交
125

D
dapan1121 已提交
126 127
typedef struct SSchFlowControl {
  SRWLatch  lock;
D
dapan1121 已提交
128
  bool      sorted;
D
dapan 已提交
129
  int32_t   tableNumSum;
D
dapan1121 已提交
130
  uint32_t  execTaskNum;
D
dapan1121 已提交
131
  SArray   *taskList;      // Element is SSchTask*
D
dapan1121 已提交
132 133
} SSchFlowControl;

D
dapan1121 已提交
134 135 136 137 138
/* One tried node of a task: its address and an opaque handle (element type of SSchTask.execNodes). */
typedef struct SSchNodeInfo {
  SQueryNodeAddr  addr;    // node address
  void           *handle;  // opaque handle associated with that node
} SSchNodeInfo;

D
dapan 已提交
139
typedef struct SSchLevel {
D
dapan1121 已提交
140 141 142 143 144 145 146
  int32_t         level;
  int8_t          status;
  SRWLatch        lock;
  int32_t         taskFailed;
  int32_t         taskSucceed;
  int32_t         taskNum;
  int32_t         taskLaunchedNum;
D
dapan1121 已提交
147
  int32_t         taskDoneNum;
D
dapan1121 已提交
148
  SArray         *subTasks;      // Element is SQueryTask
D
dapan 已提交
149
} SSchLevel;
D
dapan1121 已提交
150

D
dapan 已提交
151
typedef struct SSchTask {
D
dapan1121 已提交
152
  uint64_t             taskId;         // task id
D
dapan1121 已提交
153
  SRWLatch             lock;           // task lock
D
dapan1121 已提交
154 155 156 157 158
  SSchLevel           *level;          // level
  SSubplan            *plan;           // subplan
  char                *msg;            // operator tree
  int32_t              msgLen;         // msg length
  int8_t               status;         // task status
D
dapan1121 已提交
159
  int32_t              lastMsgType;    // last sent msg type
D
dapan1121 已提交
160
  int32_t              tryTimes;       // task already tried times
D
dapan1121 已提交
161
  SQueryNodeAddr       succeedAddr;    // task executed success node address
162 163
  int8_t               candidateIdx;   // current try condidation index
  SArray              *candidateAddrs; // condidate node addresses, element is SQueryNodeAddr
D
dapan1121 已提交
164
  SArray              *execNodes;      // all tried node for current task, element is SSchNodeInfo
D
dapan1121 已提交
165 166 167 168
  SQueryProfileSummary summary;        // task execution summary
  int32_t              childReady;     // child task ready number
  SArray              *children;       // the datasource tasks,from which to fetch the result, element is SQueryTask*
  SArray              *parents;        // the data destination tasks, get data from current task, element is SQueryTask*
D
dapan1121 已提交
169
  void*                handle;          // task send handle 
D
dapan 已提交
170
} SSchTask;
D
dapan1121 已提交
171

D
dapan 已提交
172
typedef struct SSchJobAttr {
D
dapan1121 已提交
173 174 175 176
  EExplainMode explainMode;
  bool         syncSchedule;
  bool         queryJob;
  bool         needFlowCtrl;
D
dapan 已提交
177
} SSchJobAttr;
D
dapan1121 已提交
178

D
dapan 已提交
179
typedef struct SSchJob {
D
dapan1121 已提交
180
  int64_t          refId;
181
  uint64_t         queryId;
D
dapan1121 已提交
182
  SSchJobAttr      attr;
183
  int32_t          levelNum;
D
dapan1121 已提交
184
  int32_t          taskNum;
D
dapan1121 已提交
185
  void            *pTrans;
D
dapan1121 已提交
186 187
  SArray          *nodeList;   // qnode/vnode list, SArray<SQueryNodeAddr>
  SArray          *levels;    // starting from 0. SArray<SSchLevel>
X
Xiaoyu Wang 已提交
188
  SNodeList       *subPlans;  // subplan pointer copied from DAG, no need to free it in scheduler
D
dapan1121 已提交
189

D
dapan1121 已提交
190
  SArray          *dataSrcTasks; // SArray<SQueryTask*>
191
  int32_t          levelIdx;
D
dapan1121 已提交
192
  SEpSet           dataSrcEps;
D
dapan1121 已提交
193 194 195
  SHashObj        *execTasks; // executing tasks, key:taskid, value:SQueryTask*
  SHashObj        *succTasks; // succeed tasks, key:taskid, value:SQueryTask*
  SHashObj        *failTasks; // failed tasks, key:taskid, value:SQueryTask*
D
dapan1121 已提交
196
  SHashObj        *flowCtrl;  // key is ep, element is SSchFlowControl
D
dapan1121 已提交
197

D
dapan1121 已提交
198
  SExplainCtx     *explainCtx;
D
dapan1121 已提交
199
  int8_t           status;  
D
dapan1121 已提交
200
  SQueryNodeAddr   resNode;
D
dapan 已提交
201
  tsem_t           rspSem;
D
dapan1121 已提交
202
  int8_t           userFetch;
D
dapan 已提交
203
  int32_t          remoteFetch;
D
dapan1121 已提交
204
  SSchTask        *fetchTask;
D
dapan 已提交
205
  int32_t          errCode;
D
dapan 已提交
206
  SRWLatch         resLock;
D
dapan1121 已提交
207
  void            *queryRes;
D
dapan1121 已提交
208
  void            *resData;         //TODO free it or not
D
dapan1121 已提交
209
  int32_t          resNumOfRows;
D
dapan1121 已提交
210
  SSchResInfo      userRes;
211
  const char      *sql;
D
dapan1121 已提交
212
  int32_t          userCb;
213
  SQueryProfileSummary summary;
D
dapan 已提交
214
} SSchJob;
D
dapan1121 已提交
215

D
dapan1121 已提交
216 217
/* Scheduler-wide management state; only declared here, the definition is not part of this header. */
extern SSchedulerMgmt schMgmt;

D
dapan1121 已提交
218
/* True once the ready count has reached the size of the task's `children` array. */
#define SCH_TASK_READY_FOR_LAUNCH(readyCnt_, pTask_) \
  ((readyCnt_) >= taosArrayGetSize((pTask_)->children))
D
dapan1121 已提交
219

D
dapan1121 已提交
220 221 222 223
/* NULL-tolerant task accessors: a NULL task yields -1 on reads and is skipped on writes. */
#define SCH_TASK_ID(t_) ((t_) ? (t_)->taskId : -1)

#define SCH_SET_TASK_LASTMSG_TYPE(t_, msgType_)          \
  do {                                                   \
    if (t_) {                                            \
      atomic_store_32(&(t_)->lastMsgType, msgType_);     \
    }                                                    \
  } while (0)

#define SCH_GET_TASK_LASTMSG_TYPE(t_) \
  ((t_) ? atomic_load_32(&(t_)->lastMsgType) : -1)

D
dapan1121 已提交
224 225 226
/* Task classification by subplan type and by position of its level within the job. */
#define SCH_IS_DATA_SRC_QRY_TASK(pTask_) \
  ((pTask_)->plan->subplanType == SUBPLAN_TYPE_SCAN)

#define SCH_IS_DATA_SRC_TASK(pTask_)                     \
  (((pTask_)->plan->subplanType == SUBPLAN_TYPE_SCAN) || \
   ((pTask_)->plan->subplanType == SUBPLAN_TYPE_MODIFY))

/* A task is a leaf when its level is the last one of the job (level index + 1 == levelNum). */
#define SCH_IS_LEAF_TASK(job_, pTask_) \
  (((pTask_)->level->level + 1) == (job_)->levelNum)
227

H
Haojun Liao 已提交
228
/* Atomic accessors for SSchTask.status, plus a string form obtained through jobTaskStatusStr(). */
#define SCH_SET_TASK_STATUS(task, st) atomic_store_8(&(task)->status, (st))
#define SCH_GET_TASK_STATUS(task) atomic_load_8(&(task)->status)
#define SCH_GET_TASK_STATUS_STR(task) jobTaskStatusStr(SCH_GET_TASK_STATUS(task))

D
dapan1121 已提交
232 233
/* Task handle accessors; the getter tolerates a NULL task, the setter does not. */
#define SCH_GET_TASK_HANDLE(pTask_) \
  ((pTask_) ? (pTask_)->handle : NULL)
#define SCH_SET_TASK_HANDLE(pTask_, newHandle_) \
  ((pTask_)->handle = (newHandle_))
D
dapan1121 已提交
234

H
Haojun Liao 已提交
235
/* Atomic accessors for SSchJob.status, plus a string form obtained through jobTaskStatusStr(). */
#define SCH_SET_JOB_STATUS(job, st) atomic_store_8(&(job)->status, (st))
#define SCH_GET_JOB_STATUS(job) atomic_load_8(&(job)->status)
#define SCH_GET_JOB_STATUS_STR(job) jobTaskStatusStr(SCH_GET_JOB_STATUS(job))
D
dapan1121 已提交
238

D
dapan1121 已提交
239 240
/*
 * Flow-control flag helpers.
 * The setter's expansion is fully parenthesized so it stays a single
 * expression wherever it is used (e.g. after a unary operator).
 */
#define SCH_SET_JOB_NEED_FLOW_CTRL(_job) ((_job)->attr.needFlowCtrl = true)
#define SCH_JOB_NEED_FLOW_CTRL(_job) ((_job)->attr.needFlowCtrl)
/* A task needs flow control when it is a scan task, its job has the flag set and its level still has tasks to launch. */
#define SCH_TASK_NEED_FLOW_CTRL(_job, _task) (SCH_IS_DATA_SRC_QRY_TASK(_task) && SCH_JOB_NEED_FLOW_CTRL(_job) && SCH_IS_LEVEL_UNFINISHED((_task)->level))
D
dapan1121 已提交
242 243 244 245

/*
 * Job type helpers built on SSchJobAttr.queryJob / explainMode.
 * SCH_SET_JOB_TYPE marks the job as a query job unless `type` is
 * SUBPLAN_TYPE_MODIFY; its expansion is fully parenthesized so it behaves
 * as a single expression in any context.
 */
#define SCH_SET_JOB_TYPE(_job, type) ((_job)->attr.queryJob = ((type) != SUBPLAN_TYPE_MODIFY))
#define SCH_IS_QUERY_JOB(_job) ((_job)->attr.queryJob)
#define SCH_JOB_NEED_FETCH(_job) SCH_IS_QUERY_JOB(_job)
#define SCH_IS_WAIT_ALL_JOB(_job) (!SCH_IS_QUERY_JOB(_job))
#define SCH_IS_NEED_DROP_JOB(_job) (SCH_IS_QUERY_JOB(_job))
#define SCH_IS_EXPLAIN_JOB(_job) (EXPLAIN_MODE_ANALYZE == (_job)->attr.explainMode)
D
dapan1121 已提交
249

D
dapan1121 已提交
250
/* A level is unfinished while fewer tasks have been launched than it contains. */
#define SCH_IS_LEVEL_UNFINISHED(_level) ((_level)->taskLaunchedNum < (_level)->taskNum)

/* Address of the endpoint currently selected by epSet.inUse. */
#define SCH_GET_CUR_EP(_addr) (&(_addr)->epSet.eps[(_addr)->epSet.inUse])

/*
 * Advance epSet.inUse to the next endpoint, wrapping around at numOfEps.
 * When numOfEps is not positive the index is left unchanged instead of
 * taking a modulo by zero (undefined behaviour).
 */
#define SCH_SWITCH_EPSET(_addr)                                          \
  ((_addr)->epSet.inUse =                                                \
       ((_addr)->epSet.numOfEps > 0                                      \
            ? ((_addr)->epSet.inUse + 1) % (_addr)->epSet.numOfEps       \
            : (_addr)->epSet.inUse))

#define SCH_TASK_NUM_OF_EPS(_addr) ((_addr)->epSet.numOfEps)
D
dapan1121 已提交
254

H
Haojun Liao 已提交
255 256
/*
 * Job-scoped logging: prefixes the message with the query id.
 * Both macros read `pJob` from the scope where they are expanded and need
 * at least one argument after the format string.
 */
#define SCH_JOB_ELOG(fmt, ...) \
  qError("QID:0x%" PRIx64 " " fmt, pJob->queryId, __VA_ARGS__)
#define SCH_JOB_DLOG(fmt, ...) \
  qDebug("QID:0x%" PRIx64 " " fmt, pJob->queryId, __VA_ARGS__)
S
Shengliang Guan 已提交
257 258

/*
 * Task-scoped logging: prefixes the message with the query id and task id.
 * These macros read `pJob` and `pTask` from the scope where they are
 * expanded; `pTask` may be NULL because SCH_TASK_ID tolerates it. At least
 * one argument is required after the format string.
 */
#define SCH_TASK_ELOG(param, ...) \
  qError("QID:0x%" PRIx64 ",TID:0x%" PRIx64 " " param, pJob->queryId, SCH_TASK_ID(pTask), __VA_ARGS__)
#define SCH_TASK_DLOG(param, ...) \
  qDebug("QID:0x%" PRIx64 ",TID:0x%" PRIx64 " " param, pJob->queryId, SCH_TASK_ID(pTask), __VA_ARGS__)
#define SCH_TASK_DLOGL(param, ...) \
  qDebugL("QID:0x%" PRIx64 ",TID:0x%" PRIx64 " " param, pJob->queryId, SCH_TASK_ID(pTask), __VA_ARGS__)
#define SCH_TASK_WLOG(param, ...) \
  qWarn("QID:0x%" PRIx64 ",TID:0x%" PRIx64 " " param, pJob->queryId, SCH_TASK_ID(pTask), __VA_ARGS__)
D
dapan1121 已提交
266 267 268 269

/*
 * Error propagation helpers. Each evaluates its argument once and stores a
 * non-success code into `terrno`.
 *   SCH_ERR_RET  - returns the code from the enclosing function on failure only.
 *   SCH_RET      - always returns the code from the enclosing function.
 *   SCH_ERR_JRET - assigns the caller's `code` variable and jumps to the
 *                  caller's `_return` label on failure.
 */
#define SCH_ERR_RET(expr_)                \
  do {                                    \
    int32_t _code = expr_;                \
    if (_code != TSDB_CODE_SUCCESS) {     \
      terrno = _code;                     \
      return _code;                       \
    }                                     \
  } while (0)

#define SCH_RET(expr_)                    \
  do {                                    \
    int32_t _code = expr_;                \
    if (_code != TSDB_CODE_SUCCESS) {     \
      terrno = _code;                     \
    }                                     \
    return _code;                         \
  } while (0)

#define SCH_ERR_JRET(expr_)               \
  do {                                    \
    code = expr_;                         \
    if (code != TSDB_CODE_SUCCESS) {      \
      terrno = code;                      \
      goto _return;                       \
    }                                     \
  } while (0)
270

D
dapan1121 已提交
271 272 273
/* Acquire or release a latch: the read variant when the mode is SCH_READ, the write variant otherwise. */
#define SCH_LOCK(mode_, latch_) \
  (SCH_READ == (mode_) ? taosRLockLatch(latch_) : taosWLockLatch(latch_))
#define SCH_UNLOCK(mode_, latch_) \
  (SCH_READ == (mode_) ? taosRUnLockLatch(latch_) : taosWUnLockLatch(latch_))

274

D
dapan1121 已提交
275 276
int32_t schLaunchTask(SSchJob *job, SSchTask *task);
int32_t schBuildAndSendMsg(SSchJob *job, SSchTask *task, SQueryNodeAddr *addr, int32_t msgType);
D
dapan1121 已提交
277 278
SSchJob *schAcquireJob(int64_t refId);
int32_t schReleaseJob(int64_t refId);
D
dapan1121 已提交
279
void schFreeFlowCtrl(SSchJob *pJob);
D
dapan1121 已提交
280
int32_t schChkJobNeedFlowCtrl(SSchJob *pJob, SSchLevel *pLevel);
D
dapan1121 已提交
281 282 283 284 285
int32_t schDecTaskFlowQuota(SSchJob *pJob, SSchTask *pTask);
int32_t schCheckIncTaskFlowQuota(SSchJob *pJob, SSchTask *pTask, bool *enough);
int32_t schLaunchTasksInFlowCtrlList(SSchJob *pJob, SSchTask *pTask);
int32_t schLaunchTaskImpl(SSchJob *pJob, SSchTask *pTask);
int32_t schFetchFromRemote(SSchJob *pJob);
D
dapan1121 已提交
286
int32_t schProcessOnTaskFailure(SSchJob *pJob, SSchTask *pTask, int32_t errCode);
D
dapan1121 已提交
287
int32_t schBuildAndSendHbMsg(SQueryNodeEpId *nodeEpId);
D
dapan1121 已提交
288
int32_t schCloneSMsgSendInfo(void *src, void **dst);
D
dapan1121 已提交
289 290
int32_t schValidateAndBuildJob(SQueryPlan *pDag, SSchJob *pJob);
void schFreeJobImpl(void *job);
D
dapan1121 已提交
291 292 293 294 295 296 297 298 299
int32_t schMakeHbCallbackParam(SSchJob *pJob, SSchTask *pTask, void **pParam);
int32_t schMakeHbRpcCtx(SSchJob *pJob, SSchTask *pTask, SRpcCtx *pCtx);
int32_t schEnsureHbConnection(SSchJob *pJob, SSchTask *pTask);
int32_t schUpdateHbConnection(SQueryNodeEpId *epId, SSchTrans *trans);
int32_t schHandleHbCallback(void *param, const SDataBuf *pMsg, int32_t code);
void schFreeRpcCtx(SRpcCtx *pCtx);
int32_t schGetCallbackFp(int32_t msgType, __async_send_cb_fn_t *fp);
bool schJobNeedToStop(SSchJob *pJob, int8_t *pStatus);
int32_t schProcessOnTaskSuccess(SSchJob *pJob, SSchTask *pTask);
D
dapan1121 已提交
300
int32_t schSaveJobQueryRes(SSchJob *pJob, SQueryTableRsp *rsp);
D
dapan1121 已提交
301 302
int32_t schProcessOnExplainDone(SSchJob *pJob, SSchTask *pTask, SRetrieveTableRsp *pRsp);
void schProcessOnDataFetched(SSchJob *job);
303
int32_t schGetTaskInJob(SSchJob *pJob, uint64_t taskId, SSchTask **pTask);
D
dapan1121 已提交
304 305 306 307
int32_t schUpdateTaskExecNodeHandle(SSchTask *pTask, void *handle, int32_t rspCode);
void schFreeRpcCtxVal(const void *arg);
int32_t schMakeBrokenLinkVal(SSchJob *pJob, SSchTask *pTask, SRpcBrokenlinkVal *brokenVal, bool isHb);
int32_t schRecordTaskExecNode(SSchJob *pJob, SSchTask *pTask, SQueryNodeAddr *addr, void *handle);
D
dapan1121 已提交
308 309 310 311
int32_t schExecStaticExplainJob(void *pTrans, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql,
                             SSchResInfo *pRes, bool sync);
int32_t schExecJobImpl(void *pTrans, SArray *pNodeList, SQueryPlan *pDag, int64_t *job, const char *sql,
                              SSchResInfo *pRes, int64_t startTs, bool sync);
D
dapan1121 已提交
312 313 314 315 316
int32_t schChkUpdateJobStatus(SSchJob *pJob, int8_t newStatus);
int32_t schCancelJob(SSchJob *pJob);
int32_t schProcessOnJobDropped(SSchJob *pJob, int32_t errCode);
uint64_t schGenTaskId(void);
void schCloseJobRef(void);
317 318
int32_t schExecJob(void *pTrans, SArray *pNodeList, SQueryPlan *pDag, int64_t *pJob, const char *sql, int64_t startTs, SSchResInfo *pRes);
int32_t schAsyncExecJob(void *pTrans, SArray *pNodeList, SQueryPlan *pDag, int64_t *pJob, const char *sql, int64_t startTs, SSchResInfo *pRes);
D
dapan1121 已提交
319 320
int32_t schFetchRows(SSchJob *pJob);
int32_t schAsyncFetchRows(SSchJob *pJob);
321
int32_t schUpdateTaskHandle(SSchJob *pJob, SSchTask *pTask, int32_t msgType, void *handle, int32_t rspCode);
D
dapan1121 已提交
322

D
dapan 已提交
323

H
refact  
Hongze Cheng 已提交
324 325 326 327
#ifdef __cplusplus
}
#endif

D
dapan1121 已提交
328
#endif /*_TD_SCHEDULER_INT_H_*/