#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import threading
import multiprocessing
import sys
import copy
if sys.version_info.major == 2:
    import Queue
elif sys.version_info.major == 3:
    import queue as Queue
else:
    raise Exception("Unsupported Python version")
import os
import logging
import collections
import json

from .operator import Op, RequestOp, ResponseOp, VirtualOp
from .channel import (ThreadChannel, ProcessChannel, ChannelData,
                      ChannelDataErrcode, ChannelDataType, ChannelStopError,
                      ProductErrCode)
from .profiler import TimeProfiler, PerformanceTracer
from .util import NameGenerator, ThreadIdGenerator, PipelineProcSyncManager
from .proto import pipeline_service_pb2

_LOGGER = logging.getLogger(__name__)


class DAGExecutor(object):
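    """
    DAG executor for one gRPC worker: packs each RPC request into
    ChannelData, pushes it into the graph, and waits for the result
    produced by the receive thread.
    """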
    def __init__(self, response_op, server_conf, worker_idx):
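        """
        Parse the "dag" section of server_conf, build and start the DAG,
        and set up the request-id generator (offset by worker_idx when
        build_dag_each_worker is enabled) and the response buffers.
        """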
        build_dag_each_worker = server_conf["build_dag_each_worker"]
        server_worker_num = server_conf["worker_num"]
        dag_conf = server_conf["dag"]

        self._retry = dag_conf["retry"]
        self._server_use_profile = dag_conf["use_profile"]
        channel_size = dag_conf["channel_size"]
        self._is_thread_op = dag_conf["is_thread_op"]

        tracer_conf = dag_conf["tracer"]
        tracer_interval_s = tracer_conf["interval_s"]

        self.name = "@DAGExecutor"
        self._profiler = TimeProfiler()
        self._profiler.enable(True)

        self._tracer = None
        if tracer_interval_s >= 1:
            self._tracer = PerformanceTracer(
                self._is_thread_op, tracer_interval_s, server_worker_num)

        self._dag = DAG(self.name, response_op, self._server_use_profile,
                        self._is_thread_op, channel_size, build_dag_each_worker,
                        self._tracer)
        (in_channel, out_channel, pack_rpc_func,
         unpack_rpc_func) = self._dag.build()
        self._dag.start()

        self._set_in_channel(in_channel)
        self._set_out_channel(out_channel)
        self._pack_rpc_func = pack_rpc_func
        self._unpack_rpc_func = unpack_rpc_func

        if self._tracer is not None:
            self._tracer.start()

        # generate id: data_id == request_id == log_id
        base_counter = 0
        gen_id_step = 1
        if build_dag_each_worker:
            base_counter = worker_idx
            gen_id_step = server_worker_num
        self._id_generator = ThreadIdGenerator(
            max_id=1000000000000000000,
            base_counter=base_counter,
            step=gen_id_step)

        self._cv_pool = {}
        self._cv_for_cv_pool = threading.Condition()
        self._fetch_buffer = {}
        self._receive_func = None

        self._client_profile_key = "pipeline.profile"
        self._client_profile_value = "1"

    def start(self):
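        """Start the daemon thread that receives results from out_channel."""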
        self._receive_func = threading.Thread(
            target=DAGExecutor._receive_out_channel_func, args=(self, ))
        self._receive_func.daemon = True
        self._receive_func.start()
        _LOGGER.debug("[DAG Executor] Start receive thread")

    def stop(self):
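        """Stop the DAG and wait for its threads or processes to exit."""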
        self._dag.stop()
        self._dag.join()
        _LOGGER.info("[DAG Executor] Stop")

    def _get_next_data_id(self):
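        """
        Generate the next data id (== request id == log id) and register a
        condition variable that the receive thread notifies once the
        corresponding result is ready.
        """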
        data_id = self._id_generator.next()
        cond_v = threading.Condition()
        with self._cv_for_cv_pool:
            self._cv_pool[data_id] = cond_v
            self._fetch_buffer[data_id] = None
        return data_id, cond_v

    def _set_in_channel(self, in_channel):
        if not isinstance(in_channel, (ThreadChannel, ProcessChannel)):
            _LOGGER.critical("[DAG Executor] Failed to set in_channel: "
                             "in_channel must be Channel type, but get {}".
                             format(type(in_channel)))
            os._exit(-1)
        in_channel.add_producer(self.name)
        self._in_channel = in_channel

    def _set_out_channel(self, out_channel):
        if not isinstance(out_channel, (ThreadChannel, ProcessChannel)):
            _LOGGER.critical("[DAG Executor] Failed to set out_channel: "
                             "must be Channel type, but get {}".format(
                                 type(out_channel)))
            os._exit(-1)
        out_channel.add_consumer(self.name)
        self._out_channel = out_channel

    def _receive_out_channel_func(self):
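        """
        Receive-thread loop: fetch each result from out_channel, store it
        in _fetch_buffer keyed by its data id, and notify the waiting
        caller. On ChannelStopError, wake all waiting callers with a
        CLOSED_ERROR ChannelData.
        """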
        cv = None
        while True:
            try:
                channeldata_dict = self._out_channel.front(self.name)
            except ChannelStopError:
                _LOGGER.info("[DAG Executor] Stop.")
                with self._cv_for_cv_pool:
                    for data_id, cv in self._cv_pool.items():
                        closed_error_data = ChannelData(
                            error_code=ChannelDataErrcode.CLOSED_ERROR.value,
                            error_info="dag closed.",
                            data_id=data_id)
                        with cv:
                            self._fetch_buffer[data_id] = closed_error_data
                            cv.notify_all()
                break

            if len(channeldata_dict) != 1:
                _LOGGER.critical(
                    "[DAG Executor] Failed to fetch result: out_channel "
                    "cannot have multiple input ops")
                os._exit(-1)
            (_, channeldata), = channeldata_dict.items()
            if not isinstance(channeldata, ChannelData):
                _LOGGER.critical(
                    "[DAG Executor] Failed to fetch result: data in out_channel"
                    " must be ChannelData type, but got {}"
                    .format(type(channeldata)))
                os._exit(-1)

            data_id = channeldata.id
            _LOGGER.debug("(logid={}) [receive thread] Fetched data".format(
                data_id))
            with self._cv_for_cv_pool:
                cond_v = self._cv_pool[data_id]
            with cond_v:
                self._fetch_buffer[data_id] = channeldata
                cond_v.notify_all()

    def _get_channeldata_from_fetch_buffer(self, data_id, cond_v):
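        """
        Block on cond_v until the receive thread has put the result for
        data_id into _fetch_buffer, then pop and return it.
        """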
        ready_data = None

        with cond_v:
            with self._cv_for_cv_pool:
                if self._fetch_buffer[data_id] is not None:
                    # The requested data is already ready
                    ready_data = self._fetch_buffer[data_id]
                    self._cv_pool.pop(data_id)
                    self._fetch_buffer.pop(data_id)
            if ready_data is None:
                # Wait for data ready
                cond_v.wait()
                with self._cv_for_cv_pool:
                    ready_data = self._fetch_buffer[data_id]
                    self._cv_pool.pop(data_id)
                    self._fetch_buffer.pop(data_id)
        _LOGGER.debug("(logid={}) [resp thread] Got data".format(data_id))
194
        return ready_data

    def _pack_channeldata(self, rpc_request, data_id):
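        """
        Unpack rpc_request with the user-defined unpack function and wrap
        the result into ChannelData; parse failures and product error
        codes are converted into error ChannelData.
        """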
        dictdata = None
        log_id = None
        try:
            dictdata, log_id, prod_errcode, prod_errinfo = self._unpack_rpc_func(
                rpc_request)
        except Exception as e:
            _LOGGER.error(
                "(logid={}) Failed to parse RPC request package: {}"
                .format(data_id, e),
                exc_info=True)
            return ChannelData(
                error_code=ChannelDataErrcode.RPC_PACKAGE_ERROR.value,
                error_info="rpc package error: {}".format(e),
                data_id=data_id,
                log_id=log_id)
        else:
            # because unpack_rpc_func can be overridden by the user, we need
            # to look for product_errcode in its return values and for the
            # client_profile_key field in rpc_request
            if prod_errcode is not None:
                # a product error occurred
                return ChannelData(
                    error_code=ChannelDataErrcode.PRODUCT_ERROR.value,
                    error_info="",
                    prod_error_code=prod_errcode,
                    prod_error_info=prod_errinfo,
                    data_id=data_id,
                    log_id=log_id)

            profile_value = dictdata.get(self._client_profile_key)
            client_need_profile = (profile_value == self._client_profile_value)
            _LOGGER.debug("(logid={}) Need profile in client: {}".format(
                data_id, client_need_profile))
            return ChannelData(
                datatype=ChannelDataType.DICT.value,
                dictdata=dictdata,
                data_id=data_id,
                log_id=log_id,
                client_need_profile=client_need_profile)

    def call(self, rpc_request):
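        """
        Process one RPC request end to end: pack it into ChannelData, push
        it into the input channel, wait for the result (retrying on
        timeout up to self._retry times), and pack the result into an RPC
        response, attaching profile data if the client requested it.
        """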
        if self._tracer is not None:
            trace_buffer = self._tracer.data_buffer()

        data_id, cond_v = self._get_next_data_id()
        _LOGGER.info("(logid={}) Succ generate id".format(data_id))

        start_call, end_call = None, None
        if not self._is_thread_op:
            start_call = self._profiler.record("call_{}#DAG-{}_0".format(
                data_id, data_id))
        else:
            start_call = self._profiler.record("call_{}#DAG_0".format(data_id))

        _LOGGER.debug("(logid={}) Parsing RPC request package".format(data_id))
        self._profiler.record("prepack_{}#{}_0".format(data_id, self.name))
        req_channeldata = self._pack_channeldata(rpc_request, data_id)
        self._profiler.record("prepack_{}#{}_1".format(data_id, self.name))

        resp_channeldata = None
        for i in range(self._retry):
            _LOGGER.debug("(logid={}) Pushing data into Graph engine".format(
                data_id))
            try:
                self._in_channel.push(req_channeldata, self.name)
            except ChannelStopError:
                _LOGGER.debug("[DAG Executor] Stop")
                with self._cv_for_cv_pool:
                    self._cv_pool.pop(data_id)
                return self._pack_for_rpc_resp(
                    ChannelData(
                        error_code=ChannelDataErrcode.CLOSED_ERROR.value,
                        error_info="dag closed.",
                        data_id=data_id))

            _LOGGER.debug("(logid={}) Wait for Graph engine...".format(data_id))
            resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id,
                                                                       cond_v)

            if resp_channeldata.error_code == ChannelDataErrcode.OK.value:
                _LOGGER.info("(logid={}) Succ predict".format(data_id))
                break
            else:
                _LOGGER.error("(logid={}) Failed to predict: {}"
                              .format(data_id, resp_channeldata.error_info))
                if resp_channeldata.error_code != ChannelDataErrcode.TIMEOUT.value:
                    break

            if i + 1 < self._retry:
                _LOGGER.warning("(logid={}) DAGExecutor retry({}/{})".format(
                    data_id, i + 1, self._retry))

        _LOGGER.debug("(logid={}) Packing RPC response package".format(data_id))
        self._profiler.record("postpack_{}#{}_0".format(data_id, self.name))
        rpc_resp = self._pack_for_rpc_resp(resp_channeldata)
        self._profiler.record("postpack_{}#{}_1".format(data_id, self.name))
        if not self._is_thread_op:
            end_call = self._profiler.record("call_{}#DAG-{}_1".format(data_id,
                                                                       data_id))
        else:
            end_call = self._profiler.record("call_{}#DAG_1".format(data_id))

        if self._tracer is not None:
            trace_buffer.put({
                "name": "DAG",
                "id": data_id,
                "succ":
                resp_channeldata.error_code == ChannelDataErrcode.OK.value,
                "actions": {
                    "call_{}".format(data_id): end_call - start_call,
                },
            })

        profile_str = self._profiler.gen_profile_str()
        if self._server_use_profile:
            sys.stderr.write(profile_str)

        # add profile info into rpc_resp
        if resp_channeldata.client_need_profile:
            profile_set = resp_channeldata.profile_data_set
            profile_set.add(profile_str)
            profile_value = "".join(list(profile_set))
            rpc_resp.key.append(self._client_profile_key)
            rpc_resp.value.append(profile_value)

        return rpc_resp

    def _pack_for_rpc_resp(self, channeldata):
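        """
        Pack ChannelData into an RPC response; on failure, return an error
        response carrying RPC_PACKAGE_ERROR.
        """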
        try:
            return self._pack_rpc_func(channeldata)
        except Exception as e:
            _LOGGER.error(
                "(logid={}) Failed to pack RPC response package: {}"
                .format(channeldata.id, e),
                exc_info=True)
            resp = pipeline_service_pb2.Response()
            resp.err_no = ChannelDataErrcode.RPC_PACKAGE_ERROR.value
            resp.err_msg = "rpc package error: {}".format(e)
            resp.result = ""
            return resp


class DAG(object):
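    """
    The graph itself: topologically sorts the user-defined Ops, inserts
    virtual Ops and channels between views, and manages the lifecycle of
    the resulting threads or processes.
    """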
    def __init__(self, request_name, response_op, use_profile, is_thread_op,
                 channel_size, build_dag_each_worker, tracer):
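        """
        Store the build options; in process mode a PipelineProcSyncManager
        is created for the process-safe channels.
        """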
        self._request_name = request_name
        self._response_op = response_op
        self._use_profile = use_profile
        self._is_thread_op = is_thread_op
        self._channel_size = channel_size
        self._build_dag_each_worker = build_dag_each_worker
        self._tracer = tracer
        if not self._is_thread_op:
            self._manager = PipelineProcSyncManager()
        _LOGGER.info("[DAG] Succ init")

    @staticmethod
    def get_use_ops(response_op):
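        """
        Walk backwards from response_op to collect every Op used in the
        DAG and, for each Op name, the list of its successor Ops
        (response_op itself is excluded). Op names must be globally
        unique.
        """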
        unique_names = set()
        used_ops = set()
        succ_ops_of_use_op = {}  # {op_name: succ_ops}
        que = Queue.Queue()
        que.put(response_op)
        while que.qsize() != 0:
            op = que.get()
            for pred_op in op.get_input_ops():
                if pred_op.name not in succ_ops_of_use_op:
                    succ_ops_of_use_op[pred_op.name] = []
                if op != response_op:
                    succ_ops_of_use_op[pred_op.name].append(op)
                if pred_op not in used_ops:
                    que.put(pred_op)
                    used_ops.add(pred_op)
                    # check that the op name is globally unique
                    if pred_op.name in unique_names:
                        _LOGGER.critical("Failed to get used Ops: the"
                                         " name of Op must be unique: {}".
                                         format(pred_op.name))
                        os._exit(-1)
                    unique_names.add(pred_op.name)
        return used_ops, succ_ops_of_use_op

    def _gen_channel(self, name_gen):
        channel = None
        if self._is_thread_op:
            channel = ThreadChannel(
                name=name_gen.next(), maxsize=self._channel_size)
        else:
            channel = ProcessChannel(
                self._manager, name=name_gen.next(), maxsize=self._channel_size)
        _LOGGER.debug("[DAG] Generate channel: {}".format(channel.name))
        return channel

    def _gen_virtual_op(self, name_gen):
        vir_op = VirtualOp(name=name_gen.next())
        _LOGGER.debug("[DAG] Generate virtual_op: {}".format(vir_op.name))
        return vir_op

    def _topo_sort(self, used_ops, response_op, out_degree_ops):
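        """
        Topologically sort the used Ops into views (levels), starting from
        the single Op in front of response_op and peeling off Ops whose
        out-degree drops to zero. The returned dag_views run from the
        response side back to the request side; _build_dag reverses them.
        """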
        out_degree_num = {
            name: len(ops)
            for name, ops in out_degree_ops.items()
        }
        que_idx = 0  # index of the active queue; the two queues alternate per level
        ques = [Queue.Queue() for _ in range(2)]
        zero_indegree_num = 0
        for op in used_ops:
            if len(op.get_input_ops()) == 0:
                zero_indegree_num += 1
        if zero_indegree_num != 1:
            _LOGGER.critical("Failed to topo sort: DAG contains "
                             "multiple RequestOps")
            os._exit(-1)
        last_op = response_op.get_input_ops()[0]
        ques[que_idx].put(last_op)

        # topo sort to get dag_views
        dag_views = []
        sorted_op_num = 0
        while True:
            que = ques[que_idx]
            next_que = ques[(que_idx + 1) % 2]
            dag_view = []
            while que.qsize() != 0:
                op = que.get()
                dag_view.append(op)
                sorted_op_num += 1
                for pred_op in op.get_input_ops():
                    out_degree_num[pred_op.name] -= 1
                    if out_degree_num[pred_op.name] == 0:
                        next_que.put(pred_op)
            dag_views.append(dag_view)
            if next_que.qsize() == 0:
                break
            que_idx = (que_idx + 1) % 2
        if sorted_op_num < len(used_ops):
            _LOGGER.critical("Failed to topo sort: graph is not a legal DAG")
            os._exit(-1)

        return dag_views, last_op

    def _build_dag(self, response_op):
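        """
        Build the executable graph: collect the used Ops, topo-sort them
        into views, insert virtual Ops for edges that skip a view, create
        the channels connecting the views, and take the pack/unpack
        functions from the ResponseOp and RequestOp.
        """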
        if response_op is None:
            _LOGGER.critical("Failed to build DAG: ResponseOp"
                             " has not been set.")
            os._exit(-1)
        used_ops, out_degree_ops = DAG.get_use_ops(response_op)
        if not self._build_dag_each_worker:
            _LOGGER.info("================= USED OP =================")
            for op in used_ops:
                if not isinstance(op, RequestOp):
                    _LOGGER.info(op.name)
            _LOGGER.info("-------------------------------------------")
        if len(used_ops) <= 1:
            _LOGGER.critical(
                "Failed to build DAG: besides RequestOp and ResponseOp, "
                "there should be at least one Op in DAG.")
            os._exit(-1)
        if self._build_dag_each_worker:
            _LOGGER.info("Because `build_dag_each_worker` mode is used, "
                         "Auto-batching is set to the default config: "
                         "batch_size=1, auto_batching_timeout=None")
            for op in used_ops:
                op.use_default_auto_batching_config()

        dag_views, last_op = self._topo_sort(used_ops, response_op,
                                             out_degree_ops)
        dag_views = list(reversed(dag_views))
        if not self._build_dag_each_worker:
            _LOGGER.debug("================== DAG ====================")
            for idx, view in enumerate(dag_views):
                _LOGGER.debug("(VIEW {})".format(idx))
                for op in view:
                    _LOGGER.debug("  [{}]".format(op.name))
                    for out_op in out_degree_ops[op.name]:
                        _LOGGER.debug("    - {}".format(out_op.name))
            _LOGGER.debug("-------------------------------------------")

        # create channels and virtual ops
        virtual_op_name_gen = NameGenerator("vir")
        channel_name_gen = NameGenerator("chl")
        virtual_ops = []
        channels = []
        input_channel = None
        actual_view = None
        for v_idx, view in enumerate(dag_views):
            if v_idx + 1 >= len(dag_views):
                break
            next_view = dag_views[v_idx + 1]
            if actual_view is None:
                actual_view = view
            actual_next_view = []
            pred_op_of_next_view_op = {}
            for op in actual_view:
                # find actual succ op in next view and create virtual op
                for succ_op in out_degree_ops[op.name]:
                    if succ_op in next_view:
                        if succ_op not in actual_next_view:
                            actual_next_view.append(succ_op)
                        if succ_op.name not in pred_op_of_next_view_op:
                            pred_op_of_next_view_op[succ_op.name] = []
                        pred_op_of_next_view_op[succ_op.name].append(op)
                    else:
                        # create virtual op
                        virtual_op = self._gen_virtual_op(virtual_op_name_gen)
                        virtual_ops.append(virtual_op)
                        out_degree_ops[virtual_op.name] = [succ_op]
                        actual_next_view.append(virtual_op)
                        pred_op_of_next_view_op[virtual_op.name] = [op]
                        virtual_op.add_virtual_pred_op(op)
            actual_view = actual_next_view
            # create channel
            processed_op = set()
            for o_idx, op in enumerate(actual_next_view):
                if op.name in processed_op:
                    continue
                channel = self._gen_channel(channel_name_gen)
                channels.append(channel)
                op.add_input_channel(channel)
                pred_ops = pred_op_of_next_view_op[op.name]
                if v_idx == 0:
                    input_channel = channel
                else:
                    # if a pred_op is a virtual op, its ancestor ops are added
                    # as the producers of this channel
                    for pred_op in pred_ops:
                        pred_op.add_output_channel(channel)
                processed_op.add(op.name)
                # ops with the same predecessors can share this channel
                for other_op in actual_next_view[o_idx + 1:]:
                    if other_op.name in processed_op:
                        continue
                    other_pred_ops = pred_op_of_next_view_op[other_op.name]
                    if len(other_pred_ops) != len(pred_ops):
                        continue
                    same_flag = True
                    for pred_op in pred_ops:
                        if pred_op not in other_pred_ops:
                            same_flag = False
                            break
                    if same_flag:
                        other_op.add_input_channel(channel)
                        processed_op.add(other_op.name)
        output_channel = self._gen_channel(channel_name_gen)
        channels.append(output_channel)
        last_op.add_output_channel(output_channel)

        pack_func, unpack_func = None, None
        pack_func = response_op.pack_response_package

        actual_ops = virtual_ops
        for op in used_ops:
            if len(op.get_input_ops()) == 0:
                unpack_func = op.unpack_request_package
                continue
            actual_ops.append(op)

        for c in channels:
            _LOGGER.debug("Channel({}):\n\t- producers: {}\n\t- consumers: {}"
                          .format(c.name, c.get_producers(), c.get_consumers()))

        return (actual_ops, channels, input_channel, output_channel, pack_func,
                unpack_func)

    def get_channels(self):
        return self._channels

    def build(self):
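        """Build the DAG and return its input/output channels and pack/unpack functions."""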
        (actual_ops, channels, input_channel, output_channel, pack_func,
         unpack_func) = self._build_dag(self._response_op)
        _LOGGER.info("[DAG] Succ build DAG")

        self._actual_ops = actual_ops
        self._channels = channels
        self._input_channel = input_channel
        self._output_channel = output_channel
        self._pack_func = pack_func
        self._unpack_func = unpack_func

        if self._tracer is not None:
            self._tracer.set_channels(self._channels)

        return self._input_channel, self._output_channel, self._pack_func, self._unpack_func

    def start(self):
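        """Start every actual Op as threads or processes; they are returned unjoined."""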
        self._threads_or_processes = []
        for op in self._actual_ops:
            op.use_profiler(self._use_profile)
            op.set_tracer(self._tracer)
            if self._is_thread_op:
                self._threads_or_processes.extend(op.start_with_thread())
            else:
                self._threads_or_processes.extend(op.start_with_process())
        _LOGGER.info("[DAG] start")

        # started but not joined yet; the caller joins via join()
        return self._threads_or_processes

    def join(self):
        for x in self._threads_or_processes:
            if x is not None:
                x.join()

    def stop(self):
        for chl in self._channels:
            chl.stop()
        for op in self._actual_ops:
            op.clean_input_channel()
            op.clean_output_channels()