#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
import threading
import multiprocessing
import sys
import copy
if sys.version_info.major == 2:
    import Queue
elif sys.version_info.major == 3:
    import queue as Queue
else:
    raise Exception("Unsupported Python version")
import os
import logging
import collections
import json
from .error_catch import ErrorCatch, CustomException, CustomExceptionCode, ParamChecker, ParamVerify
from .operator import Op, RequestOp, ResponseOp, VirtualOp
from .channel import (ThreadChannel, ProcessChannel, ChannelData,
                      ChannelDataType, ChannelStopError)
from .error_catch import ProductErrCode
from .error_catch import CustomExceptionCode as ChannelDataErrcode
from .profiler import TimeProfiler, PerformanceTracer
from .util import NameGenerator, ThreadIdGenerator, PipelineProcSyncManager
from .proto import pipeline_service_pb2

_LOGGER = logging.getLogger(__name__)


class DAGExecutor(object):
    """
    DAG Executor, the service entrance of DAG.
    """
    def __init__(self, response_op, server_conf, worker_idx):
        """
        Initialize DAGExecutor.

        Args:
            response_op: Response OP
            server_conf: server config dict, parsed from config.yaml
            worker_idx: DAGExecutor index; PipelineServer creates multiple
                DAGExecutors when _build_dag_each_worker is True.

        Returns:
            None.
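        Examples:
            A minimal, illustrative server_conf (the keys match the lookups
            below; the concrete values are assumptions, not shipped defaults):

                server_conf = {
                    "build_dag_each_worker": False,
                    "worker_num": 1,
                    "dag": {
                        "retry": 1,
                        "use_profile": False,
                        "channel_size": 0,
                        "channel_recv_frist_arrive": False,
                        "is_thread_op": True,
                        "tracer": {"interval_s": -1},  # < 1 disables tracer
                        # optional: "enable_prometheus", "prometheus_port"
                    },
                }
                executor = DAGExecutor(response_op, server_conf, worker_idx=0)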
        """
        build_dag_each_worker = server_conf["build_dag_each_worker"]
        server_worker_num = server_conf["worker_num"]
        dag_conf = server_conf["dag"]

        self._retry = dag_conf["retry"]
        self._server_use_profile = dag_conf["use_profile"]
        self._enable_prometheus = False
        if "enable_prometheus" in dag_conf:
            self._enable_prometheus = dag_conf["enable_prometheus"]
        if "prometheus_port" in dag_conf and self._enable_prometheus:
            self._prometheus_port = dag_conf["prometheus_port"]
        else:
            self._prometheus_port = None
        channel_size = dag_conf["channel_size"]
        channel_recv_frist_arrive = dag_conf["channel_recv_frist_arrive"]
        self._is_thread_op = dag_conf["is_thread_op"]

        tracer_conf = dag_conf["tracer"]
        tracer_interval_s = tracer_conf["interval_s"]

        self.name = "@DAGExecutor"
        self._profiler = TimeProfiler()
        self._profiler.enable(True)

        self._tracer = None
        if tracer_interval_s >= 1:
            self._tracer = PerformanceTracer(
                self._is_thread_op, tracer_interval_s, server_worker_num)
            if self._enable_prometheus:
                self._tracer.set_enable_dict(True)

        self._dag = DAG(self.name, response_op, self._server_use_profile, self._prometheus_port,
                        self._is_thread_op, channel_size, build_dag_each_worker,
                        self._tracer, channel_recv_frist_arrive)
        (in_channel, out_channel, pack_rpc_func,
         unpack_rpc_func) = self._dag.build()
        self._dag.start()

        self._set_in_channel(in_channel)
        self._set_out_channel(out_channel)
        self._pack_rpc_func = pack_rpc_func
        self._unpack_rpc_func = unpack_rpc_func

        if self._tracer is not None:
            self._tracer.start()

        # generate id
        # data_id: server-unique ID, automatically generated by the framework
        # log_id: traces one product request; may be empty and is not unique
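        # Striding example (illustrative; assumes ThreadIdGenerator yields
        # base_counter, base_counter + step, ...): with worker_num=4 and
        # worker_idx=1, this worker generates ids 1, 5, 9, ..., so ids from
        # different workers never collide.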
        base_counter = 0
        gen_id_step = 1
        if build_dag_each_worker:
            base_counter = worker_idx
            gen_id_step = server_worker_num
        self._id_generator = ThreadIdGenerator(
            max_id=1000000000000000000,
            base_counter=base_counter,
            step=gen_id_step)

        self._cv_pool = {}
        self._cv_for_cv_pool = threading.Condition()
        self._fetch_buffer = {}
        self._recive_func = None

        self._client_profile_key = "pipeline.profile"
        self._client_profile_value = "1"

    @ErrorCatch
    def start(self):
        """
        Starting one background thread to receive data from the output channel.

        Args:
            None

        Returns:
            None
        """
        self._recive_func = threading.Thread(
            target=DAGExecutor._recive_out_channel_func, args=(self, ))
        self._recive_func.daemon = True
        self._recive_func.start()
        _LOGGER.debug("[DAG Executor] Start receive thread")

    def stop(self):
        """
        Stopping DAG

        Args:
            None

        Returns:
            None
        """
        self._dag.stop()
        self._dag.join()
        _LOGGER.info("[DAG Executor] Stop")

    def _get_next_data_id(self):
        """
        Generating data_id incrementally and uniquely.
   
        Args:
            None

        Returns:
            data_id: unique id
            cond_v: condition variable
        """
        data_id = self._id_generator.next()
        cond_v = threading.Condition()
        with self._cv_for_cv_pool:
            self._cv_pool[data_id] = cond_v
            self._fetch_buffer[data_id] = None
        return data_id, cond_v

    def _set_in_channel(self, in_channel):
        """
        Set in_channel of DAG

        Args:
            in_channel: input channel of DAG

        Returns:
            None 
        """
        if not isinstance(in_channel, (ThreadChannel, ProcessChannel)):
            _LOGGER.critical("[DAG Executor] Failed to set in_channel: "
                             "in_channel must be Channel type, but get {}".
                             format(type(in_channel)))
            os._exit(-1)

        self._in_channel = in_channel
        _LOGGER.info("[DAG] set in channel succ, name [{}]".format(self.name))

    def _set_out_channel(self, out_channel):
        """
        Set out_channel of DAG

        Args:
            out_channel: output channel of DAG

        Returns:
            None 
        """
        if not isinstance(out_channel, (ThreadChannel, ProcessChannel)):
            _LOGGER.critical("[DAG Executor] Failed to set out_channel: "
                             "must be Channel type, but get {}".format(
                                 type(out_channel)))
            os._exit(-1)
        out_channel.add_consumer(self.name)
        self._out_channel = out_channel

    def _recive_out_channel_func(self):
        """
        Receiving data from the output channel and pushing it into
        _fetch_buffer; _get_channeldata_from_fetch_buffer then retrieves
        the data once it is ready.

        Args:
            None

        Returns:
            None
        """
        cv = None
        while True:
            try:
                channeldata_dict = self._out_channel.front(self.name)
            except ChannelStopError:
                _LOGGER.info("[DAG Executor] Stop.")
                with self._cv_for_cv_pool:
                    for data_id, cv in self._cv_pool.items():
                        closed_error_data = ChannelData(
                            error_code=ChannelDataErrcode.CLOSED_ERROR.value,
                            error_info="dag closed.",
                            data_id=data_id)
                        with cv:
                            self._fetch_buffer[data_id] = closed_error_data
                            cv.notify_all()
                break
            if len(channeldata_dict) != 1:
                _LOGGER.critical(
                    "[DAG Executor] Failed to fetch result: out_channel "
                    "cannot have multiple input ops")
                os._exit(-1)
            (_, channeldata), = channeldata_dict.items()
            if not isinstance(channeldata, ChannelData):
                _LOGGER.critical(
                    "[DAG Executor] Failed to fetch result: data in out_channel "
                    "must be ChannelData type, but got {}"
                    .format(type(channeldata)))
                os._exit(-1)

            data_id = channeldata.id
            _LOGGER.debug("(data_id={}) [receive thread] Fetched data".format(
                data_id))
            with self._cv_for_cv_pool:
                cond_v = self._cv_pool[data_id]
            with cond_v:
                self._fetch_buffer[data_id] = channeldata
                cond_v.notify_all()

    def _get_channeldata_from_fetch_buffer(self, data_id, cond_v):
        """
        Getting the channel data from _fetch_buffer.

        Args:
            data_id: search key
            cond_v: condition variable

        Returns:
            ready_data: one channel data processed
        """
        ready_data = None

        with cond_v:
            with self._cv_for_cv_pool:
                if self._fetch_buffer[data_id] is not None:
                    # The requested data is already ready
                    ready_data = self._fetch_buffer[data_id]
                    self._cv_pool.pop(data_id)
                    self._fetch_buffer.pop(data_id)
            if ready_data is None:
                # Wait for data ready
                cond_v.wait()
                with self._cv_for_cv_pool:
                    ready_data = self._fetch_buffer[data_id]
                    self._cv_pool.pop(data_id)
                    self._fetch_buffer.pop(data_id)
        _LOGGER.debug("(data_id={}) [resp thread] Got data".format(data_id))
        return ready_data

    def _pack_channeldata(self, rpc_request, data_id):
        """
        Unpacking data from the RPC request and creating one ChannelData.

        Args:
           rpc_request: one RPC request
           data_id: data id, unique

        Returns:
            ChannelData: one channel data to be processed
        """
        dictdata = None
        log_id = None
        try:
            dictdata, log_id, prod_errcode, prod_errinfo = self._unpack_rpc_func(
                rpc_request)
        except Exception as e:
            _LOGGER.error(
                "(logid={}) Failed to parse RPC request package: {}"
                .format(data_id, e),
                exc_info=True)
            return ChannelData(
                error_code=ChannelDataErrcode.RPC_PACKAGE_ERROR.value,
                error_info="rpc package error: {}".format(e),
                data_id=data_id,
                log_id=log_id)
        else:
            # because unpack_rpc_func may be overridden by the user, we need to
            # look for product_errcode in its returns, and for the
            # client_profile_key field in rpc_request
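            # Illustrative shape of the user override (a hypothetical
            # RequestOp subclass; field names are assumptions):
            #
            #   class MyRequestOp(RequestOp):
            #       def unpack_request_package(self, request):
            #           dictdata = {"key": list(request.key)}
            #           log_id = request.logid
            #           return dictdata, log_id, None, None  # no product error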
            if prod_errcode is not None:
                # product errors occurred
324 325
                _LOGGER.error("unpack_rpc_func prod_errcode:{}".format(
                    prod_errcode))
                return ChannelData(
                    error_code=ChannelDataErrcode.PRODUCT_ERROR.value,
                    error_info="",
                    prod_error_code=prod_errcode,
                    prod_error_info=prod_errinfo,
                    data_id=data_id,
                    log_id=log_id)

            profile_value = None
            profile_value = dictdata.get(self._client_profile_key)
            client_need_profile = (profile_value == self._client_profile_value)
            return ChannelData(
                datatype=ChannelDataType.DICT.value,
                dictdata=dictdata,
                data_id=data_id,
                log_id=log_id,
                client_need_profile=client_need_profile)

    def call(self, rpc_request):
        """
        DAGExecutor entrance function. There are 5 steps:
        1._get_next_data_id: Generate an incremental ID
        2._pack_channeldata: pack the channel data from request.
        3.retry loop: 
            a. push channel_data into _in_channel
            b. _get_channeldata_from_fetch_buffer: get results.
        4._pack_for_rpc_resp: pack RPC responses
        5.profile: generate profile string and pack into response.

        Args:
            rpc_request: one RPC request
   
        Returns:
            rpc_resp: one RPC response
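        Examples:
            A hedged sketch of the service hook that drives one call (the
            servicer below is illustrative, not this module's API):

                class PipelineServicer(object):
                    def __init__(self, dag_executor):
                        # dag_executor: a started DAGExecutor
                        self._dag_executor = dag_executor

                    def inference(self, request, context):
                        # request: pipeline_service_pb2.Request
                        return self._dag_executor.call(request)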
        """
        if self._tracer is not None:
            trace_buffer = self._tracer.data_buffer()

        data_id, cond_v = self._get_next_data_id()

        start_call, end_call = None, None
        if not self._is_thread_op:
            start_call = self._profiler.record("call_{}#DAG-{}_0".format(
                data_id, data_id))
        else:
            start_call = self._profiler.record("call_{}#DAG_0".format(data_id))

        self._profiler.record("prepack_{}#{}_0".format(data_id, self.name))
        req_channeldata = self._pack_channeldata(rpc_request, data_id)
        self._profiler.record("prepack_{}#{}_1".format(data_id, self.name))

        log_id = req_channeldata.log_id
        _LOGGER.info("(data_id={} log_id={}) Succ Generate ID ".format(data_id,
                                                                       log_id))

        resp_channeldata = None
        for i in range(self._retry):
            _LOGGER.debug("(data_id={}) Pushing data into Graph engine".format(
                data_id))
            try:
                if req_channeldata is None:
                    _LOGGER.critical(
                        "(data_id={} log_id={}) req_channeldata is None"
                        .format(data_id, log_id))
                if not isinstance(self._in_channel,
                                  (ThreadChannel, ProcessChannel)):
                    _LOGGER.critical(
                        "(data_id={} log_id={})[DAG Executor] Failed to "
                        "set in_channel: in_channel must be Channel type, but get {}".
                        format(data_id, log_id, type(self._in_channel)))
                self._in_channel.push(req_channeldata, self.name)
            except ChannelStopError:
                _LOGGER.error("(data_id:{} log_id={})[DAG Executor] Stop".
                              format(data_id, log_id))
                with self._cv_for_cv_pool:
                    self._cv_pool.pop(data_id)
                return self._pack_for_rpc_resp(
                    ChannelData(
                        error_code=ChannelDataErrcode.CLOSED_ERROR.value,
                        error_info="dag closed.",
                        data_id=data_id))

            _LOGGER.debug("(data_id={} log_id={}) Wait for Graph engine...".
                          format(data_id, log_id))
            resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id,
                                                                       cond_v)

            if resp_channeldata.error_code == ChannelDataErrcode.OK.value:
                _LOGGER.info("(data_id={} log_id={}) Succ predict".format(
                    data_id, log_id))
                break
            else:
                _LOGGER.error("(data_id={} log_id={}) Failed to predict: {}"
                              .format(data_id, log_id,
                                      resp_channeldata.error_info))
                if resp_channeldata.error_code != ChannelDataErrcode.TIMEOUT.value:
                    break

            if i + 1 < self._retry:
                _LOGGER.warning(
                    "(data_id={} log_id={}) DAGExecutor retry({}/{})"
                    .format(data_id, log_id, i + 1, self._retry))

        _LOGGER.debug("(data_id={} log_id={}) Packing RPC response package"
                      .format(data_id, log_id))
        self._profiler.record("postpack_{}#{}_0".format(data_id, self.name))
        rpc_resp = self._pack_for_rpc_resp(resp_channeldata)
        self._profiler.record("postpack_{}#{}_1".format(data_id, self.name))
        if not self._is_thread_op:
            end_call = self._profiler.record("call_{}#DAG-{}_1".format(data_id,
                                                                       data_id))
        else:
            end_call = self._profiler.record("call_{}#DAG_1".format(data_id))

        if self._tracer is not None:
            trace_buffer.put({
                "name": "DAG",
                "id": data_id,
                "succ":
                resp_channeldata.error_code == ChannelDataErrcode.OK.value,
                "actions": {
                    "call_{}".format(data_id): end_call - start_call,
                },
            })

        profile_str = self._profiler.gen_profile_str()
        if self._server_use_profile:
            sys.stderr.write(profile_str)

        # add profile info into rpc_resp
        if resp_channeldata.client_need_profile:
            profile_set = resp_channeldata.profile_data_set
            profile_set.add(profile_str)
            profile_value = "".join(list(profile_set))
            rpc_resp.key.append(self._client_profile_key)
            rpc_resp.value.append(profile_value)

        return rpc_resp

    def _pack_for_rpc_resp(self, channeldata):
        """
        Packing one RPC response

        Args:
            channeldata: one channel data to be packed

        Returns:
            resp: one RPC response
        """
        try:
            return self._pack_rpc_func(channeldata)
        except Exception as e:
            _LOGGER.error(
                "(logid={}) Failed to pack RPC response package: {}"
                .format(channeldata.id, e),
                exc_info=True)
            resp = pipeline_service_pb2.Response()
            resp.err_no = ChannelDataErrcode.RPC_PACKAGE_ERROR.value
            resp.err_msg = "rpc package error: {}".format(e)
            return resp


class DAG(object):
    """
    Directed Acyclic Graph(DAG) engine, builds one DAG topology.
    """
    def __init__(self, request_name, response_op, use_profile, prometheus_port, is_thread_op,
                 channel_size, build_dag_each_worker, tracer,
                 channel_recv_frist_arrive):
        _LOGGER.info("{}, {}, {}, {}, {}, {}, {}, {}, {}".format(
            request_name, response_op, use_profile, prometheus_port,
            is_thread_op, channel_size, build_dag_each_worker, tracer,
            channel_recv_frist_arrive))
        @ErrorCatch
        @ParamChecker
        def init_helper(self, request_name: str,
                         response_op, 
                         use_profile: [bool, None], 
                         prometheus_port: [int, None],
                         is_thread_op: bool,
                         channel_size, 
                         build_dag_each_worker: [bool, None],
                         tracer,
                         channel_recv_frist_arrive):
            self._request_name = request_name
            self._response_op = response_op
            self._use_profile = use_profile
            self._prometheus_port = prometheus_port
            self._use_prometheus = (self._prometheus_port is not None)
            self._is_thread_op = is_thread_op
            self._channel_size = channel_size
            self._build_dag_each_worker = build_dag_each_worker
            self._tracer = tracer
            self._channel_recv_frist_arrive = channel_recv_frist_arrive
            if not self._is_thread_op:
                self._manager = PipelineProcSyncManager()
        init_helper(self, request_name, response_op, use_profile, prometheus_port, is_thread_op,
                    channel_size, build_dag_each_worker, tracer,
                    channel_recv_frist_arrive)
        print("[DAG] Succ init")
        _LOGGER.info("[DAG] Succ init")

    @staticmethod
    def get_use_ops(response_op):
        """
        Starting from ResponseOp, traverse the upstream OPs, collecting
        all used ops and the successor-op list of each op (excluding ResponseOp)

        Args:
            response_op: ResponseOp

        Returns:
            used_ops: used ops, set
            succ_ops_of_use_op: dict mapping op name to its successor op list.
            
        """
        unique_names = set()
        used_ops = set()
        succ_ops_of_use_op = {}  # {op_name: succ_ops}
        que = Queue.Queue()
        que.put(response_op)
        while que.qsize() != 0:
            op = que.get()
            for pred_op in op.get_input_ops():
                if pred_op.name not in succ_ops_of_use_op:
                    succ_ops_of_use_op[pred_op.name] = []
                if op != response_op:
                    succ_ops_of_use_op[pred_op.name].append(op)
                if pred_op not in used_ops:
                    que.put(pred_op)
                    used_ops.add(pred_op)
                    # check the name of op is globally unique
                    if pred_op.name in unique_names:
                        _LOGGER.critical("Failed to get used Ops: the"
                                         " name of Op must be unique: {}".
                                         format(pred_op.name))
                        os._exit(-1)
                    unique_names.add(pred_op.name)
        return used_ops, succ_ops_of_use_op

    def _gen_channel(self, name_gen):
        """
        Generate one ThreadChannel or ProcessChannel.

        Args:
            name_gen: channel name generator

        Returns:
            channel: one channel generated
        """
        channel = None
        if self._is_thread_op:
            channel = ThreadChannel(
                name=name_gen.next(),
                maxsize=self._channel_size,
                channel_recv_frist_arrive=self._channel_recv_frist_arrive)
        else:
            channel = ProcessChannel(
                self._manager,
                name=name_gen.next(),
                maxsize=self._channel_size,
                channel_recv_frist_arrive=self._channel_recv_frist_arrive)
        _LOGGER.debug("[DAG] Generate channel: {}".format(channel.name))
        return channel

    def _gen_virtual_op(self, name_gen):
        """
        Generate one virtual Op

        Args:
            name_gen: Op name generator

        Returns:
            vir_op: one virtual Op object.
        """
        vir_op = VirtualOp(name=name_gen.next())
        _LOGGER.debug("[DAG] Generate virtual_op: {}".format(vir_op.name))
        return vir_op

    def _topo_sort(self, used_ops, response_op, out_degree_ops):
        """
        Topological sort of the DAG, creating inverted multi-layer views.

        Args:
            used_ops: op used in DAG
            response_op: response op
            out_degree_ops: Next op list for each op, dict. the output of 
                get_use_ops()

        Returns:
            dag_views: the inverted hierarchical topology list. examples:
                DAG :[A -> B -> C -> E]
                            \-> D /
                dag_views: [[E], [C, D], [B], [A]]
                         
            last_op: the last Op in front of ResponseOp
        """
        out_degree_num = {
            name: len(ops)
            for name, ops in out_degree_ops.items()
        }
        que_idx = 0  # scroll queue 
        ques = [Queue.Queue() for _ in range(2)]
        zero_indegree_num = 0
        for op in used_ops:
            if len(op.get_input_ops()) == 0:
                zero_indegree_num += 1
        if zero_indegree_num != 1:
            _LOGGER.critical("Failed to topo sort: DAG contains "
                             "multiple RequestOps")
            os._exit(-1)
        last_op = response_op.get_input_ops()[0]
        ques[que_idx].put(last_op)

        # topo sort to get dag_views
        dag_views = []
        sorted_op_num = 0
        while True:
            que = ques[que_idx]
            next_que = ques[(que_idx + 1) % 2]
            dag_view = []
            while que.qsize() != 0:
                op = que.get()
                dag_view.append(op)
                sorted_op_num += 1
                for pred_op in op.get_input_ops():
                    out_degree_num[pred_op.name] -= 1
                    if out_degree_num[pred_op.name] == 0:
                        next_que.put(pred_op)
            dag_views.append(dag_view)
            if next_que.qsize() == 0:
                break
            que_idx = (que_idx + 1) % 2
        if sorted_op_num < len(used_ops):
            _LOGGER.critical("Failed to topo sort: graph is not a legal DAG")
            os._exit(-1)

        return dag_views, last_op

    def _build_dag(self, response_op):
        """
        Building DAG, the most important function in class DAG. Core steps:
        1.get_use_ops: Getting used ops, and out degree op list for each op.
        2._topo_sort: Topological sort creates inverted multi-layers views.
        3.create channels and virtual ops.

        Args:
            response_op: ResponseOp

        Returns:
            actual_ops: all OPs used in DAG, including virtual OPs
            channels: all channels used in DAG 
            input_channel: the channel of first OP 
            output_channel: the channel of last OP
            pack_func: pack_response_package function of response_op
            unpack_func: unpack_request_package function of request_op
        """
        if response_op is None:
            _LOGGER.critical("Failed to build DAG: ResponseOp"
                             " has not been set.")
            os._exit(-1)
        used_ops, out_degree_ops = DAG.get_use_ops(response_op)
        if not self._build_dag_each_worker:
            _LOGGER.info("================= USED OP =================")
            for op in used_ops:
                if not isinstance(op, RequestOp):
                    _LOGGER.info(op.name)
            _LOGGER.info("-------------------------------------------")
        if len(used_ops) <= 1:
            _LOGGER.critical(
                "Failed to build DAG: besides RequestOp and ResponseOp, "
                "there should be at least one Op in DAG.")
            os._exit(-1)
        if self._build_dag_each_worker:
            _LOGGER.info("Because `build_dag_each_worker` mode is used, "
                         "Auto-batching is set to the default config: "
                         "batch_size=1, auto_batching_timeout=None")
            for op in used_ops:
                op.use_default_auto_batching_config()

        dag_views, last_op = self._topo_sort(used_ops, response_op,
                                             out_degree_ops)
        dag_views = list(reversed(dag_views))
        if not self._build_dag_each_worker:
            _LOGGER.info("================== DAG ====================")
            for idx, view in enumerate(dag_views):
                _LOGGER.info("(VIEW {})".format(idx))
                for op in view:
                    _LOGGER.info("  [{}]".format(op.name))
                    for out_op in out_degree_ops[op.name]:
                        _LOGGER.info("    - {}".format(out_op.name))
            _LOGGER.info("-------------------------------------------")

        # create channels and virtual ops
        virtual_op_name_gen = NameGenerator("vir")
        channel_name_gen = NameGenerator("chl")
        virtual_ops = []
        channels = []
        input_channel = None
        actual_view = None
        for v_idx, view in enumerate(dag_views):
            if v_idx + 1 >= len(dag_views):
                break
            next_view = dag_views[v_idx + 1]
            if actual_view is None:
                actual_view = view
            actual_next_view = []
            pred_op_of_next_view_op = {}
            for op in actual_view:
                # find actual succ op in next view and create virtual op
                for succ_op in out_degree_ops[op.name]:
                    if succ_op in next_view:
                        if succ_op not in actual_next_view:
                            actual_next_view.append(succ_op)
                        if succ_op.name not in pred_op_of_next_view_op:
                            pred_op_of_next_view_op[succ_op.name] = []
                        pred_op_of_next_view_op[succ_op.name].append(op)
                    else:
                        # create virtual op
                        virtual_op = self._gen_virtual_op(virtual_op_name_gen)
                        virtual_ops.append(virtual_op)
                        out_degree_ops[virtual_op.name] = [succ_op]
                        actual_next_view.append(virtual_op)
                        pred_op_of_next_view_op[virtual_op.name] = [op]
                        virtual_op.add_virtual_pred_op(op)
            actual_view = actual_next_view
            # create channel
            processed_op = set()
            for o_idx, op in enumerate(actual_next_view):
                if op.name in processed_op:
                    continue
                channel = self._gen_channel(channel_name_gen)
                channels.append(channel)
                op.add_input_channel(channel)
                _LOGGER.info("op:{} add input channel.".format(op.name))
                pred_ops = pred_op_of_next_view_op[op.name]
                if v_idx == 0:
                    input_channel = channel
                else:
                    # if pred_op is virtual op, it will use ancestors as producers to channel
                    for pred_op in pred_ops:
                        pred_op.add_output_channel(channel)
                        _LOGGER.info("pred_op:{} add output channel".format(
                            pred_op.name))
                processed_op.add(op.name)
                # find same input op to combine channel
                for other_op in actual_next_view[o_idx + 1:]:
                    if other_op.name in processed_op:
                        continue
                    other_pred_ops = pred_op_of_next_view_op[other_op.name]
                    if len(other_pred_ops) != len(pred_ops):
                        continue
                    same_flag = True
                    for pred_op in pred_ops:
                        if pred_op not in other_pred_ops:
                            same_flag = False
                            break
                    if same_flag:
                        other_op.add_input_channel(channel)
                        processed_op.add(other_op.name)
        output_channel = self._gen_channel(channel_name_gen)
        channels.append(output_channel)
        last_op.add_output_channel(output_channel)
        _LOGGER.info("last op:{} add output channel".format(last_op.name))

        pack_func, unpack_func = None, None
        pack_func = response_op.pack_response_package

        actual_ops = virtual_ops
        for op in used_ops:
            if len(op.get_input_ops()) == 0:
                # set special features of the request op:
                # 1. set unpack function
                # 2. set output channel
                unpack_func = op.unpack_request_package
                op.add_output_channel(input_channel)
                continue
            actual_ops.append(op)

        for c in channels:
            _LOGGER.debug("Channel({}):\n\t- producers: {}\n\t- consumers: {}"
                          .format(c.name, c.get_producers(), c.get_consumers()))

        return (actual_ops, channels, input_channel, output_channel, pack_func,
                unpack_func)

    def get_channels(self):
        return self._channels

    def build(self):
        """
        Interface for building the DAG, called from outside the class.

        Args:
            None

        Returns:
            _input_channel: the channel of first OP
            _output_channel: the channel of last OP
            _pack_func: pack_response_package function of response_op
            _unpack_func: unpack_request_package function of request_op
        """
        (actual_ops, channels, input_channel, output_channel, pack_func,
         unpack_func) = self._build_dag(self._response_op)
        _LOGGER.info("[DAG] Succ build DAG")

        self._actual_ops = actual_ops
        self._channels = channels
        self._input_channel = input_channel
        self._output_channel = output_channel
        self._pack_func = pack_func
        self._unpack_func = unpack_func

        if self._tracer is not None:
            self._tracer.set_channels(self._channels)

        return self._input_channel, self._output_channel, self._pack_func, self._unpack_func

    def start_prom(self, prometheus_port):
        import prometheus_client
        from prometheus_client import Counter
        from prometheus_client.core import CollectorRegistry

        from flask import Response, Flask
        from .prometheus_metrics import registry 
        from .prometheus_metrics import metric_query_success, metric_query_failure, metric_inf_count, metric_query_duration_us, metric_inf_duration_us 
        app = Flask(__name__)
        # requests_total = Counter('c1','A counter') 
        
        @app.route("/metrics")
        def requests_count():
            item = self._tracer.profile_dict
            _LOGGER.info("metrics: {}".format(item))
            # {'uci': {'in': 727.443, 'prep': 0.5525833333333333, 'midp': 2.21375, 'postp': 1.32375, 'out': 0.9396666666666667}, 'DAG': {'call_0': 29.479, 'call_1': 8.176, 'call_2': 8.045, 'call_3': 7.988, 'call_4': 7.609, 'call_5': 7.629, 'call_6': 7.625, 'call_7': 8.32, 'call_8': 8.57, 'call_9': 8.055, 'call_10': 7.915, 'call_11': 7.873, 'query_count': 12, 'qps': 1.2, 'succ': 1.0, 'avg': 9.773666666666667, '50': 8.045, '60': 8.055, '70': 8.176, '80': 8.32, '90': 8.57, '95': 29.479, '99': 29.479}}
            if "DAG" in item:
                total = item["DAG"]["query_count"]
                succ = total * item["DAG"]["succ"]
                fail = total * (1 - item["DAG"]["succ"])
                query_duration = total * item["DAG"]["avg"]
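                # With the sample profile_dict above: total = 12,
                # succ = 12 * 1.0 = 12, fail = 12 * (1 - 1.0) = 0,
                # query_duration = 12 * 9.7736... (same unit as "avg").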
                metric_query_success.inc(succ)
                metric_query_failure.inc(fail)
                metric_query_duration_us.inc(query_duration)

                inf_cnt = 0
                infer_duration = 0.0
                for name in item:
                    if name != "DAG":
                        if "count" in item[name]:
                            inf_cnt += item[name]["count"]
                            if "midp" in item[name]:
                                infer_duration += item[name]["count"] * item[name]["midp"]
                metric_inf_count.inc(inf_cnt)
                metric_inf_duration_us.inc(infer_duration)
            
            #return str(item)
            self._tracer.profile_dict = {}
            return Response(prometheus_client.generate_latest(registry), mimetype="text/plain")

        def prom_run():
            app.run(host="0.0.0.0", port=prometheus_port)

        p = threading.Thread(
                target=prom_run,
                args=())
        _LOGGER.info("Prometheus Start 2")
        p.daemon = True
        p.start()

    def start(self):
        """
        Each Op starts a thread or a process, depending on _is_thread_op.

        Args:
            None

        Returns:
            _threads_or_proces: list of started threads or processes.
        """
        self._threads_or_proces = []
        for op in self._actual_ops:
            op.use_profiler(self._use_profile)
            op.set_tracer(self._tracer)
            op.set_use_prometheus(self._use_prometheus)
            if self._is_thread_op:
                self._threads_or_proces.extend(op.start_with_thread())
            else:
                self._threads_or_proces.extend(op.start_with_process())
        _LOGGER.info("[DAG] start")
        if self._use_prometheus:
            _LOGGER.info("Prometheus Start 1")
            self.start_prom(self._prometheus_port)
         
        # not join yet
        return self._threads_or_proces

    def join(self):
        """
        All threads or processes join.

        Args:
            None

        Returns:
            None
        """
        for x in self._threads_or_proces:
            if x is not None:
                x.join()

    def stop(self):
        """
        Stopping and cleaning all channels.

        Args:
            None

        Returns:
            None 
        """
        for chl in self._channels:
            chl.stop()
        for op in self._actual_ops:
            op.clean_input_channel()
            op.clean_output_channels()