Commit d0a08407 authored by barriery, committed by GitHub

Merge pull request #19 from barrierye/pipeling-log

Pipeling log
......@@ -4,12 +4,26 @@
## Compilation environment requirements
- OS: CentOS 7
- GCC: 4.8.2 and later
- Golang: 1.9.2 and later
- Git: 2.17.1 and later
- CMake: 3.2.2 and later
- Python: 2.7.2 and later / 3.6 and later
| module | version |
| :--------------------------: | :----------------------------------------------------------: |
| OS | CentOS 7 |
| gcc | 4.8.5 and later |
| gcc-c++ | 4.8.5 and later |
| make | 3.82 and later |
| cmake | 3.2.0 and later |
| Python | 2.7.2 and later / 3.6 and later |
| Go | 1.9.2 and later |
| git | 2.17.1 and later |
| glibc-static | 2.17 |
| openssl-devel | 1.0.2k |
| bzip2-devel | 1.0.6 and later |
| python-devel / python3-devel | 2.7.5 and later / 3.6.8 and later |
| sqlite-devel | 3.7.17 and later |
| patchelf | 0.9 and later |
| libXext | 1.3.3 |
| libSM | 1.2.2 |
| libXrender | 0.9.10 |
| python-whl | numpy>=1.12, <=1.16.4<br/>google>=2.0.3<br/>protobuf>=3.12.2<br/>grpcio-tools>=1.28.1<br/>grpcio>=1.28.1<br/>func-timeout>=4.3.5<br/>pyyaml>=1.3.0<br/>sentencepiece==0.1.92<br/>flask>=1.1.2<br/>ujson>=2.0.3 |
It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you, see [this document](DOCKER_IMAGES.md).
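Before building from source, the python-whl row above can be sanity-checked. The sketch below is not part of the repository; it is a minimal helper that assumes setuptools' `pkg_resources` is available and simply verifies each listed package is installed at a satisfying version:

```python
# Hypothetical helper to check the python-whl requirements from the table above.
import pkg_resources

REQUIREMENTS = [
    "numpy>=1.12,<=1.16.4",
    "google>=2.0.3",
    "protobuf>=3.12.2",
    "grpcio-tools>=1.28.1",
    "grpcio>=1.28.1",
    "func-timeout>=4.3.5",
    "pyyaml>=1.3.0",
    "sentencepiece==0.1.92",
    "flask>=1.1.2",
    "ujson>=2.0.3",
]

def check_requirements():
    problems = []
    for req in REQUIREMENTS:
        try:
            pkg_resources.require(req)  # raises if missing or version mismatch
        except Exception as e:
            problems.append("{}: {}".format(req, e))
    return problems

if __name__ == "__main__":
    problems = check_requirements()
    if problems:
        print("Unsatisfied requirements:")
        for p in problems:
            print("  " + p)
    else:
        print("All python-whl requirements are satisfied.")
```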
......
......@@ -4,12 +4,26 @@
## Compilation environment setup
- OS: CentOS 7
- GCC: 4.8.2 and later
- Golang: 1.9.2 and later
- Git: 2.17.1 and later
- CMake: 3.2.2 and later
- Python: 2.7.2 and later / 3.6 and later
| module | version |
| :--------------------------: | :----------------------------------------------------------: |
| OS | CentOS 7 |
| gcc | 4.8.5 and later |
| gcc-c++ | 4.8.5 and later |
| make | 3.82 and later |
| cmake | 3.2.0 and later |
| Python | 2.7.2 and later / 3.6 and later |
| Go | 1.9.2 and later |
| git | 2.17.1 and later |
| glibc-static | 2.17 |
| openssl-devel | 1.0.2k |
| bzip2-devel | 1.0.6 and later |
| python-devel / python3-devel | 2.7.5 and later / 3.6.8 and later |
| sqlite-devel | 3.7.17 and later |
| patchelf | 0.9 |
| libXext | 1.3.3 |
| libSM | 1.2.2 |
| libXrender | 0.9.10 |
| python-whl | numpy>=1.12, <=1.16.4<br/>google>=2.0.3<br/>protobuf>=3.12.2<br/>grpcio-tools>=1.28.1<br/>grpcio>=1.28.1<br/>func-timeout>=4.3.5<br/>pyyaml>=1.3.0<br/>sentencepiece==0.1.92<br/>flask>=1.1.2<br/>ujson>=2.0.3 |
It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you, see [this document](DOCKER_IMAGES_CN.md).
......
......@@ -11,7 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logger # this module must be the first to import
from operator import Op, RequestOp, ResponseOp
from pipeline_server import PipelineServer
from pipeline_client import PipelineClient
from analyse import Analyst
from operator import Op, RequestOp, ResponseOp
from pipeline_server import PipelineServer
from pipeline_client import PipelineClient
......
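These modules form the public pipeline API (`Op`, `RequestOp`, `ResponseOp`, `PipelineServer`, `PipelineClient`, `Analyst`). A minimal usage sketch modeled on the Paddle Serving pipeline examples follows; the import path, endpoint, fetch name, client config path, and `config.yml` are placeholders, not values from this PR:

```python
from paddle_serving_server.pipeline import Op, RequestOp, ResponseOp
from paddle_serving_server.pipeline import PipelineServer

# RequestOp unpacks the RPC request; ResponseOp packs the RPC response.
read_op = RequestOp()

# A normal Op wraps one model service; all argument values here are placeholders.
bow_op = Op(name="bow",
            input_ops=[read_op],
            server_endpoints=["127.0.0.1:9393"],
            fetch_list=["prediction"],
            client_config="serving_client_conf.prototxt",
            concurrency=1)

response_op = ResponseOp(input_ops=[bow_op])

server = PipelineServer()
server.set_response_op(response_op)   # response_op must have exactly one previous op
server.prepare_server("config.yml")   # port / worker_num / build_dag_each_worker, etc.
server.run_server()
```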
......@@ -164,7 +164,7 @@ class OpAnalyst(object):
def add(self, name_str, ts_list):
if self._close:
_LOGGER.error("OpAnalyst is closed.")
_LOGGER.error("Failed to add item: OpAnalyst is closed.")
return
op_name, curr_idx, step = self._parse(name_str)
if op_name not in self.op_time_list_dict:
......
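For context, the guard above makes `add` refuse new samples once the analyst has been closed. A stripped-down sketch of that pattern (illustrative class and names, not the real OpAnalyst):

```python
import logging

_LOGGER = logging.getLogger(__name__)

class TimeCollector(object):
    """Accumulates per-op timing samples until close() is called."""

    def __init__(self):
        self._close = False
        self.op_time_list_dict = {}

    def add(self, op_name, ts_list):
        if self._close:
            _LOGGER.error("Failed to add item: collector is closed.")
            return
        self.op_time_list_dict.setdefault(op_name, []).append(ts_list)

    def close(self):
        self._close = True
```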
This diff is collapsed.
......@@ -30,6 +30,7 @@ from .channel import (ThreadChannel, ProcessChannel, ChannelData,
ChannelDataEcode, ChannelDataType, ChannelStopError)
from .profiler import TimeProfiler
from .util import NameGenerator
from .proto import pipeline_service_pb2
_LOGGER = logging.getLogger()
......@@ -74,17 +75,18 @@ class DAGExecutor(object):
self._recive_func = threading.Thread(
target=DAGExecutor._recive_out_channel_func, args=(self, ))
self._recive_func.start()
_LOGGER.debug("[DAG Executor] start recive thread")
_LOGGER.debug("[DAG Executor] Start recive thread")
def stop(self):
self._dag.stop()
self._dag.join()
_LOGGER.info("[DAG Executor] succ stop")
_LOGGER.info("[DAG Executor] Stop")
def _get_next_data_id(self):
data_id = None
with self._id_lock:
if self._id_counter >= self._reset_max_id:
_LOGGER.info("[DAG Executor] Reset request id")
self._id_counter -= self._reset_max_id
data_id = self._id_counter
self._id_counter += 1
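The hunk above makes the request-id counter wrap around once it reaches `_reset_max_id`. A self-contained sketch of the same wrap-around counter (illustrative class, not the repo's code):

```python
import threading

class IdGenerator(object):
    """Thread-safe counter that wraps around at reset_max_id."""

    def __init__(self, reset_max_id=1 << 32):
        self._lock = threading.Lock()
        self._counter = 0
        self._reset_max_id = reset_max_id

    def next_id(self):
        with self._lock:
            if self._counter >= self._reset_max_id:
                # Reset the counter so request ids stay bounded.
                self._counter -= self._reset_max_id
            data_id = self._counter
            self._counter += 1
        return data_id
```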
......@@ -96,16 +98,18 @@ class DAGExecutor(object):
def _set_in_channel(self, in_channel):
if not isinstance(in_channel, (ThreadChannel, ProcessChannel)):
_LOGGER.critical("[DAG Executor] in_channel must be Channel"
" type, but get {}".format(type(in_channel)))
_LOGGER.critical("[DAG Executor] Failed to set in_channel: "
"in_channel must be Channel type, but get {}".
format(type(in_channel)))
os._exit(-1)
in_channel.add_producer(self.name)
self._in_channel = in_channel
def _set_out_channel(self, out_channel):
if not isinstance(out_channel, (ThreadChannel, ProcessChannel)):
_LOGGER.critical("[DAG Executor]iout_channel must be Channel"
" type, but get {}".format(type(out_channel)))
_LOGGER.critical("[DAG Executor] Failed to set out_channel: "
"must be Channel type, but get {}".format(
type(out_channel)))
os._exit(-1)
out_channel.add_consumer(self.name)
self._out_channel = out_channel
......@@ -116,7 +120,7 @@ class DAGExecutor(object):
try:
channeldata_dict = self._out_channel.front(self.name)
except ChannelStopError:
_LOGGER.info("[DAG Executor] channel stop.")
_LOGGER.info("[DAG Executor] Stop.")
with self._cv_for_cv_pool:
for data_id, cv in self._cv_pool.items():
closed_errror_data = ChannelData(
......@@ -130,17 +134,20 @@ class DAGExecutor(object):
if len(channeldata_dict) != 1:
_LOGGER.critical(
"[DAG Executor] out_channel cannot have multiple input ops")
"[DAG Executor] Failed to fetch result: out_channel "
"cannot have multiple input ops")
os._exit(-1)
(_, channeldata), = channeldata_dict.items()
if not isinstance(channeldata, ChannelData):
_LOGGER.critical(
'[DAG Executor] data must be ChannelData type, but get {}'
'[DAG Executor] Failed to fetch result: data in out_channel '
'must be ChannelData type, but get {}'
.format(type(channeldata)))
os._exit(-1)
data_id = channeldata.id
_LOGGER.debug("recive thread fetch data[{}]".format(data_id))
_LOGGER.debug("(logid={}) [recive thread] Fetched data".format(
data_id))
with self._cv_for_cv_pool:
cond_v = self._cv_pool[data_id]
with cond_v:
......@@ -164,7 +171,7 @@ class DAGExecutor(object):
ready_data = self._fetch_buffer[data_id]
self._cv_pool.pop(data_id)
self._fetch_buffer.pop(data_id)
_LOGGER.debug("resp thread get resp data[{}]".format(data_id))
_LOGGER.debug("(logid={}) [resp thread] Got data".format(data_id))
return ready_data
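For readers following the threading here: the receive thread stores each finished result in a buffer keyed by `data_id` and notifies a per-request condition variable, on which the responding thread waits. A minimal sketch of that hand-off (illustrative names only):

```python
import threading

class FetchBuffer(object):
    def __init__(self):
        self._lock = threading.Lock()
        self._cv_pool = {}   # data_id -> Condition
        self._buffer = {}    # data_id -> result

    def register(self, data_id):
        cond_v = threading.Condition()
        with self._lock:
            self._cv_pool[data_id] = cond_v
        return cond_v

    def put(self, data_id, result):
        # Called by the receive thread once a result is ready.
        with self._lock:
            cond_v = self._cv_pool[data_id]
        with cond_v:
            self._buffer[data_id] = result
            cond_v.notify_all()

    def get(self, data_id, cond_v):
        # Called by the responding thread; blocks until put() delivers the result.
        with cond_v:
            while data_id not in self._buffer:
                cond_v.wait()
            result = self._buffer.pop(data_id)
        with self._lock:
            self._cv_pool.pop(data_id)
        return result
```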
def _pack_channeldata(self, rpc_request, data_id):
......@@ -172,8 +179,10 @@ class DAGExecutor(object):
try:
dictdata = self._unpack_rpc_func(rpc_request)
except Exception as e:
_LOGGER.error("parse RPC package to data[{}] Error: {}"
.format(data_id, e))
_LOGGER.error(
"(logid={}) Failed to parse RPC request package: {}"
.format(data_id, e),
exc_info=True)
return ChannelData(
ecode=ChannelDataEcode.RPC_PACKAGE_ERROR.value,
error_info="rpc package error: {}".format(e),
......@@ -187,7 +196,7 @@ class DAGExecutor(object):
profile_value = rpc_request.value[idx]
break
client_need_profile = (profile_value == self._client_profile_value)
_LOGGER.debug("request[{}] need profile: {}".format(
_LOGGER.debug("(logid={}) Need profile in client: {}".format(
data_id, client_need_profile))
return ChannelData(
datatype=ChannelDataType.DICT.value,
......@@ -197,26 +206,28 @@ class DAGExecutor(object):
def call(self, rpc_request):
data_id, cond_v = self._get_next_data_id()
_LOGGER.debug("generate Request id: {}".format(data_id))
_LOGGER.info("(logid={}) Succ generate id".format(data_id))
start_call, end_call = None, None
if not self._is_thread_op:
self._profiler.record("call_{}#DAG-{}_0".format(data_id, data_id))
start_call = self._profiler.record("call_{}#DAG-{}_0".format(
data_id, data_id))
else:
self._profiler.record("call_{}#DAG_0".format(data_id))
start_call = self._profiler.record("call_{}#DAG_0".format(data_id))
_LOGGER.debug("try parse RPC request to channeldata[{}]".format(
data_id))
_LOGGER.debug("(logid={}) Parsing RPC request package".format(data_id))
self._profiler.record("prepack_{}#{}_0".format(data_id, self.name))
req_channeldata = self._pack_channeldata(rpc_request, data_id)
self._profiler.record("prepack_{}#{}_1".format(data_id, self.name))
resp_channeldata = None
for i in range(self._retry):
_LOGGER.debug("push data[{}] into Graph engine".format(data_id))
_LOGGER.debug("(logid={}) Pushing data into Graph engine".format(
data_id))
try:
self._in_channel.push(req_channeldata, self.name)
except ChannelStopError:
_LOGGER.debug("[DAG Executor] channel stop.")
_LOGGER.debug("[DAG Executor] Stop")
with self._cv_for_cv_pool:
self._cv_pool.pop(data_id)
return self._pack_for_rpc_resp(
......@@ -225,32 +236,35 @@ class DAGExecutor(object):
error_info="dag closed.",
data_id=data_id))
_LOGGER.debug("wait Graph engine for data[{}]...".format(data_id))
_LOGGER.debug("(logid={}) Wait for Graph engine...".format(data_id))
resp_channeldata = self._get_channeldata_from_fetch_buffer(data_id,
cond_v)
if resp_channeldata.ecode == ChannelDataEcode.OK.value:
_LOGGER.debug("request[{}] succ predict".format(data_id))
_LOGGER.debug("(logid={}) Succ predict".format(data_id))
break
else:
_LOGGER.warning("request[{}] predict failed: {}"
.format(data_id, resp_channeldata.error_info))
_LOGGER.error("(logid={}) Failed to predict: {}"
.format(data_id, resp_channeldata.error_info))
if resp_channeldata.ecode != ChannelDataEcode.TIMEOUT.value:
break
if i + 1 < self._retry:
_LOGGER.warning("retry({}/{}) data[{}]".format(
i + 1, self._retry, data_id))
_LOGGER.warning("(logid={}) DAGExecutor retry({}/{})".format(
data_id, i + 1, self._retry))
_LOGGER.debug("unpack channeldata[{}] into RPC response".format(
data_id))
_LOGGER.debug("(logid={}) Packing RPC response package".format(data_id))
self._profiler.record("postpack_{}#{}_0".format(data_id, self.name))
rpc_resp = self._pack_for_rpc_resp(resp_channeldata)
self._profiler.record("postpack_{}#{}_1".format(data_id, self.name))
if not self._is_thread_op:
self._profiler.record("call_{}#DAG-{}_1".format(data_id, data_id))
end_call = self._profiler.record("call_{}#DAG-{}_1".format(data_id,
data_id))
else:
self._profiler.record("call_{}#DAG_1".format(data_id))
end_call = self._profiler.record("call_{}#DAG_1".format(data_id))
_LOGGER.log(level=1,
msg="(logid={}) call[{} ms]".format(
data_id, (end_call - start_call) / 1e3))
profile_str = self._profiler.gen_profile_str()
if self._server_use_profile:
......@@ -268,7 +282,17 @@ class DAGExecutor(object):
return rpc_resp
def _pack_for_rpc_resp(self, channeldata):
return self._pack_rpc_func(channeldata)
try:
return self._pack_rpc_func(channeldata)
except Exception as e:
_LOGGER.error(
"(logid={}) Failed to pack RPC response package: {}"
.format(channeldata.id, e),
exc_info=True)
resp = pipeline_service_pb2.Response()
resp.ecode = ChannelDataEcode.RPC_PACKAGE_ERROR.value
resp.error_info = "rpc package error: {}".format(e)
return resp
class DAG(object):
......@@ -283,7 +307,7 @@ class DAG(object):
self._build_dag_each_worker = build_dag_each_worker
if not self._is_thread_op:
self._manager = multiprocessing.Manager()
_LOGGER.info("[DAG] succ init")
_LOGGER.info("[DAG] Succ init")
def get_use_ops(self, response_op):
unique_names = set()
......@@ -303,7 +327,8 @@ class DAG(object):
used_ops.add(pred_op)
# check the name of op is globally unique
if pred_op.name in unique_names:
_LOGGER.critical("the name of Op must be unique: {}".
_LOGGER.critical("Failed to get used Ops: the"
" name of Op must be unique: {}".
format(pred_op.name))
os._exit(-1)
unique_names.add(pred_op.name)
......@@ -317,12 +342,12 @@ class DAG(object):
else:
channel = ProcessChannel(
self._manager, name=name_gen.next(), maxsize=self._channel_size)
_LOGGER.debug("[DAG] gen Channel: {}".format(channel.name))
_LOGGER.debug("[DAG] Generate channel: {}".format(channel.name))
return channel
def _gen_virtual_op(self, name_gen):
vir_op = VirtualOp(name=name_gen.next())
_LOGGER.debug("[DAG] gen VirtualOp: {}".format(vir_op.name))
_LOGGER.debug("[DAG] Generate virtual_op: {}".format(vir_op.name))
return vir_op
def _topo_sort(self, used_ops, response_op, out_degree_ops):
......@@ -337,7 +362,8 @@ class DAG(object):
if len(op.get_input_ops()) == 0:
zero_indegree_num += 1
if zero_indegree_num != 1:
_LOGGER.critical("DAG contains multiple RequestOps")
_LOGGER.critical("Failed to topo sort: DAG contains "
"multiple RequestOps")
os._exit(-1)
last_op = response_op.get_input_ops()[0]
ques[que_idx].put(last_op)
......@@ -362,14 +388,15 @@ class DAG(object):
break
que_idx = (que_idx + 1) % 2
if sorted_op_num < len(used_ops):
_LOGGER.critical("not legal DAG")
_LOGGER.critical("Failed to topo sort: not legal DAG")
os._exit(-1)
return dag_views, last_op
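`_topo_sort` above groups Ops into DAG views level by level and aborts when the graph is not a legal DAG. A compact sketch of the same idea using Kahn's algorithm (hypothetical helper; `succ` maps each op name to the names it feeds, and every op must appear as a key):

```python
from collections import deque

def topo_views(succ):
    indegree = {name: 0 for name in succ}
    for outs in succ.values():
        for o in outs:
            indegree[o] += 1
    current = deque(n for n, d in indegree.items() if d == 0)
    views = []
    visited = 0
    while current:
        views.append(list(current))
        next_level = deque()
        for n in current:
            visited += 1
            for o in succ[n]:
                indegree[o] -= 1
                if indegree[o] == 0:
                    next_level.append(o)
        current = next_level
    if visited != len(succ):
        raise ValueError("not a legal DAG (cycle detected)")
    return views

# topo_views({"read": ["op1"], "op1": ["resp"], "resp": []})
# -> [['read'], ['op1'], ['resp']]
```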
def _build_dag(self, response_op):
if response_op is None:
_LOGGER.critical("ResponseOp has not been set.")
_LOGGER.critical("Failed to build DAG: ResponseOp"
" has not been set.")
os._exit(-1)
used_ops, out_degree_ops = self.get_use_ops(response_op)
if not self._build_dag_each_worker:
......@@ -380,8 +407,8 @@ class DAG(object):
_LOGGER.info("-------------------------------------------")
if len(used_ops) <= 1:
_LOGGER.critical(
"Besides RequestOp and ResponseOp, there should be at least one Op in DAG."
)
"Failed to build DAG: besides RequestOp and ResponseOp, "
"there should be at least one Op in DAG.")
os._exit(-1)
if self._build_dag_each_worker:
_LOGGER.info("Because `build_dag_each_worker` mode is used, "
......@@ -443,8 +470,6 @@ class DAG(object):
continue
channel = self._gen_channel(channel_name_gen)
channels.append(channel)
_LOGGER.debug("[DAG] Channel({}) => Op({})"
.format(channel.name, op.name))
op.add_input_channel(channel)
pred_ops = pred_op_of_next_view_op[op.name]
if v_idx == 0:
......@@ -452,8 +477,6 @@ class DAG(object):
else:
# if pred_op is virtual op, it will use ancestors as producers to channel
for pred_op in pred_ops:
_LOGGER.debug("[DAG] Op({}) => Channel({})"
.format(pred_op.name, channel.name))
pred_op.add_output_channel(channel)
processed_op.add(op.name)
# find same input op to combine channel
......@@ -469,8 +492,6 @@ class DAG(object):
same_flag = False
break
if same_flag:
_LOGGER.debug("[DAG] Channel({}) => Op({})"
.format(channel.name, other_op.name))
other_op.add_input_channel(channel)
processed_op.add(other_op.name)
output_channel = self._gen_channel(channel_name_gen)
......@@ -488,7 +509,7 @@ class DAG(object):
actual_ops.append(op)
for c in channels:
_LOGGER.debug("Channel({}):\n\t-producers: {}\n\t-consumers: {}"
_LOGGER.debug("Channel({}):\n\t- producers: {}\n\t- consumers: {}"
.format(c.name, c.get_producers(), c.get_consumers()))
return (actual_ops, channels, input_channel, output_channel, pack_func,
......@@ -497,7 +518,7 @@ class DAG(object):
def build(self):
(actual_ops, channels, input_channel, output_channel, pack_func,
unpack_func) = self._build_dag(self._response_op)
_LOGGER.info("[DAG] succ build dag")
_LOGGER.info("[DAG] Succ build DAG")
self._actual_ops = actual_ops
self._channels = channels
......
This diff is collapsed.
......@@ -67,9 +67,11 @@ class PipelineServer(object):
def set_response_op(self, response_op):
if not isinstance(response_op, ResponseOp):
raise Exception("response_op must be ResponseOp type.")
raise Exception("Failed to set response_op: response_op "
"must be ResponseOp type.")
if len(response_op.get_input_ops()) != 1:
raise Exception("response_op can only have one previous op.")
raise Exception("Failed to set response_op: response_op "
"can only have one previous op.")
self._response_op = response_op
def _port_is_available(self, port):
......@@ -83,7 +85,8 @@ class PipelineServer(object):
self._port = conf["port"]
if not self._port_is_available(self._port):
raise SystemExit("Prot {} is already used".format(self._port))
raise SystemExit("Failed to prepare_server: prot {} "
"is already used".format(self._port))
self._worker_num = conf["worker_num"]
self._build_dag_each_worker = conf["build_dag_each_worker"]
......
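`prepare_server` relies on `_port_is_available` before binding. A hedged sketch of such a check (the actual implementation in pipeline_server.py may differ):

```python
import socket
from contextlib import closing

def port_is_available(port):
    # Try to connect to the port; if the connection is refused, nothing is
    # listening there and the port is treated as available.
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(2)
        result = sock.connect_ex(('127.0.0.1', port))
    return result != 0
```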
......@@ -43,8 +43,9 @@ class UnsafeTimeProfiler(object):
def record(self, name):
if self._enable is False:
return
self.time_record.append('{}:{} '.format(name,
int(round(_time() * 1000000))))
timestamp = int(round(_time() * 1000000))
self.time_record.append('{}:{} '.format(name, timestamp))
return timestamp
def print_profile(self):
if self._enable is False:
......@@ -80,6 +81,7 @@ class TimeProfiler(object):
name = '_'.join(name_with_tag[:-1])
with self._lock:
self._time_record.put((name, tag, timestamp))
return timestamp
def print_profile(self):
if self._enable is False:
......
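Both profilers now return the recorded timestamp (in microseconds), which is what lets `DAGExecutor.call` compute the `call[{} ms]` figure from `end_call - start_call`. A small usage sketch with a standalone class (not the repo's profiler):

```python
import time

class SimpleProfiler(object):
    def __init__(self):
        self.time_record = []

    def record(self, name):
        timestamp = int(round(time.time() * 1000000))  # microseconds
        self.time_record.append('{}:{} '.format(name, timestamp))
        return timestamp

profiler = SimpleProfiler()
start_call = profiler.record("call_0#DAG_0")
# ... handle the request ...
end_call = profiler.record("call_0#DAG_1")
print("call[{} ms]".format((end_call - start_call) / 1e3))
```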
numpy>=1.12, <=1.16.4 ; python_version<"3.5"
google>=2.0.3
protobuf>=3.12.2
grpcio-tools>=1.28.1
grpcio>=1.28.1
func-timeout>=4.3.5
pyyaml>=1.3.0
sentencepiece==0.1.92
flask>=1.1.2
ujson>=2.0.3
......@@ -54,7 +54,6 @@ function build_app() {
local DIRNAME=build-app-$TYPE
mkdir $DIRNAME # pwd: /Serving
cd $DIRNAME # pwd: /Serving/build-app-$TYPE
pip install numpy sentencepiece
case $TYPE in
CPU|GPU)
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ \
......@@ -295,8 +294,6 @@ function python_run_criteo_ctr_with_cube() {
function python_test_bert() {
# pwd: /Serving/python/examples
local TYPE=$1
yum install -y libXext libSM libXrender >/dev/null
pip install ujson
export SERVING_BIN=${SERVING_WORKDIR}/build-server-${TYPE}/core/general-server/serving
cd bert # pwd: /Serving/python/examples/bert
case $TYPE in
......