Commit 5d9210f7 authored by barrierye


Merge branch 'develop' of https://github.com/PaddlePaddle/Serving into try-to-get-numay_array-from-ModelRes
......@@ -82,7 +82,9 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
| `port` | int | `9292` | Exposed port of current service to users|
| `name` | str | `""` | Service name, can be used to generate HTTP request url |
| `model` | str | `""` | Path of paddle model directory to be served |
| `mem_optim` | bool | `False` | Enable memory optimization |
| `mem_optim` | bool | `False` | Enable memory / graphics memory optimization |
| `ir_optim` | bool | `False` | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | bool | `False` | Run inference with MKL |
Here, we use `curl` to send an HTTP POST request to the service we just started. Users can use any Python library to send HTTP POST requests as well, e.g., [requests](https://requests.readthedocs.io/en/master/).
</center>
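As a reference, the same request can be sent from Python with [requests](https://requests.readthedocs.io/en/master/). The sketch below assumes the service was started with `--name uci` on port 9292 and serves the uci_housing model (feed field `x`, fetch field `price`); these names are illustrative assumptions taken from the uci_housing example, not fixed by this document.

``` python
# Hedged sketch: POST one prediction request to the HTTP service started above.
# The URL path, feed key "x", and fetch key "price" are assumptions based on the
# uci_housing example; adjust them to the --name and model you actually serve.
import requests

payload = {
    "feed": [{"x": [0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
                    -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332]}],
    "fetch": ["price"],
}
resp = requests.post("http://127.0.0.1:9292/uci/prediction", json=payload)
print(resp.json())
```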
......
......@@ -87,6 +87,8 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po
| `name` | str | `""` | Service name, can be used to generate HTTP request url |
| `model` | str | `""` | Path of paddle model directory to be served |
| `mem_optim` | bool | `False` | Enable memory optimization |
| `ir_optim` | bool | `False` | Enable analysis and optimization of calculation graph |
| `use_mkl` (Only for cpu version) | bool | `False` | Run inference with MKL |
Here we use the `curl` command to send an HTTP POST request to the service we just started. Users can also call a Python library to send HTTP POST requests; see the English documentation of [requests](https://requests.readthedocs.io/en/master/).
</center>
......
......@@ -43,6 +43,7 @@ message EngineDesc {
optional bool enable_memory_optimization = 13;
optional bool static_optimization = 14;
optional bool force_update_static_cache = 15;
optional bool enable_ir_optimization = 16;
};
// model_toolkit conf
......
......@@ -345,7 +345,7 @@ int PredictorClient::numpy_predict(
PredictorRes &predict_res_batch,
const int &pid) {
int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
VLOG(2) << "batch size: " << batch_size;
predict_res_batch.clear();
Timer timeline;
int64_t preprocess_start = timeline.TimeStampUS();
......@@ -462,7 +462,7 @@ int PredictorClient::numpy_predict(
for (ssize_t j = 0; j < int_array.shape(1); j++) {
for (ssize_t k = 0; k < int_array.shape(2); k++) {
for (ssize_t l = 0; l < int_array.shape(3); l++) {
tensor->add_float_data(int_array(i, j, k, l));
tensor->add_int64_data(int_array(i, j, k, l));
}
}
}
......@@ -474,7 +474,7 @@ int PredictorClient::numpy_predict(
for (ssize_t i = 0; i < int_array.shape(0); i++) {
for (ssize_t j = 0; j < int_array.shape(1); j++) {
for (ssize_t k = 0; k < int_array.shape(2); k++) {
tensor->add_float_data(int_array(i, j, k));
tensor->add_int64_data(int_array(i, j, k));
}
}
}
......@@ -484,7 +484,7 @@ int PredictorClient::numpy_predict(
auto int_array = int_feed[vec_idx].unchecked<2>();
for (ssize_t i = 0; i < int_array.shape(0); i++) {
for (ssize_t j = 0; j < int_array.shape(1); j++) {
tensor->add_float_data(int_array(i, j));
tensor->add_int64_data(int_array(i, j));
}
}
break;
......@@ -492,7 +492,7 @@ int PredictorClient::numpy_predict(
case 1: {
auto int_array = int_feed[vec_idx].unchecked<1>();
for (ssize_t i = 0; i < int_array.shape(0); i++) {
tensor->add_float_data(int_array(i));
tensor->add_int64_data(int_array(i));
}
break;
}
......
......@@ -35,6 +35,7 @@ class InferEngineCreationParams {
InferEngineCreationParams() {
_path = "";
_enable_memory_optimization = false;
_enable_ir_optimization = false;
_static_optimization = false;
_force_update_static_cache = false;
}
......@@ -45,10 +46,16 @@ class InferEngineCreationParams {
_enable_memory_optimization = enable_memory_optimization;
}
void set_enable_ir_optimization(bool enable_ir_optimization) {
_enable_ir_optimization = enable_ir_optimization;
}
bool enable_memory_optimization() const {
return _enable_memory_optimization;
}
bool enable_ir_optimization() const { return _enable_ir_optimization; }
void set_static_optimization(bool static_optimization = false) {
_static_optimization = static_optimization;
}
......@@ -68,6 +75,7 @@ class InferEngineCreationParams {
<< "model_path = " << _path << ", "
<< "enable_memory_optimization = " << _enable_memory_optimization
<< ", "
<< "enable_ir_optimization = " << _enable_ir_optimization << ", "
<< "static_optimization = " << _static_optimization << ", "
<< "force_update_static_cache = " << _force_update_static_cache;
}
......@@ -75,6 +83,7 @@ class InferEngineCreationParams {
private:
std::string _path;
bool _enable_memory_optimization;
bool _enable_ir_optimization;
bool _static_optimization;
bool _force_update_static_cache;
};
......@@ -150,6 +159,11 @@ class ReloadableInferEngine : public InferEngine {
force_update_static_cache = conf.force_update_static_cache();
}
if (conf.has_enable_ir_optimization()) {
_infer_engine_params.set_enable_ir_optimization(
conf.enable_ir_optimization());
}
_infer_engine_params.set_path(_model_data_path);
if (enable_memory_optimization) {
_infer_engine_params.set_enable_memory_optimization(true);
......
......@@ -9,14 +9,18 @@
- Golang: 1.9.2 and later
- Git:2.17.1 and later
- CMake:3.2.2 and later
- Python:2.7.2 and later
- Python:2.7.2 and later / 3.6 and later
It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you:
- CPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-devel`,dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel)
- GPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-gpu-devel`,dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel)
This document will take Python2 as an example to show how to compile Paddle Serving. If you want to compile with Python 3, just adjust the Python options of cmake.
This document will take Python2 as an example to show how to compile Paddle Serving. If you want to compile with Python3, just adjust the Python options of cmake:
- Set `DPYTHON_INCLUDE_DIR` to `$PYTHONROOT/include/python3.6m/`
- Set `DPYTHON_LIBRARIES` to `$PYTHONROOT/lib64/libpython3.6.so`
- Set `DPYTHON_EXECUTABLE` to `$PYTHONROOT/bin/python3`
## Get Code
......@@ -54,6 +58,8 @@ make -j10
execute `make install` to put targets under directory `./output`
**Attention:** After the compilation is successful, you need to set the path of `SERVING_BIN`. See [Note](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md#Note) for details.
## Compile Client
``` shell
......
......@@ -9,14 +9,18 @@
- Golang: 1.9.2 and later
- Git: 2.17.1 and later
- CMake: 3.2.2 and later
- Python: 2.7.2 and later
- Python: 2.7.2 and later / 3.6 and later
It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you:
- CPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-devel`,dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel)
- GPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-gpu-devel`,dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel)
This document takes Python2 as an example to show how to compile Paddle Serving. If you want to compile with Python3, just adjust the Python-related options of cmake.
This document takes Python2 as an example to show how to compile Paddle Serving. If you want to compile with Python3, just adjust the Python-related options of cmake:
- Set `DPYTHON_INCLUDE_DIR` to `$PYTHONROOT/include/python3.6m/`
- Set `DPYTHON_LIBRARIES` to `$PYTHONROOT/lib64/libpython3.6.so`
- Set `DPYTHON_EXECUTABLE` to `$PYTHONROOT/bin/python3`
## Get Code
......@@ -54,6 +58,8 @@ make -j10
Execute `make install` to put the build targets under the `./output` directory.
**Attention:** After the compilation is successful, you need to set the `SERVING_BIN` path; see the [Note](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项) section later in this document for details.
## Compile Client
``` shell
......
# Performance optimization
Due to different model structures, different prediction services consume different amounts of computing resources when performing predictions. For online prediction services, models that require less computing resources spend a higher proportion of their time on communication and are called communication-intensive services; models that require more computing resources spend more time on inference calculation and are called computation-intensive services.
For a prediction service, the easiest way to determine which type it is is to look at the time ratio. Paddle Serving provides a [Timeline tool](../python/examples/util/README_CN.md), which can intuitively display the time spent in each stage of the prediction service.
For communication-intensive prediction services, requests can be aggregated: within a tolerable latency limit, multiple prediction requests can be combined into one batch for prediction.
For computation-intensive prediction services, you can use GPU prediction services instead of CPU prediction services, or increase the number of graphics cards of the GPU prediction service.
Under the same conditions, the communication time of the HTTP prediction service provided by Paddle Serving is longer than that of the RPC prediction service, so for communication-intensive services, please give priority to RPC communication.
Parameters for performance optimization:
| Parameters | Type | Default | Description |
| ---------- | ---- | ------- | ------------------------------------------------------------ |
| mem_optim | bool | False | Enable memory / graphics memory optimization |
| ir_optim | bool | False | Enable analysis and optimization of the calculation graph, including OP fusion, etc. |
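To make the two switches above concrete, here is a minimal, non-authoritative sketch of enabling them through the `paddle_serving_server` Python API that this commit extends (`set_memory_optimize` / `set_ir_optimize`); the model path `uci_housing_model`, thread count, and port are illustrative assumptions.

``` python
# Hedged sketch: start an RPC prediction service with mem_optim / ir_optim enabled.
# The model path, thread count, and port are illustrative assumptions.
from paddle_serving_server import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(op_maker.create('general_reader'))
op_seq_maker.add_op(op_maker.create('general_infer'))
op_seq_maker.add_op(op_maker.create('general_response'))

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)
server.set_memory_optimize(True)   # corresponds to --mem_optim
server.set_ir_optimize(True)       # corresponds to --ir_optim
server.load_model_config("uci_housing_model")
server.prepare_server(workdir="workdir", port=9292, device="cpu")
server.run_server()
```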
# Performance optimization
Due to different model structures, when performing predictions different predictions consume different amounts of computing resources. For online prediction services, models that require less computing resources have a higher proportion of communication time cost and are called communication-intensive services; models that require more computing resources have a higher inference time cost and are called computation-intensive services. For these two service types, different optimizations can be applied according to actual needs
Due to different model structures, when performing predictions different prediction services consume different amounts of computing resources. For online prediction services, models that require less computing resources have a higher proportion of communication time cost and are called communication-intensive services; models that require more computing resources have a higher inference time cost and are called computation-intensive services. For these two service types, different optimizations can be applied according to actual needs.
For a prediction service, the easiest way to determine which type it is is to look at the time ratio. Paddle Serving provides a [Timeline tool](../python/examples/util/README_CN.md), which can intuitively show the time spent in each stage of the prediction service.
......@@ -10,4 +10,9 @@
Under the same conditions, the communication time of the HTTP prediction service provided by Paddle Serving is longer than that of the RPC prediction service, so for communication-intensive services, please give priority to RPC communication.
For large models whose prediction service occupies a lot of memory or graphics memory, you can enable memory / graphics memory optimization by setting the --mem_optim option to True.
Parameters for performance optimization:
| Parameters | Type | Default | Description |
| --------- | ---- | ------ | -------------------------------- |
| mem_optim | bool | False | Enable memory / graphics memory optimization |
| ir_optim | bool | False | Enable analysis and optimization of the calculation graph, including OP fusion, etc. |
## How to save a servable model of Paddle Serving?
# How to save a servable model of Paddle Serving?
([简体中文](./SAVE_CN.md)|English)
- Currently, Paddle Serving provides a save_model interface for users; the interface is similar to `save_inference_model` of Paddle.
## Save from training or prediction script
Currently, Paddle Serving provides a save_model interface for users; the interface is similar to `save_inference_model` of Paddle.
``` python
import paddle_serving_client.io as serving_io
serving_io.save_model("imdb_model", "imdb_client_conf",
......@@ -29,3 +30,15 @@ for line in sys.stdin:
fetch_map = client.predict(feed=feed, fetch=fetch)
print("{} {}".format(fetch_map["prediction"][1], label[0]))
```
## Export from saved model files
If you have saved model files using Paddle's `save_inference_model` API, you can use Paddle Serving's `inference_model_to_serving` API to convert them into model files that can be used by Paddle Serving.
```
import paddle_serving_client.io as serving_io
serving_io.inference_model_to_serving(dirname, model_filename=None, params_filename=None, serving_server="serving_server", serving_client="serving_client")
```
dirname (str) - Path of saved model files. Program file and parameter files are saved in this directory.
model_filename (str, optional) - The name of the file from which to load the inference program. If it is None, the default filename `__model__` will be used. Default: None.
params_filename (str, optional) - The name of the file from which to load all parameters. It is only used when all parameters were saved in a single binary file. If parameters were saved in separate files, set it to None. Default: None.
serving_server (str, optional) - The path of model files and configuration files for server. Default: "serving_server".
serving_client (str, optional) - The path of configuration files for client. Default: "serving_client".
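A minimal usage sketch of the conversion API follows; the source directory `./inference_model` is a hypothetical path, and the keyword defaults mirror the signature shown above.

``` python
# Hedged sketch: convert a model saved with fluid.io.save_inference_model into
# Paddle Serving server/client files. "./inference_model" is a hypothetical path.
import paddle_serving_client.io as serving_io

serving_io.inference_model_to_serving(
    "./inference_model",
    model_filename=None,        # use the default __model__ program file
    params_filename=None,       # parameters saved in separate files
    serving_server="serving_server",
    serving_client="serving_client")
```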
## How to save a servable model of Paddle Serving?
# How to save a servable model of Paddle Serving?
(简体中文|[English](./SAVE.md))
- Currently, Paddle Serving provides a save_model interface for users; the interface is similar to `save_inference_model` of Paddle.
## Save from training or prediction script
Currently, Paddle Serving provides a save_model interface for users; the interface is similar to `save_inference_model` of Paddle.
``` python
import paddle_serving_client.io as serving_io
......@@ -29,3 +30,15 @@ for line in sys.stdin:
fetch_map = client.predict(feed=feed, fetch=fetch)
print("{} {}".format(fetch_map["prediction"][1], label[0]))
```
## Export from saved model files
If you have saved the model for inference using Paddle's `save_inference_model` API, you can use Paddle Serving's `inference_model_to_serving` API to convert it into model files that can be used by Paddle Serving.
```
import paddle_serving_client.io as serving_io
serving_io.inference_model_to_serving(dirname, model_filename=None, params_filename=None, serving_server="serving_server", serving_client="serving_client")
```
dirname (str) - Path of the model files to be converted. The Program structure file and parameter files are saved in this directory.
model_filename (str, optional) - The name of the file that stores the Inference Program structure of the model to be converted. If set to None, `__model__` is used as the default filename. Default: None.
params_filename (str, optional) - The name of the file that stores all parameters of the model to be converted. It needs to be specified if and only if all parameters were saved in a single binary file. If the parameters are stored in separate files, set it to None. Default: None.
serving_server (str, optional) - The storage path of the converted model files and configuration files. Default: "serving_server".
serving_client (str, optional) - The storage path of the converted client configuration files. Default: "serving_client".
......@@ -194,6 +194,12 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore {
analysis_config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
} else {
analysis_config.SwitchIrOptim(false);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
......
......@@ -198,6 +198,12 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore {
analysis_config.EnableMemoryOptim();
}
if (params.enable_ir_optimization()) {
analysis_config.SwitchIrOptim(true);
} else {
analysis_config.SwitchIrOptim(false);
}
AutoLock lock(GlobalPaddleCreateMutex::instance());
_core =
paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(analysis_config);
......
......@@ -19,6 +19,8 @@ endif()
if (CLIENT)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../tools/python_tag.py
${CMAKE_CURRENT_BINARY_DIR}/python_tag.py)
endif()
if (APP)
......@@ -43,7 +45,8 @@ if (APP)
add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel)
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
endif()
......@@ -52,6 +55,7 @@ add_custom_command(
OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/
COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} python_tag.py
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES})
add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp)
......
......@@ -2,16 +2,6 @@
([简体中文](./README_CN.md)|English)
### Compile Source Code
In the root directory of this git project:
```
mkdir build_server
cd build_server
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON ..
make -j10
make install -j10
```
### Get Sample Dataset
Go to the directory `python/examples/criteo_ctr_with_cube`
......
## CTR prediction service with sparse parameter indexing
(简体中文|[English](./README.md))
### Compile Source Code
In the root directory of this project, run:
```
mkdir build_server
cd build_server
cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON ..
make -j10
make install -j10
```
### Get Sample Dataset
Go to the directory `python/examples/criteo_ctr_with_cube`
```
......
......@@ -71,6 +71,7 @@ class Debugger(object):
if profile:
config.enable_profile()
config.set_cpu_math_library_num_threads(cpu_num)
config.switch_ir_optim(False)
self.predictor = create_paddle_predictor(config)
......
......@@ -260,10 +260,16 @@ class Client(object):
if i == 0:
int_feed_names.append(key)
if isinstance(feed_i[key], np.ndarray):
if key in self.lod_tensor_set:
raise ValueError(
"LodTensor var can not be ndarray type.")
int_shape.append(list(feed_i[key].shape))
else:
int_shape.append(self.feed_shapes_[key])
if isinstance(feed_i[key], np.ndarray):
if key in self.lod_tensor_set:
raise ValueError(
"LodTensor var can not be ndarray type.")
#int_slot.append(np.reshape(feed_i[key], (-1)).tolist())
int_slot.append(feed_i[key])
self.has_numpy_input = True
......@@ -274,10 +280,16 @@ class Client(object):
if i == 0:
float_feed_names.append(key)
if isinstance(feed_i[key], np.ndarray):
if key in self.lod_tensor_set:
raise ValueError(
"LodTensor var can not be ndarray type.")
float_shape.append(list(feed_i[key].shape))
else:
float_shape.append(self.feed_shapes_[key])
if isinstance(feed_i[key], np.ndarray):
if key in self.lod_tensor_set:
raise ValueError(
"LodTensor var can not be ndarray type.")
#float_slot.append(np.reshape(feed_i[key], (-1)).tolist())
float_slot.append(feed_i[key])
self.has_numpy_input = True
......
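The client-side changes above add direct support for `numpy.ndarray` feeds (for non-LoD variables). A minimal usage sketch follows, assuming a client config and endpoint from the uci_housing example with feed variable `x` and fetch variable `price`; all of these names are illustrative assumptions.

``` python
# Hedged sketch: feed a numpy.ndarray to the RPC client. The config path,
# endpoint, and variable names are illustrative assumptions.
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("uci_housing_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

x = np.array([0.0137, -0.1136, 0.2553, -0.0692, 0.0582, -0.0727, -0.1583,
              -0.0584, 0.6283, 0.4919, 0.1856, 0.0795, -0.0332])
fetch_map = client.predict(feed={"x": x}, fetch=["price"])
print(fetch_map["price"])
```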
......@@ -103,17 +103,21 @@ def save_model(server_model_folder,
fout.write(config.SerializeToString())
def inference_model_to_serving(infer_model, serving_client, serving_server):
def inference_model_to_serving(dirname,
model_filename=None,
params_filename=None,
serving_server="serving_server",
serving_client="serving_client"):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
inference_program, feed_target_names, fetch_targets = \
fluid.io.load_inference_model(dirname=infer_model, executor=exe)
fluid.io.load_inference_model(dirname=dirname, executor=exe, model_filename=model_filename, params_filename=params_filename)
feed_dict = {
x: inference_program.global_block().var(x)
for x in feed_target_names
}
fetch_dict = {x.name: x for x in fetch_targets}
save_model(serving_client, serving_server, feed_dict, fetch_dict,
save_model(serving_server, serving_client, feed_dict, fetch_dict,
inference_program)
feed_names = feed_dict.keys()
fetch_names = fetch_dict.keys()
......
......@@ -127,6 +127,7 @@ class Server(object):
self.model_toolkit_conf = None
self.resource_conf = None
self.memory_optimization = False
self.ir_optimization = False
self.model_conf = None
self.workflow_fn = "workflow.prototxt"
self.resource_fn = "resource.prototxt"
......@@ -175,6 +176,9 @@ class Server(object):
def set_memory_optimize(self, flag=False):
self.memory_optimization = flag
def set_ir_optimize(self, flag=False):
self.ir_optimization = flag
def check_local_bin(self):
if "SERVING_BIN" in os.environ:
self.use_local_bin = True
......@@ -195,6 +199,7 @@ class Server(object):
engine.enable_batch_align = 0
engine.model_data_path = model_config_path
engine.enable_memory_optimization = self.memory_optimization
engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False
engine.force_update_static_cache = False
......@@ -244,7 +249,7 @@ class Server(object):
workflow_oi_config_path = None
if isinstance(model_config_paths, str):
# If there is only one model path, use the default infer_op.
# Because there are several infer_op type, we need to find
# Because there are several infer_op type, we need to find
# it from workflow_conf.
default_engine_names = [
'general_infer_0', 'general_dist_kv_infer_0',
......@@ -284,8 +289,8 @@ class Server(object):
# check config here
# print config here
def use_mkl(self):
self.mkl_flag = True
def use_mkl(self, flag):
self.mkl_flag = flag
def get_device_version(self):
avx_flag = False
......@@ -300,6 +305,10 @@ class Server(object):
else:
device_version = "serving-cpu-avx-openblas-"
else:
if mkl_flag:
print(
"Your CPU does not support AVX, server will running with noavx-openblas mode."
)
device_version = "serving-cpu-noavx-openblas-"
return device_version
......
......@@ -41,6 +41,9 @@ def parse_args(): # pylint: disable=doc-string-missing
"--device", type=str, default="cpu", help="Type of device")
parser.add_argument(
"--mem_optim", type=bool, default=False, help="Memory optimize")
parser.add_argument(
"--ir_optim", type=bool, default=False, help="Graph optimize")
parser.add_argument("--use_mkl", type=bool, default=False, help="Use MKL")
parser.add_argument(
"--max_body_size",
type=int,
......@@ -57,7 +60,9 @@ def start_standard_model(): # pylint: disable=doc-string-missing
workdir = args.workdir
device = args.device
mem_optim = args.mem_optim
ir_optim = args.ir_optim
max_body_size = args.max_body_size
use_mkl = args.use_mkl
if model == "":
print("You must specify your serving model")
......@@ -78,6 +83,8 @@ def start_standard_model(): # pylint: disable=doc-string-missing
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.use_mkl(use_mkl)
server.set_max_body_size(max_body_size)
server.set_port(port)
......
......@@ -47,6 +47,8 @@ def serve_args():
"--name", type=str, default="None", help="Default service name")
parser.add_argument(
"--mem_optim", type=bool, default=False, help="Memory optimize")
parser.add_argument(
"--ir_optim", type=bool, default=False, help="Graph optimize")
parser.add_argument(
"--max_body_size",
type=int,
......@@ -156,6 +158,7 @@ class Server(object):
self.model_toolkit_conf = None
self.resource_conf = None
self.memory_optimization = False
self.ir_optimization = False
self.model_conf = None
self.workflow_fn = "workflow.prototxt"
self.resource_fn = "resource.prototxt"
......@@ -204,6 +207,9 @@ class Server(object):
def set_memory_optimize(self, flag=False):
self.memory_optimization = flag
def set_ir_optimize(self, flag=False):
self.ir_optimization = flag
def check_local_bin(self):
if "SERVING_BIN" in os.environ:
self.use_local_bin = True
......@@ -240,6 +246,7 @@ class Server(object):
engine.enable_batch_align = 0
engine.model_data_path = model_config_path
engine.enable_memory_optimization = self.memory_optimization
engine.enable_ir_optimization = self.ir_optimization
engine.static_optimization = False
engine.force_update_static_cache = False
......
......@@ -35,6 +35,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
thread_num = args.thread
model = args.model
mem_optim = args.mem_optim
ir_optim = args.ir_optim
max_body_size = args.max_body_size
workdir = "{}_{}".format(args.workdir, gpuid)
......@@ -57,6 +58,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(thread_num)
server.set_memory_optimize(mem_optim)
server.set_ir_optimize(ir_optim)
server.set_max_body_size(max_body_size)
server.load_model_config(model)
......
......@@ -21,7 +21,7 @@ RUN yum -y install wget && \
wget https://www.python.org/ftp/python/2.7.5/Python-2.7.5.tgz && \
tar -zxf Python-2.7.5.tgz && \
cd Python-2.7.5 && \
./configure --prefix=/usr/local/python2.7 --enable-shared && \
./configure --prefix=/usr/local/python2.7 --enable-shared --enable-unicode=ucs4 && \
make all && make install && \
make clean && \
echo 'export PATH=/usr/local/python2.7/bin:$PATH' >> /root/.bashrc && \
......
......@@ -21,7 +21,7 @@ RUN yum -y install wget && \
wget https://www.python.org/ftp/python/2.7.5/Python-2.7.5.tgz && \
tar -zxf Python-2.7.5.tgz && \
cd Python-2.7.5 && \
./configure --prefix=/usr/local/python2.7 --enable-shared && \
./configure --prefix=/usr/local/python2.7 --enable-shared --enable-unicode=ucs4 && \
make all && make install && \
make clean && \
echo 'export PATH=/usr/local/python2.7/bin:$PATH' >> /root/.bashrc && \
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
import re
with open("setup.cfg", "w") as f:
line = "[bdist_wheel]\npython-tag={0}{1}\nplat-name=linux_x86_64".format(
get_abbr_impl(), get_impl_ver())
f.write(line)