diff --git a/README.md b/README.md index 747c140ded49f279c289b0bc8a3b4b1963243040..9d1ec854ba67d220a481816cda5eeebf2bc89739 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,9 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po | `port` | int | `9292` | Exposed port of current service to users| | `name` | str | `""` | Service name, can be used to generate HTTP request url | | `model` | str | `""` | Path of paddle model directory to be served | -| `mem_optim` | bool | `False` | Enable memory optimization | +| `mem_optim` | bool | `False` | Enable memory / graphic memory optimization | +| `ir_optim` | bool | `False` | Enable analysis and optimization of calculation graph | +| `use_mkl` (Only for cpu version) | bool | `False` | Run inference with MKL | Here, we use `curl` to send a HTTP POST request to the service we just started. Users can use any python library to send HTTP POST as well, e.g, [requests](https://requests.readthedocs.io/en/master/). diff --git a/README_CN.md b/README_CN.md index 266fca330d7597d6188fa0022e6376bc23149c74..0c30ef0cffea7d2940c544c55b641255108908fd 100644 --- a/README_CN.md +++ b/README_CN.md @@ -87,6 +87,8 @@ python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --po | `name` | str | `""` | Service name, can be used to generate HTTP request url | | `model` | str | `""` | Path of paddle model directory to be served | | `mem_optim` | bool | `False` | Enable memory optimization | +| `ir_optim` | bool | `False` | Enable analysis and optimization of calculation graph | +| `use_mkl` (Only for cpu version) | bool | `False` | Run inference with MKL | 我们使用 `curl` 命令来发送HTTP POST请求给刚刚启动的服务。用户也可以调用python库来发送HTTP POST请求,请参考英文文档 [requests](https://requests.readthedocs.io/en/master/)。 diff --git a/core/configure/proto/server_configure.proto b/core/configure/proto/server_configure.proto index 4bdc233099cffbc7949a6b5cf8627fe6461f565c..8956022685090c94be2037445c646e9fbffd1a5c 100644 --- a/core/configure/proto/server_configure.proto +++ b/core/configure/proto/server_configure.proto @@ -43,6 +43,7 @@ message EngineDesc { optional bool enable_memory_optimization = 13; optional bool static_optimization = 14; optional bool force_update_static_cache = 15; + optional bool enable_ir_optimization = 16; }; // model_toolkit conf diff --git a/core/general-client/src/general_model.cpp b/core/general-client/src/general_model.cpp index 86f75bc1c1b401cd14f2c6651ea52ef08fdb8c40..cab050e732fb701120c7f1a5c80737fc75282324 100644 --- a/core/general-client/src/general_model.cpp +++ b/core/general-client/src/general_model.cpp @@ -345,7 +345,7 @@ int PredictorClient::numpy_predict( PredictorRes &predict_res_batch, const int &pid) { int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size()); - + VLOG(2) << "batch size: " << batch_size; predict_res_batch.clear(); Timer timeline; int64_t preprocess_start = timeline.TimeStampUS(); @@ -462,7 +462,7 @@ int PredictorClient::numpy_predict( for (ssize_t j = 0; j < int_array.shape(1); j++) { for (ssize_t k = 0; k < int_array.shape(2); k++) { for (ssize_t l = 0; k < int_array.shape(3); l++) { - tensor->add_float_data(int_array(i, j, k, l)); + tensor->add_int64_data(int_array(i, j, k, l)); } } } @@ -474,7 +474,7 @@ int PredictorClient::numpy_predict( for (ssize_t i = 0; i < int_array.shape(0); i++) { for (ssize_t j = 0; j < int_array.shape(1); j++) { for (ssize_t k = 0; k < int_array.shape(2); k++) { - tensor->add_float_data(int_array(i, j, k)); + tensor->add_int64_data(int_array(i, j, k)); } } } 
@@ -484,7 +484,7 @@ int PredictorClient::numpy_predict( auto int_array = int_feed[vec_idx].unchecked<2>(); for (ssize_t i = 0; i < int_array.shape(0); i++) { for (ssize_t j = 0; j < int_array.shape(1); j++) { - tensor->add_float_data(int_array(i, j)); + tensor->add_int64_data(int_array(i, j)); } } break; @@ -492,7 +492,7 @@ int PredictorClient::numpy_predict( case 1: { auto int_array = int_feed[vec_idx].unchecked<1>(); for (ssize_t i = 0; i < int_array.shape(0); i++) { - tensor->add_float_data(int_array(i)); + tensor->add_int64_data(int_array(i)); } break; } diff --git a/core/predictor/framework/infer.h b/core/predictor/framework/infer.h index 4bb3be9ad2c3dc7ef94a32200b014325aceedf45..e8c0ff47d86f081516a35576655f843a28b0591b 100644 --- a/core/predictor/framework/infer.h +++ b/core/predictor/framework/infer.h @@ -35,6 +35,7 @@ class InferEngineCreationParams { InferEngineCreationParams() { _path = ""; _enable_memory_optimization = false; + _enable_ir_optimization = false; _static_optimization = false; _force_update_static_cache = false; } @@ -45,10 +46,16 @@ class InferEngineCreationParams { _enable_memory_optimization = enable_memory_optimization; } + void set_enable_ir_optimization(bool enable_ir_optimization) { + _enable_ir_optimization = enable_ir_optimization; + } + bool enable_memory_optimization() const { return _enable_memory_optimization; } + bool enable_ir_optimization() const { return _enable_ir_optimization; } + void set_static_optimization(bool static_optimization = false) { _static_optimization = static_optimization; } @@ -68,6 +75,7 @@ class InferEngineCreationParams { << "model_path = " << _path << ", " << "enable_memory_optimization = " << _enable_memory_optimization << ", " + << "enable_ir_optimization = " << _enable_ir_optimization << ", " << "static_optimization = " << _static_optimization << ", " << "force_update_static_cache = " << _force_update_static_cache; } @@ -75,6 +83,7 @@ class InferEngineCreationParams { private: std::string _path; bool _enable_memory_optimization; + bool _enable_ir_optimization; bool _static_optimization; bool _force_update_static_cache; }; @@ -150,6 +159,11 @@ class ReloadableInferEngine : public InferEngine { force_update_static_cache = conf.force_update_static_cache(); } + if (conf.has_enable_ir_optimization()) { + _infer_engine_params.set_enable_ir_optimization( + conf.enable_ir_optimization()); + } + _infer_engine_params.set_path(_model_data_path); if (enable_memory_optimization) { _infer_engine_params.set_enable_memory_optimization(true); diff --git a/doc/COMPILE.md b/doc/COMPILE.md index 41a79f082494b0ac22bb4479a5d246cdb6882a3d..f61ac061883581090087a2202e694c9a07468c5f 100644 --- a/doc/COMPILE.md +++ b/doc/COMPILE.md @@ -9,14 +9,18 @@ - Golang: 1.9.2 and later - Git:2.17.1 and later - CMake:3.2.2 and later -- Python:2.7.2 and later +- Python:2.7.2 and later / 3.6 and later It is recommended to use Docker for compilation. We have prepared the Paddle Serving compilation environment for you: - CPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-devel`,dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel) - GPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-gpu-devel`,dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel) -This document will take Python2 as an example to show how to compile Paddle Serving. If you want to compile with Python 3, just adjust the Python options of cmake. +This document will take Python2 as an example to show how to compile Paddle Serving. 
If you want to compile with Python3, just adjust the Python options of cmake: + +- Set `DPYTHON_INCLUDE_DIR` to `$PYTHONROOT/include/python3.6m/` +- Set `DPYTHON_LIBRARIES` to `$PYTHONROOT/lib64/libpython3.6.so` +- Set `DPYTHON_EXECUTABLE` to `$PYTHONROOT/bin/python3` ## Get Code @@ -54,6 +58,8 @@ make -j10 execute `make install` to put targets under directory `./output` +**Attention:** After the compilation is successful, you need to set the path of `SERVING_BIN`. See [Note](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE.md#Note) for details. + ## Compile Client ``` shell diff --git a/doc/COMPILE_CN.md b/doc/COMPILE_CN.md index eb334232d98f26e68d719d10cbe458a356738d2f..c6e5426f02335598277ceb40fafc5215c7f03b2b 100644 --- a/doc/COMPILE_CN.md +++ b/doc/COMPILE_CN.md @@ -9,14 +9,18 @@ - Golang: 1.9.2及以上 - Git:2.17.1及以上 - CMake:3.2.2及以上 -- Python:2.7.2及以上 +- Python:2.7.2及以上 / 3.6及以上 推荐使用Docker编译,我们已经为您准备好了Paddle Serving编译环境: - CPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-devel`,dockerfile: [Dockerfile.devel](../tools/Dockerfile.devel) - GPU: `hub.baidubce.com/paddlepaddle/serving:0.2.0-gpu-devel`,dockerfile: [Dockerfile.gpu.devel](../tools/Dockerfile.gpu.devel) -本文档将以Python2为例介绍如何编译Paddle Serving。如果您想用Python3进行编译,只需要调整cmake的Python相关选项即可。 +本文档将以Python2为例介绍如何编译Paddle Serving。如果您想用Python3进行编译,只需要调整cmake的Python相关选项即可: + +- 将`DPYTHON_INCLUDE_DIR`设置为`$PYTHONROOT/include/python3.6m/` +- 将`DPYTHON_LIBRARIES`设置为`$PYTHONROOT/lib64/libpython3.6.so` +- 将`DPYTHON_EXECUTABLE`设置为`$PYTHONROOT/bin/python3` ## 获取代码 @@ -54,6 +58,8 @@ make -j10 执行`make install`可以把目标产出放在`./output`目录下。 +**注意:** 编译成功后,需要设置`SERVING_BIN`路径,详见后面的[注意事项](https://github.com/PaddlePaddle/Serving/blob/develop/doc/COMPILE_CN.md#注意事项)。 + ## 编译Client部分 ``` shell diff --git a/doc/PERFORMANCE_OPTIM.md b/doc/PERFORMANCE_OPTIM.md new file mode 100644 index 0000000000000000000000000000000000000000..4b025e94d6f8d3ed69fb76898eb6afada9ca6613 --- /dev/null +++ b/doc/PERFORMANCE_OPTIM.md @@ -0,0 +1,18 @@ +# Performance optimization + +Due to different model structures, different prediction services consume different computing resources when performing predictions. For online prediction services, models that require fewer computing resources spend a larger share of their time on communication and are called communication-intensive services; models that require more computing resources spend more of their time on inference computation and are called computation-intensive services. + +For a prediction service, the easiest way to determine which type it belongs to is to look at the time breakdown. Paddle Serving provides a [Timeline tool](../python/examples/util/README_CN.md), which can intuitively display the time spent in each stage of the prediction service. + +For communication-intensive prediction services, requests can be aggregated: within a tolerable latency limit, multiple prediction requests can be combined into one batch for prediction. + +For computation-intensive prediction services, you can use GPU prediction services instead of CPU prediction services, or increase the number of graphics cards for GPU prediction services. + +Under the same conditions, the communication time of the HTTP prediction service provided by Paddle Serving is longer than that of the RPC prediction service, so for communication-intensive services, please give priority to using RPC communication.
+ +Parameters for performance optimization: + +| Parameters | Type | Default | Description | +| ---------- | ---- | ------- | ------------------------------------------------------------ | +| mem_optim | bool | False | Enable memory / graphic memory optimization | +| ir_optim | bool | False | Enable analysis and optimization of calculation graph, including OP fusion, etc. | diff --git a/doc/PERFORMANCE_OPTIM_CN.md b/doc/PERFORMANCE_OPTIM_CN.md index dd17bc8afab8472f8f55b4870f73e4c481e97cd3..7bd64d3e2d645c9328ead55e867d0b97946840ad 100644 --- a/doc/PERFORMANCE_OPTIM_CN.md +++ b/doc/PERFORMANCE_OPTIM_CN.md @@ -1,6 +1,6 @@ # 性能优化 -由于模型结构的不同,在执行预测时不同的预测对计算资源的消耗也不相同,对于在线的预测服务来说,对计算资源要求较少的模型,通信的时间成本占比就会较高,称为通信密集型服务,对计算资源要求较多的模型,推理计算的时间成本较高,称为计算密集型服务。对于这两种服务类型,可以根据实际需求采取不同的方式进行优化 +由于模型结构的不同,在执行预测时不同的预测服务对计算资源的消耗也不相同。对于在线的预测服务来说,对计算资源要求较少的模型,通信的时间成本占比就会较高,称为通信密集型服务,对计算资源要求较多的模型,推理计算的时间成本较高,称为计算密集型服务。对于这两种服务类型,可以根据实际需求采取不同的方式进行优化 对于一个预测服务来说,想要判断属于哪种类型,最简单的方法就是看时间占比,Paddle Serving提供了[Timeline工具](../python/examples/util/README_CN.md),可以直观的展现预测服务中各阶段的耗时。 @@ -10,4 +10,9 @@ 在相同条件下,Paddle Serving提供的HTTP预测服务的通信时间是大于RPC预测服务的,因此对于通信密集型的服务请优先考虑使用RPC的通信方式。 -对于模型较大,预测服务内存或显存占用较多的情况,可以通过将--mem_optim选项设置为True来开启内存/显存优化。 +性能优化相关参数: + +| 参数 | 类型 | 默认值 | 含义 | +| --------- | ---- | ------ | -------------------------------- | +| mem_optim | bool | False | 开启内存/显存优化 | +| ir_optim | bool | False | 开启计算图分析优化,包括OP融合等 | diff --git a/doc/SAVE.md b/doc/SAVE.md index c1e6b19a45c75a64207802984f52c734d44f8fc8..3f7f97e12e1e309ff0933e150ea7bcd23298b60e 100644 --- a/doc/SAVE.md +++ b/doc/SAVE.md @@ -1,8 +1,9 @@ -## How to save a servable model of Paddle Serving? +# How to save a servable model of Paddle Serving? ([简体中文](./SAVE_CN.md)|English) -- Currently, paddle serving provides a save_model interface for users to access, the interface is similar with `save_inference_model` of Paddle. +## Save from training or prediction script +Currently, Paddle Serving provides a save_model interface for users; it is similar to the `save_inference_model` interface of Paddle. ``` python import paddle_serving_client.io as serving_io serving_io.save_model("imdb_model", "imdb_client_conf", @@ -29,3 +30,15 @@ for line in sys.stdin: fetch_map = client.predict(feed=feed, fetch=fetch) print("{} {}".format(fetch_map["prediction"][1], label[0])) ``` + +## Export from saved model files +If you have saved model files using Paddle's `save_inference_model` API, you can use Paddle Serving's `inference_model_to_serving` API to convert them into model files that can be used by Paddle Serving. +``` +import paddle_serving_client.io as serving_io +serving_io.inference_model_to_serving(dirname, model_filename=None, params_filename=None, serving_server="serving_server", serving_client="serving_client") +``` +dirname (str) - Path of the saved model files. The program file and parameter files are stored in this directory. +model_filename (str, optional) - The name of the file that stores the inference program. If it is None, the default filename `__model__` will be used. Default: None. +params_filename (str, optional) - The name of the file that stores all parameters. It is only needed when all parameters were saved in a single binary file. If parameters were saved in separate files, set it to None. Default: None. +serving_server (str, optional) - The path of model files and configuration files for the server. Default: "serving_server". +serving_client (str, optional) - The path of configuration files for the client. Default: "serving_client".
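The `inference_model_to_serving` interface documented above can be driven end to end with a few lines. The following is a minimal sketch only: it assumes a model previously saved with Paddle's `save_inference_model` into a hypothetical `inference_model` directory with parameters stored as separate files, and it simply uses the documented default output directory names.

```python
# Minimal sketch of the conversion flow described in doc/SAVE.md above.
# "inference_model" is a hypothetical input directory, not a path from this PR.
import paddle_serving_client.io as serving_io

serving_io.inference_model_to_serving(
    dirname="inference_model",        # directory produced by save_inference_model
    model_filename=None,              # None: program stored in the default __model__ file
    params_filename=None,             # None: parameters stored as separate files
    serving_server="serving_server",  # output: server-side model files and config
    serving_client="serving_client")  # output: client-side config
```

The resulting `serving_server` directory can then be passed to `--model` when starting a serving instance.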
diff --git a/doc/SAVE_CN.md b/doc/SAVE_CN.md index 43b62c2ac623b386505356194ac136ea305fe683..fc75cd8d015a6d6f42a08f29e4035db20f450d91 100644 --- a/doc/SAVE_CN.md +++ b/doc/SAVE_CN.md @@ -1,8 +1,9 @@ -## 怎样保存用于Paddle Serving的模型? +# 怎样保存用于Paddle Serving的模型? (简体中文|[English](./SAVE.md)) -- 目前,Paddle Serving提供了一个save_model接口供用户访问,该接口与Paddle的`save_inference_model`类似。 +## 从训练或预测脚本中保存 +目前,Paddle Serving提供了一个save_model接口供用户访问,该接口与Paddle的`save_inference_model`类似。 ``` python import paddle_serving_client.io as serving_io @@ -29,3 +30,15 @@ for line in sys.stdin: fetch_map = client.predict(feed=feed, fetch=fetch) print("{} {}".format(fetch_map["prediction"][1], label[0])) ``` + +## 从已保存的模型文件中导出 +如果已使用Paddle 的`save_inference_model`接口保存出预测要使用的模型,则可以通过Paddle Serving的`inference_model_to_serving`接口转换成可用于Paddle Serving的模型文件。 +``` +import paddle_serving_client.io as serving_io +serving_io.inference_model_to_serving(dirname, model_filename=None, params_filename=None, serving_server="serving_server", serving_client="serving_client") +``` +dirname (str) – 需要转换的模型文件存储路径,Program结构文件和参数文件均保存在此目录。 +model_filename (str,可选) – 存储需要转换的模型Inference Program结构的文件名称。如果设置为None,则使用 __model__ 作为默认的文件名。默认值为None。 +params_filename (str,可选) – 存储需要转换的模型所有参数的文件名称。当且仅当所有模型参数被保存在一个单独的二进制文件中,它才需要被指定。如果模型参数是存储在各自分离的文件中,设置它的值为None。默认值为None。 +serving_server (str, 可选) - 转换后的模型文件和配置文件的存储路径。默认值为"serving_server"。 +serving_client (str, 可选) - 转换后的客户端配置文件存储路径。默认值为"serving_client"。 diff --git a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h index 24148e374e51cb42cb0d8d1423e0ca009e9e8294..a4d8dda71a7977185106bb1552cb8f39ef6bc50e 100644 --- a/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h +++ b/paddle_inference/inferencer-fluid-cpu/include/fluid_cpu_engine.h @@ -194,6 +194,12 @@ class FluidCpuAnalysisDirCore : public FluidFamilyCore { analysis_config.EnableMemoryOptim(); } + if (params.enable_ir_optimization()) { + analysis_config.SwitchIrOptim(true); + } else { + analysis_config.SwitchIrOptim(false); + } + AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor(analysis_config); diff --git a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h index a3fa365444a40d505b16b22e702d4a8b69699073..2fc6ae587ff26f5f05ff9332f08067ab49d06254 100644 --- a/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h +++ b/paddle_inference/inferencer-fluid-gpu/include/fluid_gpu_engine.h @@ -198,6 +198,12 @@ class FluidGpuAnalysisDirCore : public FluidFamilyCore { analysis_config.EnableMemoryOptim(); } + if (params.enable_ir_optimization()) { + analysis_config.SwitchIrOptim(true); + } else { + analysis_config.SwitchIrOptim(false); + } + AutoLock lock(GlobalPaddleCreateMutex::instance()); _core = paddle::CreatePaddlePredictor(analysis_config); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c1590fb1b36de669f89711f95c4d49aedadb0c91..07699da458ab62ad1a5b9ece83547799d08f8cf7 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -19,6 +19,8 @@ endif() if (CLIENT) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.client.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../tools/python_tag.py + ${CMAKE_CURRENT_BINARY_DIR}/python_tag.py) endif() if (APP) @@ -43,7 +45,8 @@ if (APP) add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r 
${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_app/ ${PADDLE_SERVING_BINARY_DIR}/python/ - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel) + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + DEPENDS ${SERVING_APP_CORE} general_model_config_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_SERVING_BINARY_DIR}/.timestamp) endif() @@ -52,6 +55,7 @@ add_custom_command( OUTPUT ${PADDLE_SERVING_BINARY_DIR}/.timestamp COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_serving_client/ ${PADDLE_SERVING_BINARY_DIR}/python/ COMMAND ${CMAKE_COMMAND} -E copy ${SERVING_CLIENT_CORE} ${PADDLE_SERVING_BINARY_DIR}/python/paddle_serving_client/serving_client.so + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} python_tag.py COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel DEPENDS ${SERVING_CLIENT_CORE} sdk_configure_py_proto ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS serving_client ${PADDLE_SERVING_BINARY_DIR}/.timestamp) diff --git a/python/examples/criteo_ctr_with_cube/README.md b/python/examples/criteo_ctr_with_cube/README.md index 056035a68335d88fef813b834538643b8cc04ea0..02125422af7e7ce53a05a1eff9a43159034a79dc 100755 --- a/python/examples/criteo_ctr_with_cube/README.md +++ b/python/examples/criteo_ctr_with_cube/README.md @@ -2,16 +2,6 @@ ([简体中文](./README_CN.md)|English) -### Compile Source Code -in the root directory of this git project -``` -mkdir build_server -cd build_server -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON .. -make -j10 -make install -j10 -``` - ### Get Sample Dataset go to directory `python/examples/criteo_ctr_with_cube` diff --git a/python/examples/criteo_ctr_with_cube/README_CN.md b/python/examples/criteo_ctr_with_cube/README_CN.md index 2ba22770b770f318ba0c3ef503c34571d2976e8f..3b6f812ca53bd435e9b11b59e2a459c46ee3f864 100644 --- a/python/examples/criteo_ctr_with_cube/README_CN.md +++ b/python/examples/criteo_ctr_with_cube/README_CN.md @@ -1,16 +1,6 @@ ## 带稀疏参数索引服务的CTR预测服务 (简体中文|[English](./README.md)) -### 编译源代码 -在本项目的根目录下,执行 -``` -mkdir build_server -cd build_server -cmake -DPYTHON_INCLUDE_DIR=$PYTHONROOT/include/python2.7/ -DPYTHON_LIBRARIES=$PYTHONROOT/lib64/libpython2.7.so -DPYTHON_EXECUTABLE=$PYTHONROOT/bin/python -DSERVER=ON .. 
-make -j10 -make install -j10 -``` - ### 获取样例数据 进入目录 `python/examples/criteo_ctr_with_cube` ``` diff --git a/python/paddle_serving_app/local_predict.py b/python/paddle_serving_app/local_predict.py index 133aa4ccf32d29538d5b7032874f2c770e55e184..6620994165306a550204498e5185bb3aacca8ffd 100644 --- a/python/paddle_serving_app/local_predict.py +++ b/python/paddle_serving_app/local_predict.py @@ -71,6 +71,7 @@ class Debugger(object): if profile: config.enable_profile() config.set_cpu_math_library_num_threads(cpu_num) + config.switch_ir_optim(False) self.predictor = create_paddle_predictor(config) diff --git a/python/paddle_serving_client/__init__.py b/python/paddle_serving_client/__init__.py index 95d74303dce0d932806e02549ec7a56d2f562446..858f889bf257588cf43e3ab30b1b9330de479ba6 100644 --- a/python/paddle_serving_client/__init__.py +++ b/python/paddle_serving_client/__init__.py @@ -260,10 +260,16 @@ class Client(object): if i == 0: int_feed_names.append(key) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") int_shape.append(list(feed_i[key].shape)) else: int_shape.append(self.feed_shapes_[key]) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") #int_slot.append(np.reshape(feed_i[key], (-1)).tolist()) int_slot.append(feed_i[key]) self.has_numpy_input = True @@ -274,10 +280,16 @@ class Client(object): if i == 0: float_feed_names.append(key) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") float_shape.append(list(feed_i[key].shape)) else: float_shape.append(self.feed_shapes_[key]) if isinstance(feed_i[key], np.ndarray): + if key in self.lod_tensor_set: + raise ValueError( + "LodTensor var can not be ndarray type.") #float_slot.append(np.reshape(feed_i[key], (-1)).tolist()) float_slot.append(feed_i[key]) self.has_numpy_input = True diff --git a/python/paddle_serving_client/io/__init__.py b/python/paddle_serving_client/io/__init__.py index 74a6ca871b5c1e32b3c1ecbc6656c95d7c78a399..4f174866e5521577ba35f39216f7dd0793879a6c 100644 --- a/python/paddle_serving_client/io/__init__.py +++ b/python/paddle_serving_client/io/__init__.py @@ -103,17 +103,21 @@ def save_model(server_model_folder, fout.write(config.SerializeToString()) -def inference_model_to_serving(infer_model, serving_client, serving_server): +def inference_model_to_serving(dirname, + model_filename=None, + params_filename=None, + serving_server="serving_server", + serving_client="serving_client"): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_program, feed_target_names, fetch_targets = \ - fluid.io.load_inference_model(dirname=infer_model, executor=exe) + fluid.io.load_inference_model(dirname=dirname, executor=exe, model_filename=model_filename, params_filename=params_filename) feed_dict = { x: inference_program.global_block().var(x) for x in feed_target_names } fetch_dict = {x.name: x for x in fetch_targets} - save_model(serving_client, serving_server, feed_dict, fetch_dict, + save_model(serving_server, serving_client, feed_dict, fetch_dict, inference_program) feed_names = feed_dict.keys() fetch_names = fetch_dict.keys() diff --git a/python/paddle_serving_server/__init__.py b/python/paddle_serving_server/__init__.py index a58fb11ac3ee1fbe5086ae4381f6d6208c0c73ec..971359fca0df3a122b28889e0711c86364a1c45d 100644 --- a/python/paddle_serving_server/__init__.py +++ 
b/python/paddle_serving_server/__init__.py @@ -127,6 +127,7 @@ class Server(object): self.model_toolkit_conf = None self.resource_conf = None self.memory_optimization = False + self.ir_optimization = False self.model_conf = None self.workflow_fn = "workflow.prototxt" self.resource_fn = "resource.prototxt" @@ -175,6 +176,9 @@ class Server(object): def set_memory_optimize(self, flag=False): self.memory_optimization = flag + def set_ir_optimize(self, flag=False): + self.ir_optimization = flag + def check_local_bin(self): if "SERVING_BIN" in os.environ: self.use_local_bin = True @@ -195,6 +199,7 @@ class Server(object): engine.enable_batch_align = 0 engine.model_data_path = model_config_path engine.enable_memory_optimization = self.memory_optimization + engine.enable_ir_optimization = self.ir_optimization engine.static_optimization = False engine.force_update_static_cache = False @@ -244,7 +249,7 @@ class Server(object): workflow_oi_config_path = None if isinstance(model_config_paths, str): # If there is only one model path, use the default infer_op. - # Because there are several infer_op type, we need to find + # Because there are several infer_op type, we need to find # it from workflow_conf. default_engine_names = [ 'general_infer_0', 'general_dist_kv_infer_0', @@ -284,8 +289,8 @@ class Server(object): # check config here # print config here - def use_mkl(self): - self.mkl_flag = True + def use_mkl(self, flag): + self.mkl_flag = flag def get_device_version(self): avx_flag = False @@ -300,6 +305,10 @@ class Server(object): else: device_version = "serving-cpu-avx-openblas-" else: + if mkl_flag: + print( + "Your CPU does not support AVX, the server will run in noavx-openblas mode." + ) device_version = "serving-cpu-noavx-openblas-" return device_version diff --git a/python/paddle_serving_server/serve.py b/python/paddle_serving_server/serve.py index 395177a8c77e5c608c2e0364b1d43ac534172d66..70aafbf5c3da4d1a2a8ec50ce5a2258383863057 100644 --- a/python/paddle_serving_server/serve.py +++ b/python/paddle_serving_server/serve.py @@ -41,6 +41,9 @@ def parse_args(): # pylint: disable=doc-string-missing "--device", type=str, default="cpu", help="Type of device") parser.add_argument( "--mem_optim", type=bool, default=False, help="Memory optimize") + parser.add_argument( + "--ir_optim", type=bool, default=False, help="Graph optimize") + parser.add_argument("--use_mkl", type=bool, default=False, help="Use MKL") parser.add_argument( "--max_body_size", type=int, @@ -57,7 +60,9 @@ def start_standard_model(): # pylint: disable=doc-string-missing workdir = args.workdir device = args.device mem_optim = args.mem_optim + ir_optim = args.ir_optim max_body_size = args.max_body_size + use_mkl = args.use_mkl if model == "": print("You must specify your serving model") @@ -78,6 +83,8 @@ def start_standard_model(): # pylint: disable=doc-string-missing server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.set_memory_optimize(mem_optim) + server.set_ir_optimize(ir_optim) + server.use_mkl(use_mkl) server.set_max_body_size(max_body_size) server.set_port(port) diff --git a/python/paddle_serving_server_gpu/__init__.py b/python/paddle_serving_server_gpu/__init__.py index 5fa4f010f2112bd400b81ba2f616e4ebe963a810..5a06bd712a836617047b0cc947956fc5d2213daa 100644 --- a/python/paddle_serving_server_gpu/__init__.py +++ b/python/paddle_serving_server_gpu/__init__.py @@ -47,6 +47,8 @@ def serve_args(): "--name", type=str, default="None", help="Default service name") parser.add_argument(
"--mem_optim", type=bool, default=False, help="Memory optimize") + parser.add_argument( + "--ir_optim", type=bool, default=False, help="Graph optimize") parser.add_argument( "--max_body_size", type=int, @@ -156,6 +158,7 @@ class Server(object): self.model_toolkit_conf = None self.resource_conf = None self.memory_optimization = False + self.ir_optimization = False self.model_conf = None self.workflow_fn = "workflow.prototxt" self.resource_fn = "resource.prototxt" @@ -204,6 +207,9 @@ class Server(object): def set_memory_optimize(self, flag=False): self.memory_optimization = flag + def set_ir_optimize(self, flag=False): + self.ir_optimization = flag + def check_local_bin(self): if "SERVING_BIN" in os.environ: self.use_local_bin = True @@ -240,6 +246,7 @@ class Server(object): engine.enable_batch_align = 0 engine.model_data_path = model_config_path engine.enable_memory_optimization = self.memory_optimization + engine.enable_ir_optimization = self.ir_optimization engine.static_optimization = False engine.force_update_static_cache = False diff --git a/python/paddle_serving_server_gpu/serve.py b/python/paddle_serving_server_gpu/serve.py index 512b5ec0a7d15a030afdcaa5e8daa344b29fb96e..297ff25d2084bead186fa4b9037e5de8282df0fe 100644 --- a/python/paddle_serving_server_gpu/serve.py +++ b/python/paddle_serving_server_gpu/serve.py @@ -35,6 +35,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss thread_num = args.thread model = args.model mem_optim = args.mem_optim + ir_optim = args.ir_optim max_body_size = args.max_body_size workdir = "{}_{}".format(args.workdir, gpuid) @@ -57,6 +58,7 @@ def start_gpu_card_model(index, gpuid, args): # pylint: disable=doc-string-miss server.set_op_sequence(op_seq_maker.get_op_sequence()) server.set_num_threads(thread_num) server.set_memory_optimize(mem_optim) + server.set_ir_optimize(ir_optim) server.set_max_body_size(max_body_size) server.load_model_config(model) diff --git a/tools/Dockerfile.centos6.devel b/tools/Dockerfile.centos6.devel index dd519a0a08bd9fc02b7ad51d248912c2a22a811d..dd5a2ef786ed8a9c239a99cabbcfe2d482e6341c 100644 --- a/tools/Dockerfile.centos6.devel +++ b/tools/Dockerfile.centos6.devel @@ -21,7 +21,7 @@ RUN yum -y install wget && \ wget https://www.python.org/ftp/python/2.7.5/Python-2.7.5.tgz && \ tar -zxf Python-2.7.5.tgz && \ cd Python-2.7.5 && \ - ./configure --prefix=/usr/local/python2.7 --enable-shared && \ + ./configure --prefix=/usr/local/python2.7 --enable-shared --enable-unicode=ucs4 && \ make all && make install && \ make clean && \ echo 'export PATH=/usr/local/python2.7/bin:$PATH' >> /root/.bashrc && \ diff --git a/tools/Dockerfile.centos6.gpu.devel b/tools/Dockerfile.centos6.gpu.devel index 3288f09d4cacc8aa7fa0bd112dc6bf97939ecde5..c34780c151e960134af5f8b448e0465b8285e8b2 100644 --- a/tools/Dockerfile.centos6.gpu.devel +++ b/tools/Dockerfile.centos6.gpu.devel @@ -21,7 +21,7 @@ RUN yum -y install wget && \ wget https://www.python.org/ftp/python/2.7.5/Python-2.7.5.tgz && \ tar -zxf Python-2.7.5.tgz && \ cd Python-2.7.5 && \ - ./configure --prefix=/usr/local/python2.7 --enable-shared && \ + ./configure --prefix=/usr/local/python2.7 --enable-shared --enable-unicode=ucs4 && \ make all && make install && \ make clean && \ echo 'export PATH=/usr/local/python2.7/bin:$PATH' >> /root/.bashrc && \ diff --git a/tools/python_tag.py b/tools/python_tag.py new file mode 100644 index 0000000000000000000000000000000000000000..75947cff0b1b39d4c262a306bbe2bc878ae7d3ba --- /dev/null +++ b/tools/python_tag.py @@ -0,0 +1,20 @@ 
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag +import re +with open("setup.cfg", "w") as f: + line = "[bdist_wheel]\npython-tag={0}{1}\nplat-name=linux_x86_64".format( + get_abbr_impl(), get_impl_ver()) + f.write(line)
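The server-side options touched by this patch (`set_memory_optimize`, `set_ir_optimize`, `use_mkl`) can also be set from a standalone Python script instead of the `--mem_optim` / `--ir_optim` / `--use_mkl` flags. The sketch below is a minimal illustration that mirrors the general flow of `start_standard_model()` in `paddle_serving_server/serve.py`; the op pipeline, model directory, workdir and port are assumptions for the example, not values taken from this diff.

```python
# Hedged sketch of a standalone CPU serving script using the new options;
# it follows the same call sequence as the standard serve entry point.
from paddle_serving_server import OpMaker, OpSeqMaker, Server

op_maker = OpMaker()
read_op = op_maker.create('general_reader')
infer_op = op_maker.create('general_infer')
response_op = op_maker.create('general_response')

op_seq_maker = OpSeqMaker()
op_seq_maker.add_op(read_op)
op_seq_maker.add_op(infer_op)
op_seq_maker.add_op(response_op)

server = Server()
server.set_op_sequence(op_seq_maker.get_op_sequence())
server.set_num_threads(10)
server.set_memory_optimize(True)   # same effect as --mem_optim
server.set_ir_optimize(True)       # same effect as --ir_optim (graph analysis, OP fusion)
server.use_mkl(True)               # same effect as --use_mkl (CPU build only)
server.load_model_config("uci_housing_model")  # illustrative model directory
server.prepare_server(workdir="workdir", port=9292, device="cpu")
server.run_server()
```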