Merge branch 'develop' of https://github.com/bjjwwang/serving into develop

4b8c5995 · bjjwwang · a6b9d12e · 563421e7 · 4b8c5995 · 4b8c5995
7 changed files
--- a/README.md
+++ b/README.md
@@ -101,17 +101,17 @@ git clone https://github.com/PaddlePaddle/Serving
 install python dependencies
 ```
 cd Serving
-pip install -r python/requirements.txt
+pip3 install -r python/requirements.txt
 ```

 ```shell
-pip install paddle-serving-client==0.6.0
-pip install paddle-serving-server==0.6.0 # CPU
-pip install paddle-serving-app==0.6.0
-pip install paddle-serving-server-gpu==0.6.0.post102 #GPU with CUDA10.2 + TensorRT7
+pip3 install paddle-serving-client==0.6.0
+pip3 install paddle-serving-server==0.6.0 # CPU
+pip3 install paddle-serving-app==0.6.0
+pip3 install paddle-serving-server-gpu==0.6.0.post102 #GPU with CUDA10.2 + TensorRT7
 # DO NOT RUN ALL COMMANDS! check your GPU env and select the right one
-pip install paddle-serving-server-gpu==0.6.0.post101 # GPU with CUDA10.1 + TensorRT6
-pip install paddle-serving-server-gpu==0.6.0.post11 # GPU with CUDA10.1 + TensorRT7
+pip3 install paddle-serving-server-gpu==0.6.0.post101 # GPU with CUDA10.1 + TensorRT6
+pip3 install paddle-serving-server-gpu==0.6.0.post11 # GPU with CUDA10.1 + TensorRT7
 ```

 You may need to use a domestic mirror source (in China, you can use the Tsinghua mirror source, add `-i https://pypi.tuna.tsinghua.edu.cn/simple` to pip command) to speed up the download.
@@ -129,10 +129,10 @@ Recommended to install paddle >= 2.1.0

 ```
 # CPU users, please run
-pip install paddlepaddle==2.1.0
+pip3 install paddlepaddle==2.1.0

 # GPU Cuda10.2 please run
-pip install paddlepaddle-gpu==2.1.0 
+pip3 install paddlepaddle-gpu==2.1.0 
 ```

 **Note**: If your Cuda version is not 10.2, please do not execute the above commands directly, you need to refer to [Paddle official documentation-multi-version whl package list
@@ -141,7 +141,7 @@ pip install paddlepaddle-gpu==2.1.0
 Select the url link of the corresponding GPU environment and install it. For example, for Python3.6 users of Cuda 10.1, please select `cp36-cp36m` and
 The url corresponding to `cuda10.1-cudnn7-mkl-gcc8.2-avx-trt6.0.1.5`, copy it and run
 ```
-pip install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.0-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddlepaddle_gpu-2.1.0.post101-cp36-cp36m-linux_x86_64.whl
+pip3 install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.0-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddlepaddle_gpu-2.1.0.post101-cp36-cp36m-linux_x86_64.whl
 ```

 the default `paddlepaddle-gpu==2.1.0` is Cuda 10.2 with no TensorRT. If you want to install PaddlePaddle with TensorRT. please also check the documentation-multi-version whl package list and find key word `cuda10.2-cudnn8.0-trt7.1.3`. More info please check [Paddle Serving uses TensorRT](./doc/TENSOR_RT.md)
@@ -169,7 +169,7 @@ Paddle Serving provides HTTP and RPC based service for users to access

 A user can also start a RPC service with `paddle_serving_server.serve`. RPC service is usually faster than HTTP service, although a user needs to do some coding based on Paddle Serving's python client API. Note that we do not specify `--name` here. 
 ``` shell
-python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
 ```
 <center>

@@ -209,7 +209,7 @@ Here, `client.predict` function has two arguments. `feed` is a `python dict` wit
 Users can also put the data format processing logic on the server side, so that they can directly use curl to access the service, refer to the following case whose path is `python/examples/fit_a_line`

 ```
-python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci
 ```
 for client side,
 ```
@@ -225,22 +225,22 @@ Paddle Serving provides industry-leading multi-model tandem services, which stro

 we get two models
 ```
-python -m paddle_serving_app.package --get_model ocr_rec
+python3 -m paddle_serving_app.package --get_model ocr_rec
 tar -xzvf ocr_rec.tar.gz
-python -m paddle_serving_app.package --get_model ocr_det
+python3 -m paddle_serving_app.package --get_model ocr_det
 tar -xzvf ocr_det.tar.gz
 ```
 then we start server side, launch two models as one standalone web service
 ```
-python web_service.py
+python3 web_service.py
 ```
 http request
 ```
-python pipeline_http_client.py
+python3 pipeline_http_client.py
 ```
 grpc request
 ```
-python pipeline_rpc_client.py
+python3 pipeline_rpc_client.py
 ```
 output
 ```
@@ -259,7 +259,7 @@ output

 ### Developers
 - [How to deploy Paddle Serving on K8S?(Chinese)](doc/PADDLE_SERVING_ON_KUBERNETES.md)
- [How to route Paddle Serving to secure endpoint?(Chinese)](doc/SERVIING_AUTH_DOCKER.md)
+- [How to route Paddle Serving to secure endpoint?(Chinese)](doc/SERVING_AUTH_DOCKER.md)
 - [How to develop a new Web Service?](doc/NEW_WEB_SERVICE.md)
 - [Compile from source code](doc/COMPILE.md)
 - [Develop Pipeline Serving](doc/PIPELINE_SERVING.md)

--- a/README_CN.md
+++ b/README_CN.md
@@ -103,17 +103,17 @@ git clone https://github.com/PaddlePaddle/Serving
 安装所需的pip依赖
 ```
 cd Serving
-pip install -r python/requirements.txt
+pip3 install -r python/requirements.txt
 ```

 ```shell
-pip install paddle-serving-client==0.6.0
-pip install paddle-serving-server==0.6.0 # CPU
-pip install paddle-serving-app==0.6.0
-pip install paddle-serving-server-gpu==0.6.0.post102 #GPU with CUDA10.2 + TensorRT7
+pip3 install paddle-serving-client==0.6.0
+pip3 install paddle-serving-server==0.6.0 # CPU
+pip3 install paddle-serving-app==0.6.0
+pip3 install paddle-serving-server-gpu==0.6.0.post102 #GPU with CUDA10.2 + TensorRT7
 # 其他GPU环境需要确认环境再选择执行哪一条
-pip install paddle-serving-server-gpu==0.6.0.post101 # GPU with CUDA10.1 + TensorRT6
-pip install paddle-serving-server-gpu==0.6.0.post11 # GPU with CUDA10.1 + TensorRT7
+pip3 install paddle-serving-server-gpu==0.6.0.post101 # GPU with CUDA10.1 + TensorRT6
+pip3 install paddle-serving-server-gpu==0.6.0.post11 # GPU with CUDA10.1 + TensorRT7
 ```

 您可能需要使用国内镜像源（例如清华源, 在pip命令中添加`-i https://pypi.tuna.tsinghua.edu.cn/simple`）来加速下载。
@@ -130,17 +130,17 @@ paddle-serving-client和paddle-serving-app安装包支持Linux和Windows，其

 ```
 # CPU环境请执行
-pip install paddlepaddle==2.1.0
+pip3 install paddlepaddle==2.1.0

 # GPU Cuda10.2环境请执行
-pip install paddlepaddle-gpu==2.1.0
+pip3 install paddlepaddle-gpu==2.1.0
 ```

 **注意**： 如果您的Cuda版本不是10.2，请勿直接执行上述命令，需要参考[Paddle官方文档-多版本whl包列表](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-release)

 选择相应的GPU环境的url链接并进行安装，例如Cuda 10.1的Python3.6用户，请选择表格当中的`cp36-cp36m`和`cuda10.1-cudnn7-mkl-gcc8.2-avx-trt6.0.1.5`对应的url，复制下来并执行
 ```
-pip install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.0-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddlepaddle_gpu-2.1.0.post101-cp36-cp36m-linux_x86_64.whl
+pip3 install https://paddle-wheel.bj.bcebos.com/with-trt/2.1.0-gpu-cuda10.1-cudnn7-mkl-gcc8.2/paddlepaddle_gpu-2.1.0.post101-cp36-cp36m-linux_x86_64.whl
 ```
 由于默认的`paddlepaddle-gpu==2.1.0`是Cuda 10.2，并没有联编TensorRT，因此如果需要和在`paddlepaddle-gpu`上使用TensorRT，需要在上述多版本whl包列表当中，找到`cuda10.2-cudnn8.0-trt7.1.3`，下载对应的Python版本。更多信息请参考[如何使用TensorRT?](doc/TENSOR_RT_CN.md)。

@@ -168,7 +168,7 @@ Paddle Serving 为用户提供了基于 HTTP 和 RPC 的服务
 用户还可以使用`paddle_serving_server.serve`启动RPC服务。 尽管用户需要基于Paddle Serving的python客户端API进行一些开发，但是RPC服务通常比HTTP服务更快。需要指出的是这里我们没有指定`--name`。

 ``` shell
-python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292
 ```
 <center>

@@ -209,7 +209,7 @@ print(fetch_map)
 用户也可以将数据格式处理逻辑放在服务器端进行，这样就可以直接用curl去访问服务，参考如下案例，在目录`python/examples/fit_a_line`.

 ```
-python -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci
+python3 -m paddle_serving_server.serve --model uci_housing_model --thread 10 --port 9292 --name uci
 ```
 客户端输入
 ```
@@ -226,22 +226,22 @@ Paddle Serving提供业界领先的多模型串联服务，强力支持各大公

 我们先获取两个模型
 ```
-python -m paddle_serving_app.package --get_model ocr_rec
+python3 -m paddle_serving_app.package --get_model ocr_rec
 tar -xzvf ocr_rec.tar.gz
-python -m paddle_serving_app.package --get_model ocr_det
+python3 -m paddle_serving_app.package --get_model ocr_det
 tar -xzvf ocr_det.tar.gz
 ```
 然后启动服务端程序，将两个串联的模型作为一个整体的服务。
 ```
-python web_service.py
+python3 web_service.py
 ```
 最终使用http的方式请求
 ```
-python pipeline_http_client.py
+python3 pipeline_http_client.py
 ```
 也支持rpc的方式
 ```
-python pipeline_rpc_client.py
+python3 pipeline_rpc_client.py
 ```
 输出
 ```
@@ -262,7 +262,7 @@ python pipeline_rpc_client.py
 - [如何编译PaddleServing?](doc/COMPILE_CN.md)
 - [如何开发Pipeline?](doc/PIPELINE_SERVING_CN.md)
 - [如何在K8S集群上部署Paddle Serving?](doc/PADDLE_SERVING_ON_KUBERNETES.md)
- [如何在Paddle Serving上部署安全网关?](doc/SERVIING_AUTH_DOCKER.md)
+- [如何在Paddle Serving上部署安全网关?](doc/SERVING_AUTH_DOCKER.md)
 - [如何开发Pipeline?](doc/PIPELINE_SERVING_CN.md)
 - [如何使用uWSGI部署Web Service](doc/UWSGI_DEPLOY_CN.md)
 - [如何实现模型文件热加载](doc/HOT_LOADING_IN_SERVING_CN.md)

--- a/core/general-client/src/general_model.cpp
+++ b/core/general-client/src/general_model.cpp
@@ -18,7 +18,6 @@
 #include "core/sdk-cpp/include/common.h"
 #include "core/sdk-cpp/include/predictor_sdk.h"
 #include "core/util/include/timer.h"
-
 DEFINE_bool(profile_client, false, "");
 DEFINE_bool(profile_server, false, "");

@@ -46,7 +45,7 @@ void PredictorClient::init_gflags(std::vector<std::string> argv) {
    int argc = argv.size();
    char **arr = new char *[argv.size()];
    std::string line;
-    for (size_t i = 0; i < argv.size(); i++) {
+    for (size_t i = 0; i < argv.size(); ++i) {
      arr[i] = &argv[i][0];
      line += argv[i];
      line += ' ';
@@ -90,9 +89,10 @@ int PredictorClient::init(const std::vector<std::string> &conf_file) {

    if (conf_file.size() > 1) {
      model_config.Clear();
-      if (configure::read_proto_conf(conf_file[conf_file.size()-1].c_str(), &model_config) != 0) {
+      if (configure::read_proto_conf(conf_file[conf_file.size() - 1].c_str(),
+                                     &model_config) != 0) {
        LOG(ERROR) << "Failed to load general model config"
-                  << ", file path: " << conf_file[conf_file.size()-1];
+                   << ", file path: " << conf_file[conf_file.size() - 1];
        return -1;
      }
    }
@@ -154,16 +154,17 @@ int PredictorClient::numpy_predict(
    const std::vector<std::string> &int_feed_name,
    const std::vector<std::vector<int>> &int_shape,
    const std::vector<std::vector<int>> &int_lod_slot_batch,
-    const std::vector<std::vector<std::string>>& string_feed_batch,
-    const std::vector<std::string>& string_feed_name,
-    const std::vector<std::vector<int>>& string_shape,
-    const std::vector<std::vector<int>>& string_lod_slot_batch,
+    const std::vector<std::vector<std::string>> &string_feed_batch,
+    const std::vector<std::string> &string_feed_name,
+    const std::vector<std::vector<int>> &string_shape,
+    const std::vector<std::vector<int>> &string_lod_slot_batch,
    const std::vector<std::string> &fetch_name,
    PredictorRes &predict_res_batch,
    const int &pid,
    const uint64_t log_id) {
  int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());
-  batch_size = batch_size > string_feed_batch.size() ? batch_size : string_feed_batch.size();
+  batch_size = batch_size > string_feed_batch.size() ? batch_size
+                                                     : string_feed_batch.size();
  VLOG(2) << "batch size: " << batch_size;
  predict_res_batch.clear();
  Timer timeline;
@@ -187,7 +188,6 @@ int PredictorClient::numpy_predict(
  }

  int vec_idx = 0;
-
  for (int bi = 0; bi < batch_size; bi++) {
    VLOG(2) << "prepare batch " << bi;
    std::vector<Tensor *> tensor_vec;
@@ -207,7 +207,8 @@ int PredictorClient::numpy_predict(
      tensor_vec.push_back(inst->add_tensor_array());
    }

-    VLOG(2) << "batch [" << bi << "] " << "prepared";
+    VLOG(2) << "batch [" << bi << "] "
+            << "prepared";

    vec_idx = 0;
    for (auto &name : float_feed_name) {
@@ -216,7 +217,11 @@ int PredictorClient::numpy_predict(
        LOG(ERROR) << "idx > tensor_vec.size()";
        return -1;
      }
+      int nbytes = float_feed[vec_idx].nbytes();
+      void *rawdata_ptr = (void *)(float_feed[vec_idx].data(0));
+      int total_number = float_feed[vec_idx].size();
      Tensor *tensor = tensor_vec[idx];
+
      VLOG(2) << "prepare float feed " << name << " shape size "
              << float_shape[vec_idx].size();
      for (uint32_t j = 0; j < float_shape[vec_idx].size(); ++j) {
@@ -226,52 +231,12 @@ int PredictorClient::numpy_predict(
        tensor->add_lod(float_lod_slot_batch[vec_idx][j]);
      }
      tensor->set_elem_type(P_FLOAT32);
-      const int float_shape_size = float_shape[vec_idx].size();
-      switch (float_shape_size) {
-        case 4: {
-          auto float_array = float_feed[vec_idx].unchecked<4>();
-          for (ssize_t i = 0; i < float_array.shape(0); i++) {
-            for (ssize_t j = 0; j < float_array.shape(1); j++) {
-              for (ssize_t k = 0; k < float_array.shape(2); k++) {
-                for (ssize_t l = 0; l < float_array.shape(3); l++) {
-                  tensor->add_float_data(float_array(i, j, k, l));
-                }
-              }
-            }
-          }
-          break;
-        }
-        case 3: {
-          auto float_array = float_feed[vec_idx].unchecked<3>();
-          for (ssize_t i = 0; i < float_array.shape(0); i++) {
-            for (ssize_t j = 0; j < float_array.shape(1); j++) {
-              for (ssize_t k = 0; k < float_array.shape(2); k++) {
-                tensor->add_float_data(float_array(i, j, k));
-              }
-            }
-          }
-          break;
-        }
-        case 2: {
-          auto float_array = float_feed[vec_idx].unchecked<2>();
-          for (ssize_t i = 0; i < float_array.shape(0); i++) {
-            for (ssize_t j = 0; j < float_array.shape(1); j++) {
-              tensor->add_float_data(float_array(i, j));
-            }
-          }
-          break;
-        }
-        case 1: {
-          auto float_array = float_feed[vec_idx].unchecked<1>();
-          for (ssize_t i = 0; i < float_array.shape(0); i++) {
-            tensor->add_float_data(float_array(i));
-          }
-          break;
-        }
-      }
+
+      tensor->mutable_float_data()->Resize(total_number, 0);
+      memcpy(tensor->mutable_float_data()->mutable_data(), rawdata_ptr, nbytes);
      vec_idx++;
    }
-    
+
    VLOG(2) << "batch [" << bi << "] "
            << "float feed value prepared";

@@ -283,6 +248,9 @@ int PredictorClient::numpy_predict(
        return -1;
      }
      Tensor *tensor = tensor_vec[idx];
+      int nbytes = int_feed[vec_idx].nbytes();
+      void *rawdata_ptr = (void *)(int_feed[vec_idx].data(0));
+      int total_number = int_feed[vec_idx].size();

      for (uint32_t j = 0; j < int_shape[vec_idx].size(); ++j) {
        tensor->add_shape(int_shape[vec_idx][j]);
@@ -293,71 +261,12 @@ int PredictorClient::numpy_predict(
      tensor->set_elem_type(_type[idx]);

      if (_type[idx] == P_INT64) {
-        VLOG(2) << "prepare int feed " << name << " shape size "
-                << int_shape[vec_idx].size();
+        tensor->mutable_int64_data()->Resize(total_number, 0);
+        memcpy(
+            tensor->mutable_int64_data()->mutable_data(), rawdata_ptr, nbytes);
      } else {
-        VLOG(2) << "prepare int32 feed " << name << " shape size "
-                << int_shape[vec_idx].size();
-      }
-
-      const int int_shape_size = int_shape[vec_idx].size();
-      switch (int_shape_size) {
-        case 4: {
-          auto int_array = int_feed[vec_idx].unchecked<4>();
-          for (ssize_t i = 0; i < int_array.shape(0); i++) {
-            for (ssize_t j = 0; j < int_array.shape(1); j++) {
-              for (ssize_t k = 0; k < int_array.shape(2); k++) {
-                for (ssize_t l = 0; k < int_array.shape(3); l++) {
-                  if (_type[idx] == P_INT64) {
-                    tensor->add_int64_data(int_array(i, j, k, l));
-                  } else {
-                    tensor->add_int_data(int_array(i, j, k, l));
-                  }
-                }
-              }
-            }
-          }
-          break;
-        }
-        case 3: {
-          auto int_array = int_feed[vec_idx].unchecked<3>();
-          for (ssize_t i = 0; i < int_array.shape(0); i++) {
-            for (ssize_t j = 0; j < int_array.shape(1); j++) {
-              for (ssize_t k = 0; k < int_array.shape(2); k++) {
-                if (_type[idx] == P_INT64) {
-                  tensor->add_int64_data(int_array(i, j, k));
-                } else {
-                  tensor->add_int_data(int_array(i, j, k));
-                }
-              }
-            }
-          }
-          break;
-        }
-        case 2: {
-          auto int_array = int_feed[vec_idx].unchecked<2>();
-          for (ssize_t i = 0; i < int_array.shape(0); i++) {
-            for (ssize_t j = 0; j < int_array.shape(1); j++) {
-              if (_type[idx] == P_INT64) {
-                tensor->add_int64_data(int_array(i, j));
-              } else {
-                tensor->add_int_data(int_array(i, j));
-              }
-            }
-          }
-          break;
-        }
-        case 1: {
-          auto int_array = int_feed[vec_idx].unchecked<1>();
-          for (ssize_t i = 0; i < int_array.shape(0); i++) {
-            if (_type[idx] == P_INT64) {
-              tensor->add_int64_data(int_array(i));
-            } else {
-              tensor->add_int_data(int_array(i));
-            }
-          }
-          break;
-        }
+        tensor->mutable_int_data()->Resize(total_number, 0);
+        memcpy(tensor->mutable_int_data()->mutable_data(), rawdata_ptr, nbytes);
      }
      vec_idx++;
    }
@@ -383,10 +292,11 @@ int PredictorClient::numpy_predict(
      tensor->set_elem_type(P_STRING);

      const int string_shape_size = string_shape[vec_idx].size();
-      //string_shape[vec_idx] = [1];cause numpy has no datatype of string.
-      //we pass string via vector<vector<string> >.
+      // string_shape[vec_idx] is expected to be [1], because numpy has no
+      // string datatype; strings are passed via vector<vector<string>> instead.
      if (string_shape_size != 1) {
-        LOG(ERROR) << "string_shape_size should be 1-D, but received is : " << string_shape_size;
+        LOG(ERROR) << "string_shape_size should be 1-D, but received is : "
+                   << string_shape_size;
        return -1;
      }
      switch (string_shape_size) {
@@ -397,7 +307,7 @@ int PredictorClient::numpy_predict(
      }
      vec_idx++;
    }
-    
+
    VLOG(2) << "batch [" << bi << "] "
            << "string feed value prepared";
  }

--- a/doc/DOCKER_IMAGES.md
+++ b/doc/DOCKER_IMAGES.md
@@ -29,10 +29,12 @@ You can get images in two ways:
 Runtime images cannot be used for compilation.
 If you want to customize your Serving based on source code, use the version with the suffix - devel.

+**The cuda10.1-cudnn7-gcc54 image is not ready yet; if you need it, build it from the corresponding Dockerfile.**
+
 |                         Description                          |   OS    |             TAG              |                          Dockerfile                          |
 | :----------------------------------------------------------: | :-----: | :--------------------------: | :----------------------------------------------------------: |
 |                       CPU development                        | Ubuntu16 |         latest-devel         |        [Dockerfile.devel](../tools/Dockerfile.devel)         |
-|              GPU (cuda10.1-cudnn7-tensorRT6-gcc54) development               | Ubuntu16 | latest-cuda10.1-cudnn7-gcc54-devel | [Dockerfile.cuda10.1-cudnn7-gcc54.devel](../tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel) |
+|              GPU (cuda10.1-cudnn7-tensorRT6-gcc54) development               | Ubuntu16 | latest-cuda10.1-cudnn7-gcc54-devel (not ready) | [Dockerfile.cuda10.1-cudnn7-gcc54.devel](../tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel) |
 |              GPU (cuda10.1-cudnn7-tensorRT6) development               | Ubuntu16 | latest-cuda10.1-cudnn7-devel | [Dockerfile.cuda10.1-cudnn7.devel](../tools/Dockerfile.cuda10.1-cudnn7.devel) |
 |              GPU (cuda10.2-cudnn8-tensorRT7) development               | Ubuntu16 | latest-cuda10.2-cudnn8-devel | [Dockerfile.cuda10.2-cudnn8.devel](../tools/Dockerfile.cuda10.2-cudnn8.devel) |
 |              GPU (cuda11-cudnn8-tensorRT7) development               | Ubuntu18 | latest-cuda11-cudnn8-devel | [Dockerfile.cuda11-cudnn8.devel](../tools/Dockerfile.cuda11-cudnn8.devel) |
@@ -65,7 +67,7 @@ Develop Images:
 |    CPU   | >=0.5.0 | 0.6.0-devel                 | Ubuntu 16 |  8.2.0       |
 |          | <=0.4.0 | 0.4.0-devel                  | CentOS 7  | 4.8.5       |
 | Cuda10.1 | >=0.5.0 | 0.6.0-cuda10.1-cudnn7-devel  | Ubuntu 16 |   8.2.0       |
-|          | 0.6.0   | 0.6.0-cuda10.1-cudnn7-gcc54-devel  | Ubuntu 16 |  5.4.0 |
+|          | 0.6.0   | 0.6.0-cuda10.1-cudnn7-gcc54-devel (not ready)  | Ubuntu 16 |  5.4.0 |
 |          | <=0.4.0 | 0.6.0-cuda10.1-cudnn7-devel    | CentOS 7  | 4.8.5     |
 | Cuda10.2 | >=0.5.0 | 0.6.0-cuda10.2-cudnn8-devel  | Ubuntu 16 |   8.2.0       |
 |          | <=0.4.0 | Nan                          | Nan       | Nan         |

--- a/doc/DOCKER_IMAGES_CN.md
+++ b/doc/DOCKER_IMAGES_CN.md
@@ -31,11 +31,12 @@
 若需要基于源代码二次开发编译，请使用后缀为-devel的版本。
 **在TAG列，latest也可以替换成对应的版本号，例如0.5.0/0.4.1等，但需要注意的是，部分开发环境随着某个版本迭代才增加，因此并非所有环境都有对应的版本号可以使用。**

+**cuda10.1-cudnn7-gcc54环境尚未同步到镜像仓库，如果您需要相关镜像请运行相关dockerfile**

 |                         镜像选择                         |   操作系统    |             TAG              |                          Dockerfile                          |
 | :----------------------------------------------------------: | :-----: | :--------------------------: | :----------------------------------------------------------: |
 |                       CPU development                        | Ubuntu16 |         latest-devel         |        [Dockerfile.devel](../tools/Dockerfile.devel)         |
-|              GPU (cuda10.1-cudnn7-tensorRT6-gcc54) development               | Ubuntu16 | latest-cuda10.1-cudnn7-gcc54-devel | [Dockerfile.cuda10.1-cudnn7-gcc54.devel](../tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel) |
+|              GPU (cuda10.1-cudnn7-tensorRT6-gcc54) development               | Ubuntu16 | latest-cuda10.1-cudnn7-gcc54-devel (not ready) | [Dockerfile.cuda10.1-cudnn7-gcc54.devel](../tools/Dockerfile.cuda10.1-cudnn7-gcc54.devel) |
 |              GPU (cuda10.1-cudnn7-tensorRT6) development               | Ubuntu16 | latest-cuda10.1-cudnn7-devel | [Dockerfile.cuda10.1-cudnn7.devel](../tools/Dockerfile.cuda10.1-cudnn7.devel) |
 |              GPU (cuda10.2-cudnn8-tensorRT7) development               | Ubuntu16 | latest-cuda10.2-cudnn8-devel | [Dockerfile.cuda10.2-cudnn8.devel](../tools/Dockerfile.cuda10.2-cudnn8.devel) |
 |              GPU (cuda11-cudnn8-tensorRT7) development               | Ubuntu18 | latest-cuda11-cudnn8-devel | [Dockerfile.cuda11-cudnn8.devel](../tools/Dockerfile.cuda11-cudnn8.devel) |
@@ -71,7 +72,7 @@ registry.baidubce.com/paddlepaddle/serving:xpu-x86 # for x86 xpu user
 |    CPU   | >=0.5.0 | 0.6.0-devel                 | Ubuntu 16 |  8.2.0       |
 |          | <=0.4.0 | 0.4.0-devel                  | CentOS 7  | 4.8.5       |
 | Cuda10.1 | >=0.5.0 | 0.6.0-cuda10.1-cudnn7-devel  | Ubuntu 16 |   8.2.0       |
-|          | 0.6.0   | 0.6.0-cuda10.1-cudnn7-gcc54-devel  | Ubuntu 16 |  5.4.0 |
+|          | 0.6.0   | 0.6.0-cuda10.1-cudnn7-gcc54-devel (not ready)  | Ubuntu 16 |  5.4.0 |
 |          | <=0.4.0 | 0.6.0-cuda10.1-cudnn7-devel    | CentOS 7  | 4.8.5     |
 | Cuda10.2 | >=0.5.0 | 0.6.0-cuda10.2-cudnn8-devel  | Ubuntu 16 |   8.2.0       |
 |          | <=0.4.0 | Nan                          | Nan       | Nan         |

--- a/python/paddle_serving_client/client.py
+++ b/python/paddle_serving_client/client.py
@@ -356,7 +356,8 @@ class Client(object):
                        int_feed_names.append(key)
                        shape_lst = []
                        if batch == False:
-                            feed_i[key] = feed_i[key][np.newaxis, :]
+                            feed_i[key] = np.expand_dims(feed_i[key], 0).repeat(
+                                1, axis=0)
                        if isinstance(feed_i[key], np.ndarray):
                            shape_lst.extend(list(feed_i[key].shape))
                            int_shape.append(shape_lst)
@@ -369,10 +370,10 @@ class Client(object):
                            int_lod_slot_batch.append([])

                    if isinstance(feed_i[key], np.ndarray):
-                        int_slot.append(feed_i[key])
+                        int_slot.append(np.ascontiguousarray(feed_i[key]))
                        self.has_numpy_input = True
                    else:
-                        int_slot.append(feed_i[key])
+                        int_slot.append(np.ascontiguousarray(feed_i[key]))
                        self.all_numpy_input = False

                elif self.feed_types_[key] in float_type:
@@ -380,7 +381,8 @@ class Client(object):
                        float_feed_names.append(key)
                        shape_lst = []
                        if batch == False:
-                            feed_i[key] = feed_i[key][np.newaxis, :]
+                            feed_i[key] = np.expand_dims(feed_i[key], 0).repeat(
+                                1, axis=0)
                        if isinstance(feed_i[key], np.ndarray):
                            shape_lst.extend(list(feed_i[key].shape))
                            float_shape.append(shape_lst)
@@ -393,10 +395,10 @@ class Client(object):
                            float_lod_slot_batch.append([])

                    if isinstance(feed_i[key], np.ndarray):
-                        float_slot.append(feed_i[key])
+                        float_slot.append(np.ascontiguousarray(feed_i[key]))
                        self.has_numpy_input = True
                    else:
-                        float_slot.append(feed_i[key])
+                        float_slot.append(np.ascontiguousarray(feed_i[key]))
                        self.all_numpy_input = False
                #if input is string, feed is not numpy.
                elif self.feed_types_[key] in string_type:
@@ -408,7 +410,7 @@ class Client(object):
                                key)])
                        else:
                            string_lod_slot_batch.append([])
-                    string_slot.append(feed_i[key])
+                    string_slot.append(np.ascontiguousarray(feed_i[key]))
                    self.has_numpy_input = True
            int_slot_batch.append(int_slot)
            int_lod_slot_batch.append(int_lod_slot)
@@ -626,6 +628,7 @@ class MultiLangClient(object):
                        raise Exception("error tensor value type.")
                else:
                    raise Exception("var must be list or ndarray.")
+                data = np.ascontiguousarray(data)
                tensor.data = data.tobytes()
            tensor.shape.extend(list(var.shape))
            if "{}.lod".format(name) in feed.keys():
@@ -700,7 +703,7 @@ class MultiLangClient(object):
        if batch is False:
            for key in feed:
                if ".lod" not in key:
-                    feed[key] = feed[key][np.newaxis, :]
+                    feed[key] = np.expand_dims(feed[key], 0).repeat(1, axis=0)
        if not asyn:
            try:
                self.profile_.record('py_prepro_0')

--- a/python/paddle_serving_server/rpc_service.py
+++ b/python/paddle_serving_server/rpc_service.py
@@ -126,7 +126,7 @@ class MultiLangServerServiceServicer(multi_lang_general_model_service_pb2_grpc.
                    else:
                        raise Exception("error type.")
                data.shape = list(feed_inst.tensor_array[idx].shape)
-                feed_dict[name] = data
+                feed_dict[name] = np.ascontiguousarray(data)
                if len(var.lod) > 0:
                    feed_dict["{}.lod".format(name)] = var.lod
            feed_batch.append(feed_dict)