Merge pull request #462 from guru4elephant/variable_shape

Variable shape

Merge pull request #462 from guru4elephant/variable_shape
Variable shape
4469c36a · MRXLT · GitHub · aff978d7 · 6b908cb8 · 4469c36a
17 changed files
--- a/core/general-client/include/general_model.h
+++ b/core/general-client/include/general_model.h
@@ -45,13 +45,17 @@ class PredictorRes {
  ~PredictorRes() {}

 public:
-  const std::vector<std::vector<int64_t>>& get_int64_by_name(
-      const std::string& name) {
-    return _int64_map[name];
+  const std::vector<int64_t>& get_int64_by_name(const std::string& name) {
+    return _int64_value_map[name];
  }
-  const std::vector<std::vector<float>>& get_float_by_name(
-      const std::string& name) {
-    return _float_map[name];
+  const std::vector<float>& get_float_by_name(const std::string& name) {
+    return _float_value_map[name];
+  }
+  const std::vector<int>& get_shape(const std::string& name) {
+    return _shape_map[name];
+  }
+  const std::vector<int>& get_lod(const std::string& name) {
+    return _lod_map[name];
  }
  void set_variant_tag(const std::string& variant_tag) {
    _variant_tag = variant_tag;
@@ -59,8 +63,10 @@ class PredictorRes {
  const std::string& variant_tag() { return _variant_tag; }

 public:
-  std::map<std::string, std::vector<std::vector<int64_t>>> _int64_map;
-  std::map<std::string, std::vector<std::vector<float>>> _float_map;
+  std::map<std::string, std::vector<int64_t>> _int64_value_map;
+  std::map<std::string, std::vector<float>> _float_value_map;
+  std::map<std::string, std::vector<int>> _shape_map;
+  std::map<std::string, std::vector<int>> _lod_map;

 private:
  std::string _variant_tag;
@@ -81,21 +87,16 @@ class PredictorClient {
  int create_predictor_by_desc(const std::string& sdk_desc);

  int create_predictor();
-  int destroy_predictor();

-  int predict(const std::vector<std::vector<float>>& float_feed,
-              const std::vector<std::string>& float_feed_name,
-              const std::vector<std::vector<int64_t>>& int_feed,
-              const std::vector<std::string>& int_feed_name,
-              const std::vector<std::string>& fetch_name,
-              PredictorRes& predict_res,  // NOLINT
-              const int& pid);
+  int destroy_predictor();

  int batch_predict(
      const std::vector<std::vector<std::vector<float>>>& float_feed_batch,
      const std::vector<std::string>& float_feed_name,
+      const std::vector<std::vector<int>>& float_shape,
      const std::vector<std::vector<std::vector<int64_t>>>& int_feed_batch,
      const std::vector<std::string>& int_feed_name,
+      const std::vector<std::vector<int>>& int_shape,
      const std::vector<std::string>& fetch_name,
      PredictorRes& predict_res_batch,  // NOLINT
      const int& pid);

--- a/core/general-client/src/general_model.cpp
+++ b/core/general-client/src/general_model.cpp
@@ -132,152 +132,22 @@ int PredictorClient::create_predictor() {
  _api.thrd_initialize();
 }

-int PredictorClient::predict(const std::vector<std::vector<float>> &float_feed,
-                             const std::vector<std::string> &float_feed_name,
-                             const std::vector<std::vector<int64_t>> &int_feed,
-                             const std::vector<std::string> &int_feed_name,
-                             const std::vector<std::string> &fetch_name,
-                             PredictorRes &predict_res,
-                             const int &pid) {  // NOLINT
-  predict_res._int64_map.clear();
-  predict_res._float_map.clear();
-  Timer timeline;
-  int64_t preprocess_start = timeline.TimeStampUS();
-  _api.thrd_clear();
-  std::string variant_tag;
-  _predictor = _api.fetch_predictor("general_model", &variant_tag);
-  predict_res.set_variant_tag(variant_tag);
-
-  Request req;
-  for (auto &name : fetch_name) {
-    req.add_fetch_var_names(name);
-  }
-
-  std::vector<Tensor *> tensor_vec;
-  FeedInst *inst = req.add_insts();
-  for (auto &name : float_feed_name) {
-    tensor_vec.push_back(inst->add_tensor_array());
-  }
-
-  for (auto &name : int_feed_name) {
-    tensor_vec.push_back(inst->add_tensor_array());
-  }
-
-  int vec_idx = 0;
-  for (auto &name : float_feed_name) {
-    int idx = _feed_name_to_idx[name];
-    Tensor *tensor = tensor_vec[idx];
-    for (int j = 0; j < _shape[idx].size(); ++j) {
-      tensor->add_shape(_shape[idx][j]);
-    }
-    tensor->set_elem_type(1);
-    for (int j = 0; j < float_feed[vec_idx].size(); ++j) {
-      tensor->add_float_data(float_feed[vec_idx][j]);
-    }
-    vec_idx++;
-  }
-
-  VLOG(2) << "feed float feed var done.";
-  vec_idx = 0;
-
-  for (auto &name : int_feed_name) {
-    int idx = _feed_name_to_idx[name];
-    Tensor *tensor = tensor_vec[idx];
-    for (int j = 0; j < _shape[idx].size(); ++j) {
-      tensor->add_shape(_shape[idx][j]);
-    }
-    tensor->set_elem_type(0);
-    for (int j = 0; j < int_feed[vec_idx].size(); ++j) {
-      tensor->add_int64_data(int_feed[vec_idx][j]);
-    }
-    vec_idx++;
-  }
-
-  int64_t preprocess_end = timeline.TimeStampUS();
-  int64_t client_infer_start = timeline.TimeStampUS();
-  Response res;
-
-  int64_t client_infer_end = 0;
-  int64_t postprocess_start = 0;
-  int64_t postprocess_end = 0;
-
-  if (FLAGS_profile_client) {
-    if (FLAGS_profile_server) {
-      req.set_profile_server(true);
-    }
-  }
-
-  res.Clear();
-  if (_predictor->inference(&req, &res) != 0) {
-    LOG(ERROR) << "failed call predictor with req: " << req.ShortDebugString();
-    return -1;
-  } else {
-    VLOG(2) << "predict done.";
-    client_infer_end = timeline.TimeStampUS();
-    postprocess_start = client_infer_end;
-    for (auto &name : fetch_name) {
-      int idx = _fetch_name_to_idx[name];
-      VLOG(2) << "fetch name: " << name;
-      if (_fetch_name_to_type[name] == 0) {
-        int len = res.insts(0).tensor_array(idx).int64_data_size();
-        VLOG(2) << "fetch tensor : " << name << " type: int64 len : " << len;
-        predict_res._int64_map[name].resize(1);
-        predict_res._int64_map[name][0].resize(len);
-        for (int i = 0; i < len; ++i) {
-          predict_res._int64_map[name][0][i] =
-              res.insts(0).tensor_array(idx).int64_data(i);
-        }
-      } else if (_fetch_name_to_type[name] == 1) {
-        int len = res.insts(0).tensor_array(idx).float_data_size();
-        VLOG(2) << "fetch tensor : " << name << " type: float32 len : " << len;
-        predict_res._float_map[name].resize(1);
-        predict_res._float_map[name][0].resize(len);
-        for (int i = 0; i < len; ++i) {
-          predict_res._float_map[name][0][i] =
-              res.insts(0).tensor_array(idx).float_data(i);
-        }
-      }
-      postprocess_end = timeline.TimeStampUS();
-    }
-  }
-
-  if (FLAGS_profile_client) {
-    std::ostringstream oss;
-    oss << "PROFILE\t"
-        << "pid:" << pid << "\t"
-        << "prepro_0:" << preprocess_start << " "
-        << "prepro_1:" << preprocess_end << " "
-        << "client_infer_0:" << client_infer_start << " "
-        << "client_infer_1:" << client_infer_end << " ";
-
-    if (FLAGS_profile_server) {
-      int op_num = res.profile_time_size() / 2;
-      for (int i = 0; i < op_num; ++i) {
-        oss << "op" << i << "_0:" << res.profile_time(i * 2) << " ";
-        oss << "op" << i << "_1:" << res.profile_time(i * 2 + 1) << " ";
-      }
-    }
-
-    oss << "postpro_0:" << postprocess_start << " ";
-    oss << "postpro_1:" << postprocess_end;
-
-    fprintf(stderr, "%s\n", oss.str().c_str());
-  }
-  return 0;
-}
-
 int PredictorClient::batch_predict(
    const std::vector<std::vector<std::vector<float>>> &float_feed_batch,
    const std::vector<std::string> &float_feed_name,
+    const std::vector<std::vector<int>> &float_shape,
    const std::vector<std::vector<std::vector<int64_t>>> &int_feed_batch,
    const std::vector<std::string> &int_feed_name,
+    const std::vector<std::vector<int>> &int_shape,
    const std::vector<std::string> &fetch_name,
    PredictorRes &predict_res_batch,
    const int &pid) {
  int batch_size = std::max(float_feed_batch.size(), int_feed_batch.size());

-  predict_res_batch._int64_map.clear();
-  predict_res_batch._float_map.clear();
+  predict_res_batch._int64_value_map.clear();
+  predict_res_batch._float_value_map.clear();
+  predict_res_batch._shape_map.clear();
+  predict_res_batch._lod_map.clear();
  Timer timeline;
  int64_t preprocess_start = timeline.TimeStampUS();

@@ -294,7 +164,7 @@ int PredictorClient::batch_predict(
  for (auto &name : fetch_name) {
    req.add_fetch_var_names(name);
  }
-  //
+
  for (int bi = 0; bi < batch_size; bi++) {
    VLOG(2) << "prepare batch " << bi;
    std::vector<Tensor *> tensor_vec;
@@ -309,14 +179,14 @@ int PredictorClient::batch_predict(
      tensor_vec.push_back(inst->add_tensor_array());
    }

-    VLOG(2) << "batch [" << bi << "] int_feed_name and float_feed_name "
+    VLOG(2) << "batch [" << bi << "] int_feed_name and float_feed_name "
            << "prepared";
    int vec_idx = 0;
    for (auto &name : float_feed_name) {
      int idx = _feed_name_to_idx[name];
      Tensor *tensor = tensor_vec[idx];
-      for (int j = 0; j < _shape[idx].size(); ++j) {
-        tensor->add_shape(_shape[idx][j]);
+      for (int j = 0; j < float_shape[vec_idx].size(); ++j) {
+        tensor->add_shape(float_shape[vec_idx][j]);
      }
      tensor->set_elem_type(1);
      for (int j = 0; j < float_feed[vec_idx].size(); ++j) {
@@ -332,8 +202,8 @@ int PredictorClient::batch_predict(
    for (auto &name : int_feed_name) {
      int idx = _feed_name_to_idx[name];
      Tensor *tensor = tensor_vec[idx];
-      for (int j = 0; j < _shape[idx].size(); ++j) {
-        tensor->add_shape(_shape[idx][j]);
+      for (int j = 0; j < int_shape[vec_idx].size(); ++j) {
+        tensor->add_shape(int_shape[vec_idx][j]);
      }
      tensor->set_elem_type(0);
      VLOG(3) << "feed var name " << name << " index " << vec_idx
@@ -371,39 +241,43 @@ int PredictorClient::batch_predict(
  } else {
    client_infer_end = timeline.TimeStampUS();
    postprocess_start = client_infer_end;
+
    for (auto &name : fetch_name) {
-      predict_res_batch._int64_map[name].resize(batch_size);
-      predict_res_batch._float_map[name].resize(batch_size);
+      int idx = _fetch_name_to_idx[name];
+      int shape_size = res.insts(0).tensor_array(idx).shape_size();
+      predict_res_batch._shape_map[name].resize(shape_size);
+      for (int i = 0; i < shape_size; ++i) {
+        predict_res_batch._shape_map[name][i] =
+            res.insts(0).tensor_array(idx).shape(i);
+      }
+      int lod_size = res.insts(0).tensor_array(idx).lod_size();
+      if (lod_size > 0) {
+        predict_res_batch._lod_map[name].resize(lod_size);
+        for (int i = 0; i < lod_size; ++i) {
+          predict_res_batch._lod_map[name][i] =
+              res.insts(0).tensor_array(idx).lod(i);
+        }
+      }
    }
-    VLOG(2) << "response batch size " << res.insts_size();
-    VLOG(2) << "response var nmae " << res.insts(0).tensor_array_size();
-    for (int bi = 0; bi < batch_size; bi++) {
-      int idx = 0;
-      for (auto &name : fetch_name) {
-        int len = res.insts(bi).tensor_array(idx).data_size();
-        if (_fetch_name_to_type[name] == 0) {
-          int len = res.insts(bi).tensor_array(idx).int64_data_size();
-          VLOG(2) << "fetch tensor : " << name << " type: int64 len : " << len;
-          predict_res_batch._int64_map[name][bi].resize(len);
-          VLOG(2) << "fetch name " << name << " index " << idx << " first data "
-                  << res.insts(bi).tensor_array(idx).int64_data(0);
-          for (int i = 0; i < len; ++i) {
-            predict_res_batch._int64_map[name][bi][i] =
-                res.insts(bi).tensor_array(idx).int64_data(i);
-          }
-        } else if (_fetch_name_to_type[name] == 1) {
-          int len = res.insts(bi).tensor_array(idx).float_data_size();
-          VLOG(2) << "fetch tensor : " << name
-                  << " type: float32 len : " << len;
-          predict_res_batch._float_map[name][bi].resize(len);
-          VLOG(2) << "fetch name " << name << " index " << idx << " first data "
-                  << res.insts(bi).tensor_array(idx).float_data(0);
-          for (int i = 0; i < len; ++i) {
-            predict_res_batch._float_map[name][bi][i] =
-                res.insts(bi).tensor_array(idx).float_data(i);
-          }
+
+    for (auto &name : fetch_name) {
+      int idx = _fetch_name_to_idx[name];
+      if (_fetch_name_to_type[name] == 0) {
+        predict_res_batch._int64_value_map[name].resize(
+            res.insts(0).tensor_array(idx).int64_data_size());
+        int size = res.insts(0).tensor_array(idx).int64_data_size();
+        for (int i = 0; i < size; ++i) {
+          predict_res_batch._int64_value_map[name][i] =
+              res.insts(0).tensor_array(idx).int64_data(i);
+        }
+      } else {
+        predict_res_batch._float_value_map[name].resize(
+            res.insts(0).tensor_array(idx).float_data_size());
+        int size = res.insts(0).tensor_array(idx).float_data_size();
+        for (int i = 0; i < size; ++i) {
+          predict_res_batch._float_value_map[name][i] =
+              res.insts(0).tensor_array(idx).float_data(i);
        }
-        idx += 1;
      }
    }
    postprocess_end = timeline.TimeStampUS();

--- a/core/general-client/src/pybind_general_model.cpp
+++ b/core/general-client/src/pybind_general_model.cpp
@@ -40,6 +40,16 @@ PYBIND11_MODULE(serving_client, m) {
             return self.get_float_by_name(name);
           },
           py::return_value_policy::reference)
+      .def("get_shape",
+           [](PredictorRes &self, std::string &name) {
+             return self.get_shape(name);
+           },
+           py::return_value_policy::reference)
+      .def("get_lod",
+           [](PredictorRes &self, std::string &name) {
+             return self.get_lod(name);
+           },
+           py::return_value_policy::reference)
      .def("variant_tag",
           [](PredictorRes &self) { return self.variant_tag(); });

@@ -67,39 +77,26 @@ PYBIND11_MODULE(serving_client, m) {
           [](PredictorClient &self) { self.create_predictor(); })
      .def("destroy_predictor",
           [](PredictorClient &self) { self.destroy_predictor(); })
-      .def("predict",
-           [](PredictorClient &self,
-              const std::vector<std::vector<float>> &float_feed,
-              const std::vector<std::string> &float_feed_name,
-              const std::vector<std::vector<int64_t>> &int_feed,
-              const std::vector<std::string> &int_feed_name,
-              const std::vector<std::string> &fetch_name,
-              PredictorRes &predict_res,
-              const int &pid) {
-             return self.predict(float_feed,
-                                 float_feed_name,
-                                 int_feed,
-                                 int_feed_name,
-                                 fetch_name,
-                                 predict_res,
-                                 pid);
-           },
-           py::call_guard<py::gil_scoped_release>())
+
      .def("batch_predict",
           [](PredictorClient &self,
              const std::vector<std::vector<std::vector<float>>>
                  &float_feed_batch,
              const std::vector<std::string> &float_feed_name,
+              const std::vector<std::vector<int>> &float_shape,
              const std::vector<std::vector<std::vector<int64_t>>>
                  &int_feed_batch,
              const std::vector<std::string> &int_feed_name,
+              const std::vector<std::vector<int>> &int_shape,
              const std::vector<std::string> &fetch_name,
              PredictorRes &predict_res_batch,
              const int &pid) {
             return self.batch_predict(float_feed_batch,
                                       float_feed_name,
+                                       float_shape,
                                       int_feed_batch,
                                       int_feed_name,
+                                       int_shape,
                                       fetch_name,
                                       predict_res_batch,
                                       pid);

--- a/core/general-server/op/general_response_op.cpp
+++ b/core/general-server/op/general_response_op.cpp
@@ -73,22 +73,21 @@ int GeneralResponseOp::inference() {

  // response inst with only fetch_var_names
  Response *res = mutable_data<Response>();
-
-  for (int i = 0; i < batch_size; ++i) {
-    FetchInst *fetch_inst = res->add_insts();
-    for (auto &idx : fetch_index) {
-      Tensor *tensor = fetch_inst->add_tensor_array();
-      // currently only response float tensor or lod_tensor
-      tensor->set_elem_type(1);
-      if (model_config->_is_lod_fetch[idx]) {
-        VLOG(2) << "out[" << idx << " is lod_tensor";
-        tensor->add_shape(-1);
-      } else {
-        VLOG(2) << "out[" << idx << "] is tensor";
-        for (int k = 1; k < in->at(idx).shape.size(); ++k) {
-          VLOG(2) << "shape[" << k - 1 << "]: " << in->at(idx).shape[k];
-          tensor->add_shape(in->at(idx).shape[k]);
-        }
+  FetchInst *fetch_inst = res->add_insts();
+  for (auto &idx : fetch_index) {
+    Tensor *tensor = fetch_inst->add_tensor_array();
+    tensor->set_elem_type(1);
+    if (model_config->_is_lod_fetch[idx]) {
+      VLOG(2) << "out[" << idx << "] is lod_tensor";
+      for (int k = 0; k < in->at(idx).shape.size(); ++k) {
+        VLOG(2) << "shape[" << k << "]: " << in->at(idx).shape[k];
+        tensor->add_shape(in->at(idx).shape[k]);
+      }
+    } else {
+      VLOG(2) << "out[" << idx << "] is tensor";
+      for (int k = 0; k < in->at(idx).shape.size(); ++k) {
+        VLOG(2) << "shape[" << k << "]: " << in->at(idx).shape[k];
+        tensor->add_shape(in->at(idx).shape[k]);
      }
    }
  }
@@ -96,62 +95,42 @@ int GeneralResponseOp::inference() {
  int var_idx = 0;
  for (auto &idx : fetch_index) {
    int cap = 1;
-    for (int j = 1; j < in->at(idx).shape.size(); ++j) {
+    for (int j = 0; j < in->at(idx).shape.size(); ++j) {
      cap *= in->at(idx).shape[j];
    }
    if (in->at(idx).dtype == paddle::PaddleDType::INT64) {
      int64_t *data_ptr = static_cast<int64_t *>(in->at(idx).data.data());
      if (model_config->_is_lod_fetch[idx]) {
-        for (int j = 0; j < batch_size; ++j) {
-          for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1];
-               k++) {
-            FetchInst *fetch_p = res->mutable_insts(j);
-            fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[k]);
-          }
+        FetchInst *fetch_p = res->mutable_insts(0);
+        for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
+          fetch_p->mutable_tensor_array(var_idx)->add_lod(
+              in->at(idx).lod[0][j]);
+        }
+        for (int j = 0; j < cap; ++j) {
+          fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[j]);
        }
      } else {
-        int var_size = in->at(idx).shape[0];
-        if (var_size == batch_size) {
-          for (int j = 0; j < batch_size; ++j) {
-            for (int k = j * cap; k < (j + 1) * cap; ++k) {
-              FetchInst *fetch_p = res->mutable_insts(j);
-              fetch_p->mutable_tensor_array(var_idx)->add_int64_data(
-                  data_ptr[k]);
-            }
-          }
-        } else {
-          for (int j = 0; j < batch_size; ++j) {
-            FetchInst *fetch_p = res->mutable_insts(j);
-            fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[0]);
-          }
+        FetchInst *fetch_p = res->mutable_insts(0);  // INT64: use int64_data
+        for (int j = 0; j < cap; ++j) {
+          fetch_p->mutable_tensor_array(var_idx)->add_int64_data(data_ptr[j]);
+        }
      }
      var_idx++;
    } else if (in->at(idx).dtype == paddle::PaddleDType::FLOAT32) {
      float *data_ptr = static_cast<float *>(in->at(idx).data.data());
      if (model_config->_is_lod_fetch[idx]) {
-        for (int j = 0; j < batch_size; ++j) {
-          for (int k = in->at(idx).lod[0][j]; k < in->at(idx).lod[0][j + 1];
-               k++) {
-            FetchInst *fetch_p = res->mutable_insts(j);
-            fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[k]);
-          }
+        FetchInst *fetch_p = res->mutable_insts(0);
+        for (int j = 0; j < in->at(idx).lod[0].size(); ++j) {
+          fetch_p->mutable_tensor_array(var_idx)->add_lod(
+              in->at(idx).lod[0][j]);
+        }
+        for (int j = 0; j < cap; ++j) {
+          fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[j]);
        }
      } else {
-        int var_size = in->at(idx).shape[0];
-        if (var_size == batch_size) {
-          for (int j = 0; j < batch_size; ++j) {
-            for (int k = j * cap; k < (j + 1) * cap; ++k) {
-              FetchInst *fetch_p = res->mutable_insts(j);
-              fetch_p->mutable_tensor_array(var_idx)->add_float_data(
-                  data_ptr[k]);
-            }
-          }
-        } else {
-          for (int j = 0; j < batch_size; ++j) {
-            FetchInst *fetch_p = res->mutable_insts(j);
-            fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[0]);
-          }
+        FetchInst *fetch_p = res->mutable_insts(0);
+        for (int j = 0; j < cap; ++j) {
+          fetch_p->mutable_tensor_array(var_idx)->add_float_data(data_ptr[j]);
        }
      }
      var_idx++;

--- a/core/general-server/proto/general_model_service.proto
+++ b/core/general-server/proto/general_model_service.proto
@@ -26,6 +26,7 @@ message Tensor {
  repeated float float_data = 4;
  optional int32 elem_type = 5;
  repeated int32 shape = 6;
+  repeated int32 lod = 7; // only for fetch tensor currently
 };

 message FeedInst { repeated Tensor tensor_array = 1; };

--- a/core/sdk-cpp/proto/general_model_service.proto
+++ b/core/sdk-cpp/proto/general_model_service.proto
@@ -26,6 +26,7 @@ message Tensor {
  repeated float float_data = 4;
  optional int32 elem_type = 5;
  repeated int32 shape = 6;
+  repeated int32 lod = 7; // only for fetch tensor currently
 };

 message FeedInst { repeated Tensor tensor_array = 1; };

--- a/doc/DESIGN.md
+++ b/doc/DESIGN.md
@@ -260,6 +260,7 @@ class Op {

 ```

+
 ### 5.4 Interfaces related to framework

 Service

--- a/python/examples/criteo_ctr/test_client.py
+++ b/python/examples/criteo_ctr/test_client.py
@@ -51,6 +51,5 @@ for ei in range(1000):
    for i in range(1, 27):
        feed_dict["sparse_{}".format(i - 1)] = data[0][i]
    fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
-    #print(fetch_map)
 end = time.time()
 print(end - start)
--- a/python/examples/criteo_ctr_with_cube/test_client.py
+++ b/python/examples/criteo_ctr_with_cube/test_client.py
@@ -40,7 +40,7 @@ for ei in range(10000):
    for i in range(1, 27):
        feed_dict["embedding_{}.tmp_0".format(i - 1)] = data[0][i]
    fetch_map = client.predict(feed=feed_dict, fetch=["prob"])
-    prob_list.append(fetch_map['prob'][1])
+    prob_list.append(fetch_map['prob'][0][1])
    label_list.append(data[0][-1][0])

 print(auc(label_list, prob_list))

--- a/python/examples/fit_a_line/test_numpy_input_client.py
+++ b/python/examples/fit_a_line/test_numpy_input_client.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=doc-string-missing
+
+from paddle_serving_client import Client
+import numpy as np
+import sys
+
+client = Client()
+client.load_client_config(sys.argv[1])
+client.connect(["127.0.0.1:9393"])
+
+import paddle
+test_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.test(), buf_size=500),
+    batch_size=1)
+
+for data in test_reader():
+    fetch_map = client.predict(
+        feed={"x": np.array(data[0][0])}, fetch=["price"])
+    print("{} {}".format(fetch_map["price"][0][0], data[0][1][0]))
--- a/python/examples/imdb/test_client.py
+++ b/python/examples/imdb/test_client.py
@@ -31,4 +31,4 @@ for line in sys.stdin:
    feed = {"words": word_ids}
    fetch = ["acc", "cost", "prediction"]
    fetch_map = client.predict(feed=feed, fetch=fetch)
-    print("{} {}".format(fetch_map["prediction"][1], label[0]))
+    print("{} {}".format(fetch_map["prediction"][0][1], label[0]))
--- a/python/paddle_serving_client/__init__.py
+++ b/python/paddle_serving_client/__init__.py
@@ -18,6 +18,8 @@ import os
 from .proto import sdk_configure_pb2 as sdk
 from .proto import general_model_config_pb2 as m_config
 import google.protobuf.text_format
+import numpy as np
+import time
 import sys

 int_type = 0
@@ -119,6 +121,7 @@ class Client(object):
        self.fetch_names_to_idx_ = {}
        self.lod_tensor_set = set()
        self.feed_tensor_len = {}
+
        for i, var in enumerate(model_conf.feed_var):
            self.feed_names_to_idx_[var.alias_name] = i
            self.feed_types_[var.alias_name] = var.feed_type
@@ -131,11 +134,11 @@ class Client(object):
                for dim in self.feed_shapes_[var.alias_name]:
                    counter *= dim
                self.feed_tensor_len[var.alias_name] = counter
-
        for i, var in enumerate(model_conf.fetch_var):
            self.fetch_names_to_idx_[var.alias_name] = i
            self.fetch_names_to_type_[var.alias_name] = var.fetch_type
-
+            if var.is_lod_tensor:
+                self.lod_tensor_set.add(var.alias_name)
        return

    def add_variant(self, tag, cluster, variant_weight):
@@ -162,7 +165,6 @@ class Client(object):
                    "parameter endpoints({}) will not take effect, because you use the add_variant function.".
                    format(endpoints))
        sdk_desc = self.predictor_sdk_.gen_desc()
-        print(sdk_desc)
        self.client_handle_.create_predictor_by_desc(sdk_desc.SerializeToString(
        ))

@@ -203,6 +205,8 @@ class Client(object):
        float_slot_batch = []
        int_feed_names = []
        float_feed_names = []
+        int_shape = []
+        float_shape = []
        fetch_names = []
        counter = 0
        batch_size = len(feed_batch)
@@ -219,50 +223,69 @@ class Client(object):
        for i, feed_i in enumerate(feed_batch):
            int_slot = []
            float_slot = []
+            int_shape = []
+            float_shape = []
            for key in feed_i:
                if key not in self.feed_names_:
                    raise ValueError("Wrong feed name: {}.".format(key))
-                self.shape_check(feed_i, key)
+                if not isinstance(feed_i[key], np.ndarray):
+                    self.shape_check(feed_i, key)
                if self.feed_types_[key] == int_type:
                    if i == 0:
                        int_feed_names.append(key)
-                    int_slot.append(feed_i[key])
+                        if isinstance(feed_i[key], np.ndarray):
+                            int_shape.append(list(feed_i[key].shape))
+                        else:
+                            int_shape.append(self.feed_shapes_[key])
+                    if isinstance(feed_i[key], np.ndarray):
+                        int_slot.append(np.reshape(feed_i[key], (-1)).tolist())
+                    else:
+                        int_slot.append(feed_i[key])
                elif self.feed_types_[key] == float_type:
                    if i == 0:
                        float_feed_names.append(key)
-                    float_slot.append(feed_i[key])
-            if len(int_slot) + len(float_slot) == 0:
-                raise ValueError("No feed data for predict.")
+                        if isinstance(feed_i[key], np.ndarray):
+                            float_shape.append(list(feed_i[key].shape))
+                        else:
+                            float_shape.append(self.feed_shapes_[key])
+                    if isinstance(feed_i[key], np.ndarray):
+                        float_slot.append(
+                            np.reshape(feed_i[key], (-1)).tolist())
+                    else:
+                        float_slot.append(feed_i[key])
            int_slot_batch.append(int_slot)
            float_slot_batch.append(float_slot)

        result_batch = self.result_handle_
        res = self.client_handle_.batch_predict(
-            float_slot_batch, float_feed_names, int_slot_batch, int_feed_names,
-            fetch_names, result_batch, self.pid)
+            float_slot_batch, float_feed_names, float_shape, int_slot_batch,
+            int_feed_names, int_shape, fetch_names, result_batch, self.pid)

        if res == -1:
            return None

        result_map_batch = []
        result_map = {}
+        # result map needs to be a numpy array
        for i, name in enumerate(fetch_names):
            if self.fetch_names_to_type_[name] == int_type:
                result_map[name] = result_batch.get_int64_by_name(name)
+                shape = result_batch.get_shape(name)
+                result_map[name] = np.array(result_map[name])
+                result_map[name].shape = shape
+                if name in self.lod_tensor_set:
+                    result_map["{}.lod".format(name)] = result_batch.get_lod(
+                        name)
            elif self.fetch_names_to_type_[name] == float_type:
                result_map[name] = result_batch.get_float_by_name(name)
-        for i in range(batch_size):
-            single_result = {}
-            for key in result_map:
-                single_result[key] = result_map[key][i]
-            result_map_batch.append(single_result)
-
-        if batch_size == 1:
-            return [result_map_batch[0], self.result_handle_.variant_tag()
-                    ] if need_variant_tag else result_map_batch[0]
-        else:
-            return [result_map_batch, self.result_handle_.variant_tag()
-                    ] if need_variant_tag else result_map_batch
+                shape = result_batch.get_shape(name)
+                result_map[name] = np.array(result_map[name])
+                result_map[name].shape = shape
+                if name in self.lod_tensor_set:
+                    result_map["{}.lod".format(name)] = result_batch.get_lod(
+                        name)
+
+        return result_map

    def release(self):
        self.client_handle_.destroy_predictor()

--- a/python/paddle_serving_server/web_service.py
+++ b/python/paddle_serving_server/web_service.py
@@ -67,11 +67,15 @@ class WebService(object):
                    feed_batch=feed, fetch=fetch)
                fetch_map_batch = self.postprocess(
                    feed=request.json, fetch=fetch, fetch_map=fetch_map_batch)
+                for key in fetch_map_batch:
+                    fetch_map_batch[key] = fetch_map_batch[key].tolist()
                result = {"result": fetch_map_batch}
            elif isinstance(feed, dict):
                if "fetch" in feed:
                    del feed["fetch"]
                fetch_map = self.client_service.predict(feed=feed, fetch=fetch)
+                for key in fetch_map:
+                    fetch_map[key] = fetch_map[key][0].tolist()
                result = self.postprocess(
                    feed=request.json, fetch=fetch, fetch_map=fetch_map)
        except ValueError:

--- a/python/paddle_serving_server_gpu/web_service.py
+++ b/python/paddle_serving_server_gpu/web_service.py
@@ -107,6 +107,8 @@ class WebService(object):
        fetch_map_batch = self.client.predict(feed=feed, fetch=fetch)
        fetch_map_batch = self.postprocess(
            feed=request.json, fetch=fetch, fetch_map=fetch_map_batch)
+        for key in fetch_map_batch:
+            fetch_map_batch[key] = fetch_map_batch[key].tolist()
        result = {"result": fetch_map_batch}
        return result


--- a/python/requirements.txt
+++ b/python/requirements.txt
-protobuf>=3.1.0
-six
-paddlepaddle-gpu
+numpy>=1.12, <=1.16.4 ; python_version<"3.5"
--- a/python/setup.py.client.in
+++ b/python/setup.py.client.in
@@ -53,7 +53,7 @@ if '${PACK}' == 'ON':


 REQUIRED_PACKAGES = [
-    'six >= 1.10.0', 'protobuf >= 3.1.0'
+    'six >= 1.10.0', 'protobuf >= 3.1.0', 'numpy >= 1.12'
 ]

 if not find_package("paddlepaddle") and not find_package("paddlepaddle-gpu"):

--- a/tools/serving_build.sh
+++ b/tools/serving_build.sh
@@ -18,6 +18,7 @@ function init() {
    export PYTHONROOT=/usr
    cd Serving
    export SERVING_WORKDIR=$PWD
+    $PYTHONROOT/bin/python -m pip install -r python/requirements.txt
 }

 function check_cmd() {