server support dtype uint8&int8

ee5a9489 · ShiningZhang · 52f2c635 · ee5a9489 · ee5a9489 · ee5a9489
7 changed file
--- a/core/general-client/src/client.cpp
+++ b/core/general-client/src/client.cpp
@@ -23,7 +23,23 @@ using configure::GeneralModelConfig;
 using baidu::paddle_serving::predictor::general_model::Request;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Tensor;
-enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
+// Tensor.elem_type codes, see general_model_service.proto. Paddle Inference
+// 2.1 serves FLOAT32, INT64, INT32, UINT8 and INT8; FLOAT16 is planned.
+enum ProtoDataType {
+  P_INT64 = 0,
+  P_FLOAT32 = 1,
+  P_INT32 = 2,
+  P_FP64 = 3,
+  P_INT16 = 4,
+  P_FP16 = 5,
+  P_BF16 = 6,
+  P_UINT8 = 7,
+  P_INT8 = 8,
+  P_BOOL = 9,
+  P_COMPLEX64 = 10,
+  P_COMPLEX128 = 11,
+  P_STRING = 20,  // matches general_reader_op.cpp and the sdk-cpp proto
+};
 int ServingClient::init(const std::vector<std::string>& client_conf,
           const std::string server_port) {

--- a/core/general-client/src/general_model.cpp
+++ b/core/general-client/src/general_model.cpp
@@ -25,7 +25,23 @@ using baidu::paddle_serving::Timer;
 using baidu::paddle_serving::predictor::general_model::Request;
 using baidu::paddle_serving::predictor::general_model::Response;
 using baidu::paddle_serving::predictor::general_model::Tensor;
-enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
+// Tensor.elem_type codes, see general_model_service.proto. Paddle Inference
+// 2.1 serves FLOAT32, INT64, INT32, UINT8 and INT8; FLOAT16 is planned.
+enum ProtoDataType {
+  P_INT64 = 0,
+  P_FLOAT32 = 1,
+  P_INT32 = 2,
+  P_FP64 = 3,
+  P_INT16 = 4,
+  P_FP16 = 5,
+  P_BF16 = 6,
+  P_UINT8 = 7,
+  P_INT8 = 8,
+  P_BOOL = 9,
+  P_COMPLEX64 = 10,
+  P_COMPLEX128 = 11,
+  P_STRING = 20,  // matches general_reader_op.cpp and the sdk-cpp proto
+};
 std::once_flag gflags_init_flag;
 namespace py = pybind11;

--- a/core/general-server/op/general_reader_op.cpp
+++ b/core/general-server/op/general_reader_op.cpp
@@ -31,7 +31,23 @@ using baidu::paddle_serving::predictor::MempoolWrapper;
 using baidu::paddle_serving::predictor::general_model::Tensor;
 using baidu::paddle_serving::predictor::general_model::Request;
 using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
-enum ProtoDataType { P_INT64, P_FLOAT32, P_INT32, P_STRING };
+// Codes compared against Tensor.elem_type by GeneralReaderOp. Paddle Inference
+// 2.1 serves FLOAT32, INT64, INT32, UINT8 and INT8; FLOAT16 is planned.
+enum ProtoDataType {
+  P_INT64 = 0,
+  P_FLOAT32 = 1,
+  P_INT32 = 2,
+  P_FP64 = 3,
+  P_INT16 = 4,
+  P_FP16 = 5,
+  P_BF16 = 6,
+  P_UINT8 = 7,
+  P_INT8 = 8,
+  P_BOOL = 9,
+  P_COMPLEX64 = 10,
+  P_COMPLEX128 = 11,
+  P_STRING = 20,  // jumps to 20; 12-19 are unassigned
+};
 int GeneralReaderOp::inference() {
  // read request from client
@@ -78,6 +94,7 @@ int GeneralReaderOp::inference() {
  int64_t elem_type = 0;
  int64_t elem_size = 0;
  int64_t databuf_size = 0;
+  const void* src_ptr = nullptr;
  for (int i = 0; i < var_num; ++i) {
    paddle::PaddleTensor paddleTensor;
    const Tensor &tensor = req->tensor(i);
@@ -86,19 +103,38 @@ int GeneralReaderOp::inference() {
    elem_size = 0;
    databuf_size = 0;
    elem_type = tensor.elem_type();
-    VLOG(2) << "var[" << i << "] has elem type: " << elem_type;
+    src_ptr = nullptr ;
    if (elem_type == P_INT64) {  // int64
      elem_size = sizeof(int64_t);
      paddleTensor.dtype = paddle::PaddleDType::INT64;
      data_len = tensor.int64_data_size();
+      src_ptr = tensor.int64_data().data();
    } else if (elem_type == P_FLOAT32) {
      elem_size = sizeof(float);
      paddleTensor.dtype = paddle::PaddleDType::FLOAT32;
      data_len = tensor.float_data_size();
+      src_ptr = tensor.float_data().data();
    } else if (elem_type == P_INT32) {
      elem_size = sizeof(int32_t);
      paddleTensor.dtype = paddle::PaddleDType::INT32;
      data_len = tensor.int_data_size();
+      src_ptr = tensor.int_data().data();
+    } else if (elem_type == P_UINT8) {
+      elem_size = sizeof(uint8_t);
+      paddleTensor.dtype = paddle::PaddleDType::UINT8;
+      data_len = tensor.tensor_content().size();
+      src_ptr = tensor.tensor_content().data();
+    } else if (elem_type == P_INT8) {
+      elem_size = sizeof(int8_t);
+      paddleTensor.dtype = paddle::PaddleDType::INT8;
+      data_len = tensor.tensor_content().size();
+      src_ptr = tensor.tensor_content().data();
+    } else if (elem_type == P_FP16) {
+      // paddle inference will support FLOAT16
+      // elem_size = 1;
+      // paddleTensor.dtype = paddle::PaddleDType::FLOAT16;
+      // data_len = tensor.tensor_content().size();
+      // src_ptr = tensor.tensor_content().data();
    } else if (elem_type == P_STRING) {
      // use paddle::PaddleDType::UINT8 as for String.
      elem_size = sizeof(char);
@@ -109,8 +145,18 @@ int GeneralReaderOp::inference() {
      // now only support single string
      for (int idx = 0; idx < tensor.data_size(); idx++) {
        data_len += tensor.data()[idx].length() + 1;
+        src_ptr = tensor.data()[idx].data();
      }
    }
+    VLOG(2) << "var[" << i << "] has elem type: " << elem_type << ";"
+            << "elem_size=" << elem_size << ";"
+            << "dtype=" << paddleTensor.dtype << ";"
+            << "data_len=" << data_len;
+    if (src_ptr == nullptr) {
+      LOG(ERROR) << "Not support var[" << i << "] with elem_type[" 
+                 << elem_type << "]";
+      continue;
+    }
    // implement lod tensor here
    // only support 1-D lod
    // TODO(HexToString): support 2-D lod
@@ -141,44 +187,17 @@ int GeneralReaderOp::inference() {
      VLOG(2) << "(logid=" << log_id << ") var[" << i
              << "] has lod_tensor and len=" << out->at(i).lod[0].back();
    }
-    if (elem_type == P_INT64) {
+    void* dst_ptr = out->at(i).data.data();
-      int64_t *dst_ptr = static_cast<int64_t *>(out->at(i).data.data());
+    if (!dst_ptr) {
-      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
+      LOG(ERROR) << "dst_ptr is nullptr";
-              << "] is " << tensor.int64_data(0);
+      return -1;
-      if (!dst_ptr) {
+    }
-        LOG(ERROR) << "dst_ptr is nullptr";
-        return -1;
+    // For common data, we just copy from src to dst
-      }
+    // For string data, we need to iterate through all str
-      memcpy(dst_ptr, tensor.int64_data().data(), databuf_size);
+    if (elem_type != P_STRING) {
-      /*
+      memcpy(dst_ptr, src_ptr, databuf_size);
-      int elem_num = tensor.int64_data_size();
+    } else {
-      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[k] = tensor.int64_data(k);
-      }
-      */
-    } else if (elem_type == P_FLOAT32) {
-      float *dst_ptr = static_cast<float *>(out->at(i).data.data());
-      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
-              << "] is " << tensor.float_data(0);
-      if (!dst_ptr) {
-        LOG(ERROR) << "dst_ptr is nullptr";
-        return -1;
-      }
-      memcpy(dst_ptr, tensor.float_data().data(), databuf_size);
-      /*int elem_num = tensor.float_data_size();
-      for (int k = 0; k < elem_num; ++k) {
-        dst_ptr[k] = tensor.float_data(k);
-      }*/
-    } else if (elem_type == P_INT32) {
-      int32_t *dst_ptr = static_cast<int32_t *>(out->at(i).data.data());
-      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
-              << "] is " << tensor.int_data(0);
-      if (!dst_ptr) {
-        LOG(ERROR) << "dst_ptr is nullptr";
-        return -1;
-      }
-      memcpy(dst_ptr, tensor.int_data().data(), databuf_size);
-    } else if (elem_type == P_STRING) {
      char *dst_ptr = static_cast<char *>(out->at(i).data.data());
      VLOG(2) << "(logid=" << log_id << ") first element data in var[" << i
              << "] is " << tensor.data(0);

--- a/core/general-server/op/general_response_op.cpp
+++ b/core/general-server/op/general_response_op.cpp
@@ -168,7 +168,24 @@ int GeneralResponseOp::inference() {
        google::protobuf::RepeatedField<int32_t> tmp_data(data_ptr,
                                                          data_ptr + cap);
        output->mutable_tensor(var_idx)->mutable_int_data()->Swap(&tmp_data);
-      }
+      } else if (dtype == paddle::PaddleDType::UINT8) {
+        tensor->set_elem_type(7);
+        VLOG(2) << "(logid=" << log_id << ")Prepare uint8 var ["
+                << model_config->_fetch_name[idx] << "].";
+        tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
+      } else if (dtype == paddle::PaddleDType::INT8) {
+        tensor->set_elem_type(8);
+        VLOG(2) << "(logid=" << log_id << ")Prepare int8 var ["
+                << model_config->_fetch_name[idx] << "].";
+        tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
+      } 
+      // inference will support fp16
+      //   else if (dtype == paddle::PaddleDType::FLOAT16) {
+      //   tensor->set_elem_type(5);
+      //   VLOG(2) << "(logid=" << log_id << ")Prepare float16 var ["
+      //           << model_config->_fetch_name[idx] << "].";
+      //   tensor->set_tensor_content(in->at(idx).data.data(), in->at(idx).data.length());
+      // }
      VLOG(2) << "(logid=" << log_id << ") fetch var ["
              << model_config->_fetch_name[idx] << "] ready";

--- a/core/general-server/proto/general_model_service.proto
+++ b/core/general-server/proto/general_model_service.proto
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-syntax = "proto2";
+syntax = "proto3";
 import "pds_option.proto";
 import "builtin_format.proto";
 package baidu.paddle_serving.predictor.general_model;
@@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model;
 option cc_generic_services = true;
 message Tensor {
-  repeated string data = 1;
+  // VarType: INT64
-  repeated int32 int_data = 2;
+  repeated int64 int64_data = 1;
-  repeated int64 int64_data = 3;
-  repeated float float_data = 4;
+  // VarType: FP32
-  optional int32 elem_type =
+  repeated float float_data = 2;
-      5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
-  repeated int32 shape = 6;       // shape should include batch
+  // VarType: INT32
-  repeated int32 lod = 7;         // only for fetch tensor currently
+  repeated int32 int_data = 3;
-  optional string name = 8;       // get from the Model prototxt
-  optional string alias_name = 9; // get from the Model prototxt
+  // VarType: FP64
+  repeated double float64_data = 4;
+  // VarType: UINT32
+  repeated uint32 uint32_data = 5;
+  // VarType: BOOL
+  repeated bool bool_data = 6;
+  // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
+  // represents the imaginary part
+  repeated float complex64_data = 7;
+  // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
+  // represents the imaginary part
+  repeated double complex128_data = 8;
+  // VarType: STRING
+  repeated string data = 9;
+  // Element types:
+  //   0 => INT64
+  //   1 => FP32
+  //   2 => INT32
+  //   3 => FP64
+  //   4 => INT16
+  //   5 => FP16
+  //   6 => BF16
+  //   7 => UINT8
+  //   8 => INT8
+  //   9 => BOOL
+  //  10 => COMPLEX64
+  //  11 => COMPLEX128
+  //  20 => STRING
+  int32 elem_type = 10;
+  // Shape of the tensor, including batch dimensions.
+  repeated int32 shape = 11;
+  // Level of detail (LoD), supports variable-length data; currently only set
+  // on fetch tensors.
+  repeated int32 lod = 12;
+  // Correspond to the variable 'name' in the model description prototxt.
+  string name = 13;
+  // Correspond to the variable 'alias_name' in the model description prototxt.
+  string alias_name = 14; // get from the Model prototxt
+  // VarType: FP16, INT16, INT8, BF16, UINT8
+  bytes tensor_content = 15;
 };
 message Request {
  repeated Tensor tensor = 1;
  repeated string fetch_var_names = 2;
-  optional bool profile_server = 3 [ default = false ];
+  bool profile_server = 3;
-  required uint64 log_id = 4 [ default = 0 ];
+  uint64 log_id = 4;
 };
 message Response {
  repeated ModelOutput outputs = 1;
  repeated int64 profile_time = 2;
+  // Error code
+  int32 err_no = 3;
+  // Error messages
+  string err_msg = 4;
 };
 message ModelOutput {
  repeated Tensor tensor = 1;
-  optional string engine_name = 2;
+  string engine_name = 2;
 }
 service GeneralModelService {

--- a/core/predictor/framework/infer.h
+++ b/core/predictor/framework/infer.h
@@ -443,7 +443,30 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> {
                 paddle::PaddleDType::INT32) {
        int32_t* data = static_cast<int32_t*>(origin_data);
        lod_tensor_in->CopyFromCpu(data);
+      } else if ((*tensorVector_in_pointer)[i].dtype ==
+                 paddle::PaddleDType::UINT8) {
+        uint8_t* data = static_cast<uint8_t*>(origin_data);
+        lod_tensor_in->CopyFromCpu(data);
+      } else if ((*tensorVector_in_pointer)[i].dtype ==
+                 paddle::PaddleDType::INT8) {
+        int8_t* data = static_cast<int8_t*>(origin_data);
+        lod_tensor_in->CopyFromCpu(data);
+      } else {
+        LOG(ERROR) << "Inference not support type["
+                   << (*tensorVector_in_pointer)[i].dtype
+                   << "],name[" << (*tensorVector_in_pointer)[i].name
+                   << "]" << " copy into core failed!";
      }
+      // Paddle inference will support FP16 in next version.
+      // else if ((*tensorVector_in_pointer)[i].dtype ==
+      //          paddle::PaddleDType::FLOAT16) {
+      //   paddle::platform::float16* data =
+      //       static_cast<paddle::platform::float16*>(origin_data);
+      //   lod_tensor_in->CopyFromCpu(data);
+      // }
+      VLOG(2) << "Tensor:name=" << (*tensorVector_in_pointer)[i].name
+              << ";in_dtype=" << (*tensorVector_in_pointer)[i].dtype
+              << ";tensor_dtype=" << lod_tensor_in->type();
    }
    // After the input data is passed in,
    // call 'core->Run()' perform the prediction process.
@@ -508,7 +531,41 @@ class FluidInferEngine : public CloneDBReloadableInferEngine<EngineCore> {
        int32_t* data_out = reinterpret_cast<int32_t*>(databuf_data);
        lod_tensor_out->CopyToCpu(data_out);
        databuf_char = reinterpret_cast<char*>(data_out);
+      } else if (dataType == paddle::PaddleDType::UINT8) {
+        databuf_size = out_num * sizeof(uint8_t);
+        databuf_data = MempoolWrapper::instance().malloc(databuf_size);
+        if (!databuf_data) {
+          LOG(ERROR) << "Malloc failed, size: " << databuf_size;
+          return -1;
+        }
+        uint8_t* data_out = reinterpret_cast<uint8_t*>(databuf_data);
+        lod_tensor_out->CopyToCpu(data_out);
+        databuf_char = reinterpret_cast<char*>(data_out);
+      } else if (dataType == paddle::PaddleDType::INT8) {
+        databuf_size = out_num * sizeof(int8_t);
+        databuf_data = MempoolWrapper::instance().malloc(databuf_size);
+        if (!databuf_data) {
+          LOG(ERROR) << "Malloc failed, size: " << databuf_size;
+          return -1;
+        }
+        int8_t* data_out = reinterpret_cast<int8_t*>(databuf_data);
+        lod_tensor_out->CopyToCpu(data_out);
+        databuf_char = reinterpret_cast<char*>(data_out);
      }
+      // Inference will support FP16 in next version
+      //  else if (dataType == paddle::PaddleDType::FLOAT16) {
+      //   using float16 = paddle::platform::float16;
+      //   databuf_size = out_num * sizeof(float16);
+      //   databuf_data = MempoolWrapper::instance().malloc(databuf_size);
+      //   if (!databuf_data) {
+      //     LOG(ERROR) << "Malloc failed, size: " << databuf_size;
+      //     return -1;
+      //   }
+      //   float16* data_out = reinterpret_cast<float16*>(databuf_data);
+      //   lod_tensor_out->CopyToCpu(data_out);
+      //   databuf_char = reinterpret_cast<char*>(data_out);
+      // }
      // Because task scheduling requires OPs to use 'Channel'
      // (which is a data structure) to transfer data between OPs.
      // We need to copy the processed data to the 'Channel' for the next OP.

--- a/core/sdk-cpp/proto/general_model_service.proto
+++ b/core/sdk-cpp/proto/general_model_service.proto
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-syntax = "proto2";
+syntax = "proto3";
 import "pds_option.proto";
 import "builtin_format.proto";
 package baidu.paddle_serving.predictor.general_model;
@@ -20,33 +20,88 @@ package baidu.paddle_serving.predictor.general_model;
 option cc_generic_services = true;
 message Tensor {
-  repeated string data = 1;
+  // VarType: INT64
-  repeated int32 int_data = 2;
+  repeated int64 int64_data = 1;
-  repeated int64 int64_data = 3;
-  repeated float float_data = 4;
+  // VarType: FP32
-  optional int32 elem_type =
+  repeated float float_data = 2;
-      5; // 0 means int64, 1 means float32, 2 means int32, 3 means string
-  repeated int32 shape = 6;       // shape should include batch
+  // VarType: INT32
-  repeated int32 lod = 7;         // only for fetch tensor currently
+  repeated int32 int_data = 3;
-  optional string name = 8;       // get from the Model prototxt
-  optional string alias_name = 9; // get from the Model prototxt
+  // VarType: FP64
+  repeated double float64_data = 4;
+  // VarType: UINT32
+  repeated uint32 uint32_data = 5;
+  // VarType: BOOL
+  repeated bool bool_data = 6;
+  // (No support)VarType: COMPLEX64, 2x represents the real part, 2x+1
+  // represents the imaginary part
+  repeated float complex64_data = 7;
+  // (No support)VarType: COMPLEX128, 2x represents the real part, 2x+1
+  // represents the imaginary part
+  repeated double complex128_data = 8;
+  // VarType: STRING
+  repeated string data = 9;
+  // Element types:
+  //   0 => INT64
+  //   1 => FP32
+  //   2 => INT32
+  //   3 => FP64
+  //   4 => INT16
+  //   5 => FP16
+  //   6 => BF16
+  //   7 => UINT8
+  //   8 => INT8
+  //   9 => BOOL
+  //  10 => COMPLEX64
+  //  11 => COMPLEX128
+  //  20 => STRING
+  int32 elem_type = 10;
+  // Shape of the tensor, including batch dimensions.
+  repeated int32 shape = 11;
+  // Level of detail (LoD), supports variable-length data; currently only set
+  // on fetch tensors.
+  repeated int32 lod = 12;
+  // Correspond to the variable 'name' in the model description prototxt.
+  string name = 13;
+  // Correspond to the variable 'alias_name' in the model description prototxt.
+  string alias_name = 14; // get from the Model prototxt
+  // VarType: FP16, INT16, INT8, BF16, UINT8
+  bytes tensor_content = 15;
 };
 message Request {
  repeated Tensor tensor = 1;
  repeated string fetch_var_names = 2;
-  optional bool profile_server = 3 [ default = false ];
+  bool profile_server = 3;
-  required uint64 log_id = 4 [ default = 0 ];
+  uint64 log_id = 4;
 };
 message Response {
  repeated ModelOutput outputs = 1;
  repeated int64 profile_time = 2;
+  // Error code
+  int32 err_no = 3;
+  // Error messages
+  string err_msg = 4;
 };
 message ModelOutput {
  repeated Tensor tensor = 1;
-  optional string engine_name = 2;
+  string engine_name = 2;
 }
 service GeneralModelService {