Commit 52e13225 authored by Zhen Wang

add int8_t gemm and enable MulOp to support int8_t.

Parent: 1c893a02
@@ -32,7 +32,7 @@ template <typename Dtype>
 vector<string> OperatorBase<Dtype>::GetInputKeys() const {
   auto it = op_input_output_key.find(type_);
   if (it == op_input_output_key.end()) {
-    DLOG << type_ << " has no outputs";
+    DLOG << type_ << " has no inputs";
     return {};
   }
   return it->second.first;
......
@@ -338,10 +338,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
   for (int i = 0; i < tensor.numel(); i += stride) {
     if (tensor.type() == typeid(float)) {
       printer << tensor.data<float>()[i] << " ";
+    } else if (tensor.type() == typeid(int32_t)) {
+      printer << tensor.data<int32_t>()[i] << " ";
     } else if (tensor.type() == typeid(int64_t)) {
       printer << tensor.data<int64_t>()[i] << " ";
     } else if (tensor.type() == typeid(int8_t)) {
-      printer << tensor.data<int8_t>()[i] << " ";
+      printer << static_cast<int32_t>(tensor.data<int8_t>()[i]) << " ";
     }
   }
 #endif
......
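A note on the change above: streaming an int8_t through an ostream-style
printer picks the character overload (int8_t is typically an alias for
signed char), so the raw value would render as an often-unprintable
character instead of a number. Widening to int32_t first forces numeric
output. A minimal sketch of the pitfall, using std::cout for illustration:

#include <cstdint>
#include <iostream>

int main() {
  int8_t v = 65;
  std::cout << v << "\n";                        // prints "A" (char overload)
  std::cout << static_cast<int32_t>(v) << "\n";  // prints "65"
  return 0;
}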
@@ -25,12 +25,15 @@ bool MulKernel<CPU, float>::Init(MulParam<CPU> *param) {
   return true;
 }

 template <>
 void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
   MulCompute<float>(param);
   param.Out()->set_lod(param.InputX()->lod());
 }

+template class MulKernel<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
......
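The newly added "template class MulKernel<CPU, float>;" is an explicit
instantiation definition: it forces the compiler to emit MulKernel<CPU,
float>'s members in this translation unit, so the template's definitions can
stay out of the header while other files still link against it (the matching
line is dropped from the compute header two hunks below). A generic sketch of
the pattern, with illustrative names not taken from this repo:

// kernel.h -- declaration only.
template <typename T>
struct Kernel {
  void Compute(const T &in) const;
};

// kernel.cpp -- definition plus explicit instantiation, so code that only
// sees kernel.h can still link against Kernel<float>::Compute.
template <typename T>
void Kernel<T>::Compute(const T &in) const { /* ... */ }

template struct Kernel<float>;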
@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) {
   const Tensor *input_x = param.InputX();
   const Tensor *input_y = param.InputY();
   Tensor *out = param.Out();
-  out->mutable_data<float>();
   const Tensor x_matrix =
       input_x->dims().size() > 2
           ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) {
   if (out_dim.size() != 2) {
     out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
   }
-  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(0));
+  if (param.InputX()->type() == typeid(int8_t)) {
+    out->mutable_data<int32_t>();
+    math::matmul<int8_t>(x_matrix, false, y_matrix, false,
+                         static_cast<int8_t>(1), out, static_cast<int8_t>(0));
+  } else {
+    out->mutable_data<float>();
+    math::matmul<float>(x_matrix, false, y_matrix, false,
+                        static_cast<float>(1), out, static_cast<float>(0));
+  }
   if (out_dim.size() != 2) {
     out->Resize(out_dim);
   }
 }
-template class MulKernel<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
......
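Why the int8_t path writes int32_t output: the product of two 8-bit values
needs up to 16 bits, and accumulating k of them overflows 16 bits after only
a couple of terms, so a 32-bit accumulator is the conventional choice for
quantized matmul. A quick bound check for the shapes used in the test below
(operands drawn from [-127, 127], inner dimension k = 1024):

#include <cstdint>
#include <iostream>

int main() {
  const int64_t max_prod = 127LL * 127;  // 16129: fits int16, but barely --
                                         // two such terms nearly saturate it
  const int64_t k = 1024;                // inner dimension of the matmul
  std::cout << max_prod * k << "\n";     // 16516096, comfortably inside int32
  return 0;
}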
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include <stdint-gcc.h>
 #include <string>

 #include "common/log.h"
......
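The dropped include is a GCC-internal header: stdint-gcc.h is a detail of
GCC's own installation and is not guaranteed to exist under other toolchains
(clang-based Android NDKs, for instance), so relying on it breaks portable
builds. A minimal sketch of the standard spelling, which provides the same
fixed-width types everywhere:

#include <cstdint>  // portable home of int8_t, int32_t, int64_t, ...

int8_t quantized = -128;
int32_t accumulator = 0;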
This diff is collapsed.
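The gemm changes themselves are collapsed in this view. For orientation, here
is a naive reference for what an int8 gemm with int32 accumulation computes.
This is an illustrative sketch only (hypothetical name, row-major layout
assumed), not the collapsed, optimized implementation:

#include <cstdint>

// C (int32, m x n) = A (int8, m x k) * B (int8, k x n), all row-major.
void ReferenceGemmS8S8S32(int32_t m, int32_t n, int32_t k, const int8_t *A,
                          const int8_t *B, int32_t *C) {
  for (int32_t i = 0; i < m; ++i) {
    for (int32_t j = 0; j < n; ++j) {
      int32_t acc = 0;  // 32-bit accumulation avoids 8/16-bit overflow
      for (int32_t p = 0; p < k; ++p) {
        acc += static_cast<int32_t>(A[i * k + p]) *
               static_cast<int32_t>(B[p * n + j]);
      }
      C[i * n + j] = acc;
    }
  }
}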
@@ -12,80 +12,80 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/mul_op.h"

-int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(g_resnet);
-  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
-                        "program file read fail");
-
-  Executor4Test<paddle_mobile::CPU,
-                paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>>
-      executor(program, "mul");
-
-  // 1. input_tensors;
-  vector<Tensor> input_tensors;
-
-  Tensor input1;
-  auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1);
-  input_tensors.push_back(input1);
-  Tensor input2;
-  auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1);
-  input_tensors.push_back(input2);
-
-  // 2. input_names
-  vector<string> input_names({
-      "pool2d_0.tmp_0",
-      "fc_0.w_0",
-  });
-
-  // 3. output_names
-  vector<string> output_names({"fc_0.tmp_0"});
-
-  // 4. out_dims;
-  vector<DDim> out_ddims;
-  auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
-  out_ddims.push_back(out_ddim);
-
-  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
-                                            output_names, out_ddims);
-
-  auto output0_data = output[0]->data<float>();
-
-  auto dim_1 = input1.numel() / input1.dims()[0];
-  DLOG << " input1 : ";
-  for (int i = 0; i < input1.dims()[0]; ++i) {
-    for (int j = 0; j < dim_1; ++j) {
-      DLOGF("%f ", input1_data[i * dim_1 + j]);
-    }
-    DLOGF("\n");
-  }
-
-  auto dim_2 = input2.numel() / input2.dims()[0];
-  DLOG << " input2 : ";
-  for (int i = 0; i < input2.dims()[0]; ++i) {
-    for (int j = 0; j < dim_2; ++j) {
-      DLOGF("%f ", input2_data[i * dim_2 + j]);
-    }
-    DLOGF("\n");
-  }
-
-  auto dim_output0 = output[0]->numel() / output[0]->dims()[0];
-  DLOG << " output : ";
-  for (int i = 0; i < output[0]->dims()[0]; ++i) {
-    for (int j = 0; j < dim_output0; ++j) {
-      DLOGF("%f ", output0_data[i * dim_2 + j]);
-    }
-    DLOGF("\n");
-  }
-
-  /// output (3,3)
-  DLOG << "output memory size : " << output[0]->memory_size();
-  DLOG << "output numel : " << output[0]->numel();
-  DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
-       << " x " << input2_data[0 + 3] << " = " << output0_data[0];
-  return 0;
-}
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c(i, j) c[(i)*ldc + (j)]
+
+namespace paddle_mobile {
+using framework::AttributeMap;
+using framework::DDim;
+using framework::Scope;
+using framework::make_ddim;
+
+template <typename I, typename O>
+int TestMulOP() {
+  int32_t m = 1024;
+  int32_t n = 1024;
+  int32_t k = 1024;
+  int32_t lda = k;
+  int32_t ldb = n;
+  int32_t ldc = n;
+  DDim inputA_shape = make_ddim({m, k});
+  DDim inputB_shape = make_ddim({k, n});
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<Scope>();
+  inputs["X"] = std::vector<std::string>({"inputA"});
+  inputs["Y"] = std::vector<std::string>({"inputB"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+
+  auto inputA_var = scope.get()->Var("inputA");
+  auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<I>(inputA, inputA_shape, -127, 127);
+  auto inputB_var = scope.get()->Var("inputB");
+  auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<I>(inputB, inputB_shape, -127, 127);
+
+  auto output_var = scope.get()->Var("output");
+  AttributeMap attrs;
+  attrs["x_num_col_dims"].Set<int>(1);
+  attrs["y_num_col_dims"].Set<int>(1);
+  auto *op =
+      new operators::MulOp<CPU, float>("mul", inputs, outputs, attrs, scope);
+  op->InferShape();
+  op->Run();
+  auto output = output_var->template Get<framework::LoDTensor>();
+  const O *output_data = output->data<O>();
+
+  // compare
+  O *c = static_cast<O *>(memory::Alloc(sizeof(O) * m * n));
+  I *a = inputA->data<I>();
+  I *b = inputB->data<I>();
+  for (int32_t i = 0; i < m; ++i) {
+    for (int32_t j = 0; j < n; ++j) {
+      O r = 0;
+      for (int32_t p = 0; p < k; p++) {
+        r += static_cast<O>(a(i, p)) * static_cast<O>(b(p, j));
+      }
+      c(i, j) = r;
+    }
+  }
+
+  for (int32_t i = 0; i < m * n; ++i) {
+    PADDLE_MOBILE_ENFORCE(
+        output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
+        static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
+  }
+  DLOG << "Run MulOp successfully!";
+  delete op;
+  return 0;
+}
+}  // namespace paddle_mobile
+
+int main() {
+  paddle_mobile::TestMulOP<int8_t, int32_t>();
+  paddle_mobile::TestMulOP<float, float>();
+  return 0;
+}
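Two details of the rewritten test worth noting. The a(i, j) / b(i, j) /
c(i, j) macros are row-major indexing with an explicit leading dimension
(a(i, j) expands to a[(i)*lda + (j)], following BLAS conventions). And the
element-wise equality check is sound for the <int8_t, int32_t> instantiation
because both the kernel and the reference loop do exact integer arithmetic;
for the <float, float> instantiation it additionally assumes the optimized
gemm accumulates in an order that reproduces the naive loop bit-for-bit.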