Commit a9832dfb authored by yejianwu

Merge branch 'master' of v9.git.n.xiaomi.com:deep-learning/mace into bm_to_image

@@ -10,15 +10,23 @@
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
struct AddNFunctor {
void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
struct AddNFunctorBase {};
template <DeviceType D, typename T>
struct AddNFunctor : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
output_tensor->ResizeLike(input_tensors[0]);
Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size();
T *output_ptr = output_tensor->mutable_data<T>();
memset(output_ptr, 0, size * sizeof(T));
int n = input_tensors.size();
for (int i = 0; i < n; ++i) {
MACE_CHECK(input_tensors[i]->dim(0) == output_tensor->dim(0));
MACE_CHECK(input_tensors[i]->dim(1) == output_tensor->dim(1));
MACE_CHECK(input_tensors[i]->dim(2) == output_tensor->dim(2));
MACE_CHECK(input_tensors[i]->dim(3) == output_tensor->dim(3));
Tensor::MappingGuard input_map(input_tensors[i]);
const T *input_ptr = input_tensors[i]->data<T>();
for (index_t j = 0; j < size; ++j) {
@@ -28,15 +36,17 @@ struct AddNFunctor {
}
};
template<>
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
template<>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(
std::vector<const Tensor *> &inputs, Tensor *output);
template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ADDN_H_
\ No newline at end of file
#endif // MACE_KERNELS_ADDN_H_
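Note on the header change above: operator() now takes the input vector by const reference, and the OPENCL path becomes a partial specialization of the template (sharing AddNFunctorBase), so float and half reuse one definition. A standalone sketch of that dispatch pattern, with illustrative names (Device, Functor) rather than MACE's:

// Sketch only: a shared base plus a per-device partial specialization.
#include <cstdio>
#include <vector>

enum class Device { CPU, OPENCL };

struct FunctorBase {};  // shared configuration could live here

template <Device D, typename T>
struct Functor : FunctorBase {  // generic (CPU) path
  void operator()(const std::vector<T> &inputs, T *output) {
    T acc = T(0);
    for (const T &v : inputs) acc += v;
    *output = acc;
  }
};

template <typename T>
struct Functor<Device::OPENCL, T> : FunctorBase {  // device override
  void operator()(const std::vector<T> &inputs, T *output);  // defined in a .cc
};

int main() {
  std::vector<float> in{1.f, 2.f, 3.f};
  float out = 0.f;
  Functor<Device::CPU, float>()(in, &out);
  std::printf("%f\n", out);  // 6.0
}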
@@ -10,7 +10,7 @@ namespace kernels {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
// TODO: neon mem copy
index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>();
@@ -51,4 +51,4 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
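The "// TODO: neon mem copy" above implies the NEON specialization still accumulates with a plain loop. A hedged sketch of the vectorized inner loop it could use (NeonAccumulate is an illustrative helper, not a MACE function; assumes contiguous float buffers, scalar tail):

#include <arm_neon.h>

void NeonAccumulate(const float *input, float *output, long size) {
  long i = 0;
  for (; i + 4 <= size; i += 4) {
    float32x4_t acc = vld1q_f32(output + i);    // running sum
    float32x4_t in = vld1q_f32(input + i);
    vst1q_f32(output + i, vaddq_f32(acc, in));  // output += input
  }
  for (; i < size; ++i) output[i] += input[i];  // scalar tail
}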
@@ -5,52 +5,83 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
index_t element_size = input0->NumElements();
index_t blocks = (element_size + 3) / 4;
template <typename T>
static void AddN(const std::vector<const Tensor *> &input_tensors,
Tensor *output) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const uint32_t gws = blocks;
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(output->dtype()));
auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
uint32_t idx = 0;
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer())));
addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer())));
addn_kernel.setArg(idx++, static_cast<int32_t>(element_size));
addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
for (auto input : input_tensors) {
addn_kernel.setArg(idx++,
*(static_cast<const cl::Image2D *>(input->buffer())));
}
addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
cl::NDRange(width_pixels, batch_height_pixels),
cl::NDRange(64, 16), // TODO fix this
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
}
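The global work size above follows from the image layout: each texel packs 4 consecutive channels, so the image is RoundUpDiv4(C) * W texels wide and N * H tall. A standalone check of that arithmetic (RoundUpDiv4 reimplemented here just for illustration):

#include <cstdint>
#include <cstdio>

static int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

int main() {
  // Sample NHWC dims matching the benchmark shapes below.
  int64_t n = 1, h = 240, w = 240, c = 256;
  int64_t channel_blocks = RoundUpDiv4(c);    // 64 texels along C
  int64_t width_pixels = channel_blocks * w;  // image width  = 15360
  int64_t batch_height_pixels = n * h;        // image height = 240
  std::printf("gws = %lld x %lld\n", (long long)width_pixels,
              (long long)batch_height_pixels);
}

The hard-coded cl::NDRange(64, 16) local size (flagged "// TODO fix this") happens to divide 15360 x 240 for this shape, but is not safe for arbitrary shapes, hence the TODO.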
template<>
void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
if (input_tensors.empty() || input_tensors.front() == nullptr) {
return;
}
template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
switch (size) {
case 2:Add2(input_tensors[0], input_tensors[1], output_tensor);
break;
default:MACE_NOT_IMPLEMENTED;
for (int i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor);
};
template
struct AddNFunctor<DeviceType::OPENCL, float>;
template
struct AddNFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
} // namespace mace
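The trailing "template struct AddNFunctor<DeviceType::OPENCL, float>;" and "... half>;" lines are explicit instantiations: the template's operator() body stays in this .cc while addn.h only declares it. A minimal illustration of the pattern:

// f.h
template <typename T> struct F { void Run(); };
// f.cc
template <typename T> void F<T>::Run() {}
template struct F<float>;  // emits F<float>::Run() into f.cc's object file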
#include <common.h>
// Supported data type: half/float
__kernel void add2(__global const DATA_TYPE *input0,
__global const DATA_TYPE *input1,
__private const int size,
__global DATA_TYPE *output) {
int idx = get_global_id(0);
__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1,
#if INPUT_NUM > 2
__read_only image2d_t input2,
#endif
#if INPUT_NUM > 3
__read_only image2d_t input3,
#endif
__write_only image2d_t output) {
const int w = get_global_id(0);
const int hb = get_global_id(1);
if (idx + 4 > size) {
for(; idx < size; ++idx) {
*(output+idx) = *(input0+idx) + *(input1+idx);
}
} else {
VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
vstore4(in_data0+in_data1, idx, output);
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
DATA_TYPE4 out = in0 + in1;
#if INPUT_NUM > 2
DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
out = out + in2;
#endif
#if INPUT_NUM > 3
DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
out = out + in3;
#endif
WRITE_IMAGET(output, (int2)(w, hb), out);
}
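Each work item of the kernel above handles one texel (4 packed channels) at (w, hb), and the #if INPUT_NUM chain caps the kernel at 4 input images, matching the MACE_NOT_IMPLEMENTED guard on the host side. A CPU reference of what the kernel computes, as a sketch (image reads become indexing into a row-major, zero-padded buffer; one loop iteration = one work item):

#include <vector>

void AddNReference(const std::vector<const float *> &inputs, float *output,
                   int width_pixels, int batch_height_pixels) {
  for (int hb = 0; hb < batch_height_pixels; ++hb) {
    for (int w = 0; w < width_pixels; ++w) {
      const int base = (hb * width_pixels + w) * 4;  // 4 channels per texel
      for (int k = 0; k < 4; ++k) {
        float acc = 0.f;
        for (const float *in : inputs) acc += in[base + k];
        output[base + k] = acc;
      }
    }
  }
}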
@@ -23,4 +23,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.Build(),
AddNOp<DeviceType::OPENCL, float>);
REGISTER_OPENCL_OPERATOR(OpKeyBuilder("AddN")
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
} // namespace mace
@@ -10,7 +10,7 @@
namespace mace {
template<DeviceType D, class T>
template <DeviceType D, class T>
class AddNOp : public Operator<D, T> {
public:
AddNOp(const OperatorDef &operator_def, Workspace *ws)
@@ -18,7 +18,6 @@ class AddNOp : public Operator<D, T> {
bool Run() override {
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr);
for (int i = 0; i < n; ++i) {
......
@@ -9,47 +9,69 @@
namespace mace {
template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int n, int size) {
static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < n; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
// Add input data
for (int i = 0; i < inputs; ++i) {
net.AddRandomInput<D, float>(
internal::MakeString("Input", i).c_str(), {n, h, w, c});
}
op_def_builder.Output("Output").Finalize(net.NewOperatorDef());
// Add input data
for (int i = 0; i < n; ++i) {
net.AddRandomInput<DeviceType::CPU, float>(internal::MakeString("Input", i).c_str(), {size});
if (D == DeviceType::OPENCL) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
internal::MakeString("InputImage", i).c_str(),
kernels::BufferType::IN_OUT);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(internal::MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Sync();
}
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Sync();
}
}
#define BM_ADDN_MACRO(N, SIZE, TYPE, DEVICE) \
static void BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE(int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * SIZE; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, N, SIZE); \
} \
BENCHMARK(BM_ADDN_##N##_##SIZE##_##TYPE##_##DEVICE)
#define BM_ADDN(N, SIZE, TYPE) \
BM_ADDN_MACRO(N, SIZE, TYPE, CPU); \
BM_ADDN_MACRO(N, SIZE, TYPE, NEON);
BM_ADDN(10, 1000, float);
BM_ADDN(10, 10000, float);
BM_ADDN(100, 1000, float);
BM_ADDN(100, 10000, float);
} // namespace mace
\ No newline at end of file
#define BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, DEVICE) \
static void BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
AddNBenchmark<DEVICE, TYPE>(iters, INPUTS, N, H, W, C); \
} \
BENCHMARK(BM_ADDN_##INPUTS##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
#define BM_ADDN(INPUTS, N, H, W, C, TYPE) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(2, 1, 240, 240, 256, float);
// BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(4, 1, 240, 240, 256, float);
// BM_ADDN(4, 1, 240, 240, 256, half);
} // namespace mace
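One observation on the macro above: tot counts the elements of a single tensor, so BytesProcessed reports roughly the write traffic only. If total memory traffic is wanted, a hedged tweak (INPUTS reads plus 1 write) would be:

mace::testing::BytesProcessed(tot * (INPUTS + 1) * (sizeof(TYPE)));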
@@ -9,7 +9,7 @@ namespace mace {
class AddnOpTest : public OpsTestBase {};
template<DeviceType D>
template <DeviceType D>
void SimpleAdd2() {
// Construct graph
OpsTestNet net;
@@ -20,30 +20,26 @@ void SimpleAdd2() {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run
net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12});
auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(AddnOpTest, CPUSimpleAdd2) {
SimpleAdd2<DeviceType::CPU>();
}
TEST_F(AddnOpTest, CPUSimpleAdd2) { SimpleAdd2<DeviceType::CPU>(); }
TEST_F(AddnOpTest, NEONSimpleAdd2) {
SimpleAdd2<DeviceType::NEON>();
}
/*
TEST_F(AddnOpTest, NEONSimpleAdd2) { SimpleAdd2<DeviceType::NEON>(); }
TEST_F(AddnOpTest, OPENCLSimpleAdd2) {
SimpleAdd2<DeviceType::OPENCL>();
}
TEST_F(AddnOpTest, OPENCLSimpleAdd2) { SimpleAdd2<DeviceType::OPENCL>(); }
*/
template<DeviceType D>
template <DeviceType D>
void SimpleAdd3() {
// Construct graph
OpsTestNet net;
@@ -55,62 +51,80 @@ void SimpleAdd3() {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input1", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input2", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, {1, 2, 3, 4, 5, 6});
// Run
net.RunOp(D);
auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18});
auto expected = CreateTensor<float>({1, 2, 3, 1}, {3, 6, 9, 12, 15, 18});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(AddnOpTest, CPUSimpleAdd3) {
SimpleAdd3<DeviceType::CPU>();
}
TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
TEST_F(AddnOpTest, NEONSimpleAdd3) {
SimpleAdd3<DeviceType::NEON>();
}
/*
TEST_F(AddnOpTest, NEONSimpleAdd3) { SimpleAdd3<DeviceType::NEON>(); }
*/
template<DeviceType D>
template <DeviceType D>
void RandomTest() {
// Construct graph
OpsTestNet net;
OpDefBuilder("AddN", "AddNTest")
.Input("Input1")
.Input("Input2")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4});
net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4});
// Check
net.RunOp(D);
Tensor result;
result.Copy(*net.GetOutput("Output"));
// Run
net.RunOp();
ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5);
}
TEST_F(AddnOpTest, CPURandom) {
RandomTest<DeviceType::CPU>();
testing::internal::LogToStderr();
srand(time(NULL));
for (int round = 0; round < 10; ++round) {
// generate random input
index_t n = 1 + (rand() % 5);
index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
index_t c = 1 + (rand() % 32);
int input_num = 2 + rand() % 3;
// Construct graph
OpsTestNet net;
auto op_def = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def.Input("Input" + ToString(i));
}
op_def.Output("Output").Finalize(net.NewOperatorDef());
// Add input data
for (int i = 0; i < input_num; ++i) {
net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
}
// run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(net, "Input" + ToString(i),
"InputImage" + ToString(i),
kernels::BufferType::IN_OUT);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input("InputImage" + ToString(i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.1);
}
}
TEST_F(AddnOpTest, NEONRandom) {
RandomTest<DeviceType::NEON>();
}
TEST_F(AddnOpTest, OPENCLRandom) {
RandomTest<DeviceType::OPENCL>();
}
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
} // namespace mace