update relu buffer to image

b7a95857 · yejianwu · fd284f6a · b7a95857 · b7a95857 · b7a95857
7 changed file
--- a/mace/kernels/opencl/cl/relu.cl
+++ b/mace/kernels/opencl/cl/relu.cl
 #include <common.h>
 // Supported data type: half/float
-__kernel void relu(__global const DATA_TYPE *input,
+__kernel void relu(__read_only image2d_t input,
-                   __private const int size,
+                   __write_only image2d_t output) {
-                   __global DATA_TYPE *output) {
+  const int ch_blk = get_global_id(0);
-  int idx = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);
-  if (idx + 4 > size) {
+  const int pos = ch_blk * width + w;
-    for(; idx < size; ++idx) {
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
-      *(output+idx) = fmax(*(input+idx), 0);
+  DATA_TYPE4 out = fmax(in, (DATA_TYPE4)0);
-    }
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0);
-    vstore4(data, idx, output);
-  }
 }
-__kernel void relux(__global const DATA_TYPE *input,
+__kernel void relux(__read_only image2d_t input,
                    __private const DATA_TYPE max_limit,
-                    __private const int size,
+                    __write_only image2d_t output) {
-                    __global DATA_TYPE *output) {
+  const int ch_blk = get_global_id(0);
-  int idx = get_global_id(0);
+  const int w = get_global_id(1);
+  const int hb = get_global_id(2);
+  const int width = get_global_size(1);
-  if (idx + 4 > size) {
+  const int pos = ch_blk * width + w;
-    for(; idx < size; ++idx) {
+  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
-      *(output+idx) = clamp(*(input+idx), 0.0f, max_limit);
+  DATA_TYPE4 out = clamp(in, (DATA_TYPE4)0, (DATA_TYPE4)max_limit);
-    }
+  WRITE_IMAGET(output, (int2)(pos, hb), out);
-  } else {
-    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit);
-    vstore4(data, idx, output);
-  }
 }
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -6,58 +6,70 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
 namespace mace {
 namespace kernels {
-template <>
+template <typename T>
-void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
+void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                        Tensor *output) {
-  index_t element_size = input->NumElements();
+  const index_t batch = input->dim(0);
-  index_t blocks = (element_size + 3) / 4;
+  const index_t height = input->dim(1);
+  const index_t width = input->dim(2);
+  const index_t channels = input->dim(3);
-  const uint32_t gws = blocks;
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto dt = DataTypeToEnum<T>::value;
+  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  if (max_limit_ < 0) {
    auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
-    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t lws[3] = {1, kwg_size, 1};
    uint32_t idx = 0;
    relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-    relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
    relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
        relu_kernel, cl::NullRange,
-        cl::NDRange(gws),
+        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws),
+        cl::NDRange(lws[0], lws[1], lws[2]),
        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
    MACE_CHECK(error == CL_SUCCESS);
  } else {
    auto relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+    const uint32_t lws[3] = {1, kwg_size, 1};
    uint32_t idx = 0;
    relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
    relu_kernel.setArg(idx++, max_limit_);
-    relu_kernel.setArg(idx++, static_cast<int32_t>(element_size));
    relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
        relu_kernel, cl::NullRange,
-        cl::NDRange(gws),
+        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws),
+        cl::NDRange(lws[0], lws[1], lws[2]),
        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
    MACE_CHECK(error == CL_SUCCESS);
  }
 }
+template
+struct ReluFunctor<DeviceType::OPENCL, float>;
+template
+struct ReluFunctor<DeviceType::OPENCL, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/relu.h
+++ b/mace/kernels/relu.h
@@ -33,11 +33,15 @@ struct ReluFunctor {
 template <>
 void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                                                      Tensor *output);
-template <>
-void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
+template <typename T>
-                                                        Tensor *output);
+struct ReluFunctor<DeviceType::OPENCL, T> {
+  T max_limit_;
+  void operator()(const Tensor *input, Tensor *output);
+};
 }  //  namespace kernels
 }  //  namespace mace
 #endif  // MACE_KERNELS_RELU_H_
\ No newline at end of file
--- a/mace/ops/relu.cc
+++ b/mace/ops/relu.cc
@@ -12,5 +12,14 @@ REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>);
 #endif  // __ARM_NEON
-REGISTER_OPENCL_OPERATOR(Relu, ReluOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         ReluOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Relu")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         ReluOp<DeviceType::OPENCL, half>);
 }  //  namespace mace
--- a/mace/ops/relu.h
+++ b/mace/ops/relu.h
@@ -16,7 +16,7 @@ class ReluOp : public Operator<D, T> {
  ReluOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<D, T>(operator_def, ws) {
    functor_.max_limit_ =
-        OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
+        OperatorBase::GetSingleArgument<float>("max_limit", static_cast<float>(-1));
  }
  bool Run() override {
    const Tensor *input_tensor = this->inputs_[0];

--- a/mace/ops/relu_benchmark.cc
+++ b/mace/ops/relu_benchmark.cc
@@ -9,17 +9,28 @@
 namespace mace {
 template <DeviceType D, typename T>
-static void ReluBenchmark(int iters, int size) {
+static void ReluBenchmark(
+    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();
  OpsTestNet net;
-  OpDefBuilder("Relu", "ReluBM")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
  // Add input data
-  net.AddRandomInput<D, float>("Input", {size});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluBM")
+        .Input("InputImage")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("Relu", "ReluBM")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+  }
  // Warm-up
  for (int i = 0; i < 5; ++i) {
@@ -34,21 +45,23 @@ static void ReluBenchmark(int iters, int size) {
  net.Sync();
 }
-#define BM_RELU_MACRO(SIZE, TYPE, DEVICE)                     \
+#define BM_RELU_MACRO(N, C, H, W, TYPE, DEVICE)                      \
-  static void BM_RELU_##SIZE##_##TYPE##_##DEVICE(int iters) { \
+  static void BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE(int iters) {  \
-    const int64_t tot = static_cast<int64_t>(iters) * SIZE;   \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::ItemsProcessed(tot);                       \
+    mace::testing::ItemsProcessed(tot);                              \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));       \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
-    ReluBenchmark<DEVICE, TYPE>(iters, SIZE);                 \
+    ReluBenchmark<DEVICE, TYPE>(iters, N, C, H, W);                  \
-  }                                                           \
+  }                                                                  \
-  BENCHMARK(BM_RELU_##SIZE##_##TYPE##_##DEVICE)
+  BENCHMARK(BM_RELU_##N##C##H##W##_##TYPE##_##DEVICE)
-#define BM_RELU(SIZE, TYPE)       \
+#define BM_RELU(N, C, H, W, TYPE)       \
-  BM_RELU_MACRO(SIZE, TYPE, CPU); \
+  BM_RELU_MACRO(N, C, H, W, TYPE, CPU); \
-  BM_RELU_MACRO(SIZE, TYPE, NEON);\
+  BM_RELU_MACRO(N, C, H, W, TYPE, NEON);\
-  BM_RELU_MACRO(SIZE, TYPE, OPENCL);
+  BM_RELU_MACRO(N, C, H, W, TYPE, OPENCL);
-BM_RELU(1000, float);
+BM_RELU(1, 1, 512, 512, float);
-BM_RELU(100000, float);
+BM_RELU(1, 3, 128, 128, float);
-BM_RELU(10000000, float);
+BM_RELU(1, 3, 512, 512, float);
-}  //  namespace mace
+BM_RELU(1, 32, 112, 112, float);
\ No newline at end of file
+BM_RELU(1, 64, 256, 256, float);
+}  //  namespace mace
--- a/mace/ops/relu_test.cc
+++ b/mace/ops/relu_test.cc
@@ -12,10 +12,6 @@ class ReluOpTest : public OpsTestBase {};
 template <DeviceType D>
 void TestSimple() {
  OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
  // Add input data
  net.AddInputFromArray<D, float>("Input",
@@ -23,8 +19,28 @@ void TestSimple() {
                                  {-7, 7, -6, 6, -5, 5, -4, 4,
                                   -3, 3, -2, 2, -1, 1, 0, 0});
-  // Run
+  if (D == DeviceType::OPENCL) {
-  net.RunOp(D);
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
  auto expected = CreateTensor<float>({2, 2, 2, 2},
                                      {0, 7, 0, 6, 0, 5, 0, 4,
@@ -48,20 +64,36 @@ TEST_F(ReluOpTest, OPENCLSimple) {
 template <DeviceType D>
 void TestUnalignedSimple() {
  OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
  // Add input data
  net.AddInputFromArray<D, float>("Input",
-                                  {1, 1, 3, 2},
+                                  {1, 3, 2, 1},
                                  {-7, 7, -6, 6, -5, 5});
-  // Run
+  if (D == DeviceType::OPENCL) {
-  net.RunOp(D);
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
-  auto expected = CreateTensor<float>({1, 1, 3, 2},
+  auto expected = CreateTensor<float>({1, 3, 2, 1},
                                      {0, 7, 0, 6, 0, 5});
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
@@ -82,11 +114,6 @@ TEST_F(ReluOpTest, OPENCLUnalignedSimple) {
 template <DeviceType D>
 void TestSimpleReluX() {
  OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .AddFloatArg("max_limit", 6)
-      .Finalize(net.NewOperatorDef());
  // Add input data
  net.AddInputFromArray<D, float>("Input",
@@ -94,8 +121,30 @@ void TestSimpleReluX() {
                                  {-7, 7, -6, 6, -5, 5, -4, 4,
                                   -3, 3, -2, 2, -1, 1, 0, 0});
-  // Run
+  if (D == DeviceType::OPENCL) {
-  net.RunOp(D);
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
  auto expected = CreateTensor<float>({2, 2, 2, 2},
                                      {0, 6, 0, 6, 0, 5, 0, 4,
@@ -119,21 +168,38 @@ TEST_F(ReluOpTest, OPENCLSimpleReluX) {
 template <DeviceType D>
 void TestUnalignedSimpleReluX() {
  OpsTestNet net;
-  OpDefBuilder("Relu", "ReluTest")
-      .Input("Input")
-      .Output("Output")
-      .AddFloatArg("max_limit", 6)
-      .Finalize(net.NewOperatorDef());
  // Add input data
  net.AddInputFromArray<D, float>("Input",
-                                  {1, 1, 1, 7},
+                                  {1, 1, 7, 1},
                                  {-7, 7, -6, 6, -5, 5, -4});
-  // Run
+  if (D == DeviceType::OPENCL) {
-  net.RunOp(D);
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("InputImage")
+        .Output("OutputImage")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Relu", "ReluTest")
+        .Input("Input")
+        .Output("Output")
+        .AddFloatArg("max_limit", 6)
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
-  auto expected = CreateTensor<float>({1, 1, 1, 7},
+  auto expected = CreateTensor<float>({1, 1, 7, 1},
                                      {0, 6, 0, 6, 0, 5, 0});
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);