add compatible method to space2depth and depth2space ops

6982006f · yejianwu · 96452de2 · 6982006f · 6982006f
Showing 2 changed files with 68 additions and 21 deletions

mace/kernels/opencl/cl/depth_to_space.cl mace/kernels/opencl/cl/depth_to_space.cl +35 -0

mace/kernels/opencl/depth_to_space_opencl.cc mace/kernels/opencl/depth_to_space_opencl.cc +33 -21

File not found.
--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
@@ -3,11 +3,28 @@
 __kernel void depth_to_space(__read_only image2d_t input,
                             __private const int block_size,
                             __private const int output_depth,
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                             __write_only image2d_t output,
+                             __private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2) {
+#else
                             __write_only image2d_t output) {
+#endif
  const int out_d = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_h = get_global_id(2);
+#ifndef USE_QUALCOMM_OPENCL_2_0
+  if (out_d >= global_size_dim0 || out_w >= global_size_dim1
+      || out_h >= global_size_dim2) {
+    return;
+  }
+  const int output_width = global_size_dim1;
+#else
  const int output_width = get_global_size(1);
+#endif
  const int out_pos = mad24(out_d, output_width, out_w);
@@ -30,11 +47,29 @@ __kernel void depth_to_space(__read_only image2d_t input,
 __kernel void space_to_depth(__read_only image2d_t input,
                             __private const int block_size,
                             __private const int input_depth,
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                             __write_only image2d_t output,
+                             __private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2) {
+#else
                             __write_only image2d_t output) {
+#endif
  const int d = get_global_id(0);
  const int w = get_global_id(1);
  const int h = get_global_id(2);
+#ifndef USE_QUALCOMM_OPENCL_2_0
+  if (d >= global_size_dim0 || w >= global_size_dim1
+      || h >= global_size_dim2) {
+    return;
+  }
+  const int input_width = global_size_dim1;
+#else
  const int input_width = get_global_size(1);
+#endif
  const int in_pos = mad24(d, input_width, w);
  const int output_width = input_width / block_size;

--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -45,8 +45,11 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
  output->ResizeImage(output_shape, image_shape);
-  if (kernel_.get() == nullptr) {
  auto runtime = OpenCLRuntime::Global();
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
@@ -55,38 +58,47 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
    kernel_ =
        runtime->BuildKernel("depth_to_space", kernel_name, built_options);
  }
+  uint32_t gws[3];
+  std::stringstream ss;
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, block_size_);
    kernel_.setArg(idx++, depth_blocks);
    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
    if (d2s_) {
-    const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks),
+      gws[0] = static_cast<uint32_t>(depth_blocks);
-                             static_cast<uint32_t>(output_width),
+      gws[1] = static_cast<uint32_t>(output_width);
-                             static_cast<uint32_t>(output_height * batch)};
+      gws[2] = static_cast<uint32_t>(output_height * batch);
-    const std::vector<uint32_t> lws = {8, 16, 8, 1};
-    std::stringstream ss;
      ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_"
         << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
-    TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
    } else {
-    const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks),
+      gws[0] = static_cast<uint32_t>(depth_blocks);
-                             static_cast<uint32_t>(input_width),
+      gws[1] = static_cast<uint32_t>(input_width);
-                             static_cast<uint32_t>(input_height * batch)};
+      gws[2] = static_cast<uint32_t>(input_height * batch);
-    const std::vector<uint32_t> lws = {8, 16, 8, 1};
-    std::stringstream ss;
      ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
         << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
-    TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
    }
+    kernel_.setArg(idx++, gws[0]);
+    kernel_.setArg(idx++, gws[1]);
+    kernel_.setArg(idx++, gws[2]);
+    input_shape_ = input->shape();
+  }
+  const uint32_t kwg_size =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }
 template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;