提交 6cc1d13d 编写于 作者: L Liangliang He

Update opencl kernel

上级 2a7274f4
......@@ -14,15 +14,11 @@ cc_library(
srcs = glob([
"runtime/opencl/cl.hpp",
"runtime/opencl/cl2.hpp",
"runtime/opencl/opencl_allocator.cc",
"runtime/opencl/opencl_wrapper.cc",
"runtime/opencl/opencl_runtime.cc",
"runtime/opencl/*.cc",
]),
hdrs = glob([
"runtime/opencl/*.h",
]),
hdrs = [
"runtime/opencl/opencl_allocator.h",
"runtime/opencl/opencl_runtime.h",
"runtime/opencl/opencl_wrapper.h",
],
copts = ["-std=c++11"],
deps = [
"core",
......@@ -31,17 +27,6 @@ cc_library(
alwayslink = 1,
)
cc_binary(
name = "opencl_smoketest",
srcs = glob([
"runtime/opencl/opencl_smoketest.cc",
]),
copts = ["-std=c++11"],
deps = [
"opencl_runtime",
],
)
cc_library(
name = "core",
srcs = glob([
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/logging.h"
#include "mace/core/operator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
// Minimal end-to-end smoke test for the MACE OpenCL runtime: fills two
// integer vectors in device-resident tensors, launches the "simple_add"
// kernel (c[i] = a[i] + b[i], each work-item handling step[0] consecutive
// elements), then maps the output back and verifies it on the host.
// NOTE(review): always returns 0 — the pass/fail verdict is only logged,
// so CI cannot detect a failure from the exit code.
int main() {
using namespace mace;
auto runtime = mace::OpenCLRuntime::Get();
// Four device tensors: inputs ta/tb, output tc, and a one-element tensor
// tstep carrying the per-work-item chunk size to the kernel.
mace::Tensor ta(GetDeviceAllocator(DeviceType::OPENCL), DataType::DT_INT32);
mace::Tensor tb(GetDeviceAllocator(DeviceType::OPENCL), DataType::DT_INT32);
mace::Tensor tc(GetDeviceAllocator(DeviceType::OPENCL), DataType::DT_INT32);
mace::Tensor tstep(GetDeviceAllocator(DeviceType::OPENCL),
DataType::DT_INT32);
int n = 1000;
std::vector<index_t> shape = {n};
ta.Resize(shape);
tb.Resize(shape);
tc.Resize(shape);
tstep.Resize({1});
// Each work-item adds step_size elements, so the global work size is
// n / step_size (assumes n is divisible by step_size — TODO confirm).
int step_size = 10;
int global_size = n / step_size;
{
// MappingGuard appears to map the device buffer for host access for the
// lifetime of this scope (RAII map/unmap) — NOTE(review): confirm against
// the Tensor implementation. The scope must end before the kernel runs.
mace::Tensor::MappingGuard ta_mapper(&ta);
mace::Tensor::MappingGuard tb_mapper(&tb);
mace::Tensor::MappingGuard tstep_mapper(&tstep);
int32_t *a = ta.mutable_data<int32_t>();
int32_t *b = tb.mutable_data<int32_t>();
int32_t *step = tstep.mutable_data<int32_t>();
// Deterministic inputs: a[i] = i, b[i] = 2i, so the expected output is 3i.
for (int i = 0; i < n; i++) {
a[i] = i;
b[i] = 2 * i;
}
step[0] = step_size;
}
// Bind the "simple_add" kernel from the runtime's prebuilt program and
// launch it with global_size work-items; the local size is left to the
// driver (NullRange).
auto program = runtime->program();
auto simple_add =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(
program, "simple_add");
cl_int error;
simple_add(cl::EnqueueArgs(runtime->command_queue(), cl::NDRange(global_size),
cl::NullRange),
*(static_cast<cl::Buffer *>(ta.buffer())),
*(static_cast<cl::Buffer *>(tb.buffer())),
*(static_cast<cl::Buffer *>(tc.buffer())),
*(static_cast<cl::Buffer *>(tstep.buffer())), error);
// error is the enqueue status written by the functor; 0 is CL_SUCCESS.
if (error != 0) {
LOG(ERROR) << "Failed to execute kernel " << error;
}
{
// Map all three tensors back to the host and check c[i] == a[i] + b[i].
mace::Tensor::MappingGuard ta_mapper(&ta);
mace::Tensor::MappingGuard tb_mapper(&tb);
mace::Tensor::MappingGuard tc_mapper(&tc);
int32_t *a = ta.mutable_data<int32_t>();
int32_t *b = tb.mutable_data<int32_t>();
int32_t *c = tc.mutable_data<int32_t>();
bool correct = true;
for (int i = 0; i < n; i++) {
if (c[i] != a[i] + b[i]) correct = false;
}
LOG(INFO) << "OpenCL test result: " << (correct ? "correct" : "incorrect");
}
return 0;
}
......@@ -28,20 +28,11 @@ void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
float weights = filter[out_chan * in_chan_num + in_chan];
float *output_ptr = output_base + out_chan * pixel_num;
/* TODO fix vload/vstore */
/*
for (int p = 0; p < 2; ++p) {
float4 in = vload4(p * 4, input_ptr);
float4 out = vload4(p * 4, output_ptr);
float4 in = vload4(p, input_ptr);
float4 out = vload4(p, output_ptr);
out += in * weights;
vstore4(out, p * 4, output_ptr);
}
*/
for (int p = 0; p < 8; ++p) {
float in = input_ptr[p];
float out = output_ptr[p];
out += in * weights;
output_ptr[p] = out;
vstore4(out, p, output_ptr);
}
}
} else {
......
// Element-wise integer addition: c[i] = a[i] + b[i].
// Each work-item owns a contiguous chunk of step[0] elements starting at
// get_global_id(0) * step[0], so the global work size times step[0] must
// cover the buffer length.
void kernel simple_add(global const int *a,
global const int *b,
global int *c,
global const int *step) {
const int chunk = step[0];
const int base = chunk * get_global_id(0);
for (int offset = 0; offset < chunk; ++offset) {
const int idx = base + offset;
c[idx] = a[idx] + b[idx];
}
}
......@@ -81,7 +81,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
cl_int error;
conv_2d(cl::EnqueueArgs(runtime->command_queue(),
cl::NDRange(chan_blk_num, pixel_blk_num),
cl::NullRange),
cl::NDRange(1, 64)),
*(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册