Commit 8dc58b60 authored by 李寅

Merge branch 'opencl' into 'master'

Add conv2d opencl kernel placeholder

See merge request !72
@@ -31,7 +31,7 @@ void OpenCLAllocator::Delete(void *buffer) {
 void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
   auto queue = OpenCLRuntime::Get()->command_queue();
-  // TODO (heliangliang) Non-blocking call
+  // TODO(heliangliang) Non-blocking call
   cl_int error;
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
...
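The map above is blocking (CL_TRUE), which is the simple, safe default. A sketch of the non-blocking variant the TODO refers to, assuming the standard OpenCL C++ bindings; the event handling is illustrative and not part of this change:

    cl::Event map_event;
    cl_int error;
    void *mapped_ptr = queue.enqueueMapBuffer(
        *cl_buffer, CL_FALSE /* non-blocking */, CL_MAP_READ | CL_MAP_WRITE,
        0, nbytes, nullptr, &map_event, &error);
    // The mapping is only valid once the event completes; wait here, or make
    // later enqueued commands depend on map_event instead.
    map_event.wait();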
@@ -4,8 +4,12 @@
 #include <cstdlib>
 #include <fstream>
+#include <memory>
 #include <mutex>
+#include <dirent.h>
+#include <errno.h>
 #include "mace/core/logging.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -13,8 +17,7 @@
 namespace mace {
 namespace {

-bool ReadSourceFile(const char *filename, std::string *content) {
-  MACE_CHECK_NOTNULL(filename);
+bool ReadSourceFile(const std::string &filename, std::string *content) {
   MACE_CHECK_NOTNULL(content);
   *content = "";
   std::ifstream ifs(filename, std::ifstream::in);
@@ -31,26 +34,50 @@ bool ReadSourceFile(const char *filename, std::string *content) {
 }

 bool BuildProgram(OpenCLRuntime *runtime,
-                  const char *filename,
+                  const std::string &path,
                   cl::Program *program) {
-  MACE_CHECK_NOTNULL(filename);
   MACE_CHECK_NOTNULL(program);

-  std::string kernel_code;
-  if (!ReadSourceFile(filename, &kernel_code)) {
-    LOG(ERROR) << "Failed to read kernel source " << filename;
-    return false;
-  }
+  auto closer = [](DIR *d) {
+    if (d != nullptr) closedir(d);
+  };
+  std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
+  MACE_CHECK_NOTNULL(dir.get());
+
+  const std::string kSourceSuffix = ".cl";
   cl::Program::Sources sources;
-  sources.push_back({kernel_code.c_str(), kernel_code.length()});
+  errno = 0;
+  dirent *entry = readdir(dir.get());
+  MACE_CHECK(errno == 0);
+  while (entry != nullptr) {
+    if (entry->d_type == DT_REG) {
+      std::string d_name(entry->d_name);
+      if (d_name.size() > kSourceSuffix.size() &&
+          d_name.compare(d_name.size() - kSourceSuffix.size(),
+                         kSourceSuffix.size(), kSourceSuffix) == 0) {
+        std::string filename = path + d_name;
+        std::string kernel_source;
+        MACE_CHECK(ReadSourceFile(filename, &kernel_source));
+        sources.push_back({kernel_source.c_str(), kernel_source.length()});
+      }
+    }
+    entry = readdir(dir.get());
+    MACE_CHECK(errno == 0);
+  }

   *program = cl::Program(runtime->context(), sources);
-  if (program->build({runtime->device()}) != CL_SUCCESS) {
-    LOG(INFO) << "Error building: "
-              << program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
-    return false;
+  std::string build_options = "-Werror -cl-mad-enable -I" + path;
+  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
+        CL_BUILD_ERROR) {
+      std::string build_log =
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
+      LOG(INFO) << "Program build log: " << build_log;
+    }
+    LOG(FATAL) << "Build program failed";
   }
   return true;
 }
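One subtlety worth noting: `cl::Program::Sources` stores raw `(const char *, size_t)` pairs, while `kernel_source` is destroyed at the end of each loop iteration, so the pairs can dangle before `cl::Program` copies the text. A lifetime-safe sketch (the `source_storage` name is hypothetical) collects the strings first and builds the pairs only once the storage is stable:

    std::vector<std::string> source_storage;  // must outlive `sources`
    // ... read the contents of every *.cl file into source_storage ...
    cl::Program::Sources sources;
    for (const std::string &src : source_storage) {
      sources.push_back({src.c_str(), src.length()});  // pointers stay valid
    }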
@@ -123,24 +150,16 @@ cl::Device &OpenCLRuntime::device() { return device_; }

 cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }

-cl::Program OpenCLRuntime::GetProgram(const std::string &name) {
-  static const char *kernel_source_path = getenv("MACE_KERNEL_SOURCE_PATH");
-  std::string filename = name;
-  if (kernel_source_path != nullptr) {
-    filename = kernel_source_path + name;
-  }
+cl::Program &OpenCLRuntime::program() {
+  // TODO(heliangliang) Support binary format
+  static const char *kernel_path = getenv("MACE_KERNEL_PATH");
+  std::string path(kernel_path == nullptr ? "" : kernel_path);

-  std::lock_guard<std::mutex> lock(program_lock_);
-  // TODO (heliangliang) Support binary format
-  auto iter = programs_.find(name);
-  if (iter != programs_.end()) {
-    return iter->second;
-  } else {
-    cl::Program program;
-    MACE_CHECK(BuildProgram(this, filename.c_str(), &program));
-    programs_.emplace(name, program);
-    return program;
-  }
+  std::call_once(build_flag_, [this, &path]() {
+    MACE_CHECK(BuildProgram(this, path, &program_));
+  });
+
+  return program_;
 }

 } // namespace mace
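The `std::call_once` above replaces the map-plus-mutex program cache: all kernel sources are now compiled into one program, built lazily and exactly once even under concurrent callers. A minimal standalone illustration of the pattern, with a dummy payload:

    #include <mutex>
    #include <string>

    std::once_flag flag;
    std::string cached;

    const std::string &GetCached() {
      // The lambda runs at most once; concurrent callers block until it
      // finishes, and every later call just returns the cached value.
      std::call_once(flag, []() { cached = "built"; });
      return cached;
    }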
@@ -28,14 +28,14 @@ class OpenCLRuntime {
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
-  cl::Program GetProgram(const std::string &name);
+  cl::Program &program();

  private:
   cl::Context context_;
   cl::CommandQueue command_queue_;
   cl::Device device_;
-  std::map<std::string, cl::Program> programs_;
-  std::mutex program_lock_;
+  cl::Program program_;
+  std::once_flag build_flag_;
 };

 } // namespace mace
...
@@ -40,7 +40,7 @@ int main() {
     step[0] = step_size;
   }

-  auto program = runtime->GetProgram("simple_add.cl");
+  auto program = runtime->program();
   auto simple_add =
       cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(
...
@@ -195,8 +195,8 @@ OpenCLLibraryImpl &OpenCLLibraryImpl::Get() {
 bool OpenCLLibraryImpl::Load() {
   if (loaded()) return true;

-  // TODO (heliangliang) Make this configurable
-  // TODO (heliangliang) Benchmark 64 bit overhead
+  // TODO(heliangliang) Make this configurable
+  // TODO(heliangliang) Benchmark 64 bit overhead
   static const std::vector<std::string> paths = {
 #if defined(__aarch64__)
     // Qualcomm Adreno
...
@@ -88,7 +88,7 @@ class Tensor {
    * Map the device buffer as CPU buffer to access the data, unmap must be
    * called later
    */
-  inline void Map() {
+  inline void Map() const {
     if (!OnHost()) {
       MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
       data_ = alloc_->Map(buffer_, size_ * SizeOfType());
@@ -98,7 +98,7 @@ class Tensor {
   /*
    * Unmap the device buffer
    */
-  inline void Unmap() {
+  inline void Unmap() const {
     if (!OnHost()) {
       MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
       alloc_->Unmap(buffer_, data_);
@@ -187,7 +187,7 @@ class Tensor {
     LOG(INFO) << os.str();
   }

-  inline size_t SizeOfType() {
+  inline size_t SizeOfType() const {
     size_t type_size = 0;
     CASES(dtype_, type_size = sizeof(T));
     return type_size;
@@ -203,14 +203,15 @@ class Tensor {
   class MappingGuard {
    public:
-    MappingGuard(Tensor *tensor) : tensor_(tensor) {
-      MACE_ASSERT(tensor_ != nullptr);
-      tensor_->Map();
+    MappingGuard(const Tensor *tensor) : tensor_(tensor) {
+      if (tensor_ != nullptr) tensor_->Map();
+    }
+    ~MappingGuard() {
+      if (tensor_ != nullptr) tensor_->Unmap();
     }
-    ~MappingGuard() { tensor_->Unmap(); }

    private:
-    Tensor *tensor_;
+    const Tensor *tensor_;
   };

  private:
@@ -233,7 +234,7 @@ class Tensor {
   // read or write
   void *buffer_;
   // Mapped buffer
-  void *data_;
+  mutable void *data_;
   vector<index_t> shape_;

   DISABLE_COPY_AND_ASSIGN(Tensor);
...
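Making `Map()`/`Unmap()` const with a `mutable` mapped pointer lets kernels map const input tensors, and the null-tolerant guard matches optional inputs such as bias. A usage sketch (function and variable names hypothetical):

    void ReadBack(const Tensor *tensor) {
      Tensor::MappingGuard guard(tensor);         // maps the device buffer
      const float *data = tensor->data<float>();  // valid while guard lives
      // ... consume data ...
    }                                             // ~MappingGuard unmaps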
@@ -84,7 +84,7 @@ int main(int argc, char **argv) {
   Workspace ws;
   ws.LoadModelTensor(net_def, DeviceType::CPU);

   Tensor *input_tensor =
-      ws.CreateTensor(input_node + ":0", cpu_allocator(), DT_FLOAT);
+      ws.CreateTensor(input_node + ":0", GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
   input_tensor->Resize(shape);
   float *input_data = input_tensor->mutable_data<float>();
...
@@ -11,8 +11,8 @@ load("//mace:mace.bzl", "if_android")
 cc_library(
     name = "kernels",
-    srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc"])),
-    hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h"])),
+    srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc", "opencl/*.cc"])),
+    hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h", "opencl/*.h"])),
     copts = [
         "-std=c++11",
         "-fopenmp",
@@ -20,6 +20,7 @@ cc_library(
     linkopts = if_android(["-lm"]),
     deps = [
         "//mace/core",
+        "//mace/core:opencl_runtime",
         "//mace/utils",
     ],
 )
...
@@ -19,27 +19,26 @@ struct Conv2dFunctor {
                 const int *dilations)
       : strides_(strides), paddings_(paddings), dilations_(dilations) {}

-  void operator()(const T *input,  // NCHW
-                  const index_t *input_shape,
-                  const T *filter,  // c_out, c_in, kernel_h, kernel_w
-                  const index_t *filter_shape,
-                  const T *bias,  // c_out
-                  T *output,  // NCHW
-                  const index_t *output_shape) {
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output) {
+    MACE_CHECK_NOTNULL(input);
+    MACE_CHECK_NOTNULL(filter);
     MACE_CHECK_NOTNULL(output);

-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
+    index_t batch = output->shape()[0];
+    index_t channels = output->shape()[1];
+    index_t height = output->shape()[2];
+    index_t width = output->shape()[3];

-    index_t input_batch = input_shape[0];
-    index_t input_channels = input_shape[1];
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
+    index_t input_batch = input->shape()[0];
+    index_t input_channels = input->shape()[1];
+    index_t input_height = input->shape()[2];
+    index_t input_width = input->shape()[3];

-    index_t kernel_h = filter_shape[2];
-    index_t kernel_w = filter_shape[3];
+    index_t kernel_h = filter->shape()[2];
+    index_t kernel_w = filter->shape()[3];

     int stride_h = strides_[0];
     int stride_w = strides_[1];
@@ -57,17 +56,26 @@ struct Conv2dFunctor {
     index_t kernel_size = input_channels * kernel_h * kernel_w;

+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard filter_mapper(filter);
+    Tensor::MappingGuard bias_mapper(bias);
+    Tensor::MappingGuard output_mapper(output);
+
+    auto input_data = input->data<T>();
+    auto filter_data = filter->data<T>();
+    auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
+    auto output_data = output->mutable_data<T>();
+
 #pragma omp parallel for collapse(2)
     for (int n = 0; n < batch; ++n) {
       for (int c = 0; c < channels; ++c) {
-        T bias_channel = bias ? bias[c] : 0;
+        T bias_channel = bias_data ? bias_data[c] : 0;
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             index_t offset = n * channels * height * width +
                              c * height * width + h * width + w;
-            output[offset] = bias_channel;
+            output_data[offset] = bias_channel;
             T sum = 0;
-            const T *filter_ptr = filter + c * kernel_size;
+            const T *filter_ptr = filter_data + c * kernel_size;
             for (int inc = 0; inc < input_channels; ++inc) {
               for (int kh = 0; kh < kernel_h; ++kh) {
                 for (int kw = 0; kw < kernel_w; ++kw) {
@@ -86,13 +94,13 @@ struct Conv2dFunctor {
                       n * input_channels * input_height * input_width +
                       inc * input_height * input_width + inh * input_width +
                       inw;
-                  sum += input[input_offset] * *filter_ptr;
+                  sum += input_data[input_offset] * *filter_ptr;
                   }
                   ++filter_ptr;
                 }
               }
             }
-            output[offset] += sum;
+            output_data[offset] += sum;
           }
         }
       }
@@ -105,16 +113,12 @@ struct Conv2dFunctor {
 };

 template <>
-void Conv2dFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    const float *filter,
-    const index_t *filter_shape,
-    const float *bias,
-    float *output,
-    const index_t *output_shape);
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
+                                                        const Tensor *filter,
+                                                        const Tensor *bias,
+                                                        Tensor *output);

 } // namespace kernels
 } // namespace mace

 #endif // MACE_KERNELS_CONV_2D_H_
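For reference, the functor's NCHW offset arithmetic, extracted into a tiny self-contained check (helper name hypothetical):

    #include <cassert>
    #include <cstdint>

    using index_t = int64_t;

    // NCHW offset as computed in Conv2dFunctor:
    // n * C * H * W + c * H * W + h * W + w == ((n * C + c) * H + h) * W + w
    inline index_t Offset(index_t n, index_t c, index_t h, index_t w,
                          index_t C, index_t H, index_t W) {
      return ((n * C + c) * H + h) * W + w;
    }

    int main() {
      // Element (n=1, c=2, h=3, w=4) of a 2x8x16x16 tensor.
      assert(Offset(1, 2, 3, 4, 8, 16, 16) ==
             1 * 8 * 16 * 16 + 2 * 16 * 16 + 3 * 16 + 4);
      return 0;
    }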
@@ -41,14 +41,14 @@ extern void Conv2dNeonK5x5S1(const float *input,
                              const index_t *output_shape);

 template <>
-void Conv2dFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    const float *filter,
-    const index_t *filter_shape,
-    const float *bias,
-    float *output,
-    const index_t *output_shape) {
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
+                                                        const Tensor *filter,
+                                                        const Tensor *bias,
+                                                        Tensor *output) {
+  MACE_CHECK_NOTNULL(input);
+  MACE_CHECK_NOTNULL(filter);
+  MACE_CHECK_NOTNULL(output);
   typedef void (*Conv2dNeonFunction)(
       const float *input, const index_t *input_shape, const float *filter,
       const index_t *filter_shape, const float *bias, float *output,
@@ -61,8 +61,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
       {nullptr, nullptr},
       {Conv2dNeonK5x5S1, nullptr}};
   // not implemented yet
-  index_t kernel_h = filter_shape[2];
-  index_t kernel_w = filter_shape[3];
+  index_t kernel_h = filter->shape()[2];
+  index_t kernel_w = filter->shape()[3];
   if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
       strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
       selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
@@ -71,21 +71,32 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
                  << " stride " << strides_[0] << "x" << strides_[1]
                  << " is not implemented yet, using slow version";
     Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
-        input, input_shape, filter, filter_shape, bias, output, output_shape);
+        input, filter, bias, output);
     return;
   }

+  Tensor::MappingGuard input_mapper(input);
+  Tensor::MappingGuard filter_mapper(filter);
+  Tensor::MappingGuard bias_mapper(bias);
+  Tensor::MappingGuard output_mapper(output);
+
+  auto input_data = input->data<float>();
+  auto input_shape = input->shape().data();
+  auto filter_data = filter->data<float>();
+  auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
+  auto output_data = output->mutable_data<float>();
+  auto output_shape = output->shape().data();
+
   // Keep this alive during kernel execution
   Tensor padded_input;
   if (paddings_[0] > 0 || paddings_[1] > 0) {
-    ConstructInputWithPadding(input, input_shape, paddings_.data(),
-                              &padded_input);
-    input = padded_input.data<float>();
+    ConstructInputWithPadding(input_data, input->shape().data(),
+                              paddings_.data(), &padded_input);
+    input_data = padded_input.data<float>();
     input_shape = padded_input.shape().data();
   }
   auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_neon_func(input, input_shape, filter, nullptr, bias, output,
-                   output_shape);
+  conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
+                   bias_data, output_data, output_shape);
 }

 } // namespace kernels
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"

namespace mace {
namespace kernels {

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, Tensor *output);

template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
                                                          const Tensor *filter,
                                                          const Tensor *bias,
                                                          Tensor *output) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, Tensor *output);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
      {Conv2dOpenclK1x1S1, nullptr},
      {nullptr, nullptr},
      {nullptr, nullptr},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  index_t kernel_h = filter->shape()[2];
  index_t kernel_w = filter->shape()[3];
  if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
      selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter " << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
    Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
        input, filter, bias, output);
    return;
  }

  MACE_CHECK(paddings_[0] == 1 && paddings_[1] == 1, "Padding not supported");
  auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
  conv2d_func(input, filter, bias, output);
}

} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/common.h"
#include "mace/kernels/conv_2d.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

static constexpr index_t kInputChannelBlockSize = 2;
static constexpr index_t kOutputChannelBlockSize = 4;

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, Tensor *output) {
  const index_t batch = output->shape()[0];
  const index_t channels = output->shape()[1];
  const index_t height = output->shape()[2];
  const index_t width = output->shape()[3];

  const index_t input_batch = input->shape()[0];
  const index_t input_channels = input->shape()[1];
  const index_t input_height = input->shape()[2];
  const index_t input_width = input->shape()[3];

  MACE_CHECK(input_batch == batch && input_height == height &&
             input_width == width);

  const index_t total_pixels = height * width;
  const index_t round_up_channels = RoundUp(channels, kOutputChannelBlockSize);
}

} // namespace kernels
} // namespace mace
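For context on the block-size constants: a 1x1, stride-1 convolution over NCHW reduces to a per-batch matrix multiply of the [C_out x C_in] filter with the [C_in x H*W] input, which is why the placeholder precomputes `total_pixels` and rounds channels up to a block multiple. A naive CPU reference of that reduction (a sketch for orientation, not the planned OpenCL kernel):

    // output[n][co][p] = bias[co] + sum_ci filter[co][ci] * input[n][ci][p],
    // where p indexes the H*W pixels of an NCHW tensor.
    void Conv1x1Reference(const float *input, const float *filter,
                          const float *bias, float *output,
                          int batch, int in_ch, int out_ch, int pixels) {
      for (int n = 0; n < batch; ++n) {
        for (int co = 0; co < out_ch; ++co) {
          for (int p = 0; p < pixels; ++p) {
            float sum = bias ? bias[co] : 0.f;
            for (int ci = 0; ci < in_ch; ++ci) {
              sum += filter[co * in_ch + ci] *
                     input[(n * in_ch + ci) * pixels + p];
            }
            output[(n * out_ch + co) * pixels + p] = sum;
          }
        }
      }
    }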
@@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON

+REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>);
+
 } // namespace mace
@@ -25,12 +25,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
   bool Run() override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *filter = this->Input(FILTER);
-    const T *bias_data = nullptr;
-    if (this->InputSize() >= 3) {
-      const Tensor *bias = this->Input(BIAS);
-      bias_data = bias->data<T>();
-    }
+    const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
     Tensor *output = this->Output(OUTPUT);

     std::vector<index_t> output_shape(4);
@@ -42,9 +37,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
     output->Resize(output_shape);
     functor_.paddings_ = paddings;

-    functor_(input->data<T>(), input->shape().data(), filter->data<T>(),
-             filter->shape().data(), bias_data, output->mutable_data<T>(),
-             output->shape().data());
+    functor_(input, filter, bias, output);
     return true;
   }
...
@@ -67,9 +67,10 @@ static void Conv2d(int iters,
   BENCHMARK(                                                                 \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);  \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON); \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
...
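With the extra OPENCL row, each `BM_CONV_2D(...)` invocation now registers three benchmarks instead of two; for example, the first line above expands to `BM_CONV_2D_1_64_32_32_K1x1S1_VALID_128_float_CPU`, `..._NEON`, and `..._OPENCL`.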
@@ -48,7 +48,7 @@ class OpsTestNet {
                   const std::vector<index_t> &shape,
                   const std::vector<T> &data) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
@@ -60,7 +60,7 @@ class OpsTestNet {
                   const std::vector<index_t> &shape,
                   const T data) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     std::fill(input_data, input_data + input->size(), data);
@@ -71,7 +71,7 @@ class OpsTestNet {
                   const std::vector<index_t> &shape,
                   bool positive = false) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     float *input_data = input->mutable_data<T>();
@@ -206,7 +206,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
 template <typename T>
 unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
                                 const std::vector<T> &data) {
-  unique_ptr<Tensor> res(new Tensor(cpu_allocator(), DataTypeToEnum<T>::v()));
+  unique_ptr<Tensor> res(new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
   res->Resize(shape);
   T *input_data = res->mutable_data<T>();
   memcpy(input_data, data.data(), data.size() * sizeof(T));
...
@@ -269,7 +269,7 @@ int Main(int argc, char **argv) {
   // Load inputs
   for (size_t i = 0; i < inputs_count; ++i) {
     Tensor *input_tensor =
-        ws.CreateTensor(input_layers[i], cpu_allocator(), DT_FLOAT);
+        ws.CreateTensor(input_layers[i], GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
     vector<index_t> shapes;
     str_util::SplitAndParseToInts(input_layer_shapes[i], ',', &shapes);
     input_tensor->Resize(shapes);
...