Commit 3ba07e0b authored by liuqi

Change DeviceType::OPENCL to DeviceType::GPU and fix some bugs.

Parent commit d98386d6
...@@ -24,16 +24,16 @@ void Register_Custom_Op(OperatorRegistry *op_registry) { ...@@ -24,16 +24,16 @@ void Register_Custom_Op(OperatorRegistry *op_registry) {
Custom_Op<DeviceType::CPU, float>); Custom_Op<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name") REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
Custom_Op<DeviceType::OPENCL, float>); Custom_Op<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name") REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
Custom_Op<DeviceType::OPENCL, half>); Custom_Op<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -316,7 +316,7 @@ unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data ...@@ -316,7 +316,7 @@ unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data); NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
//3. Declare the device type //3. Declare the device type
DeviceType device_type = DeviceType::OPENCL; DeviceType device_type = DeviceType::GPU;
//4. Define the input and output name arrays //4. Define the input and output name arrays
std::vector<std::string> input_names = {...}; std::vector<std::string> input_names = {...};
...@@ -350,8 +350,8 @@ for (size_t i = 0; i < output_count; ++i) { ...@@ -350,8 +350,8 @@ for (size_t i = 0; i < output_count; ++i) {
//6. Create the MaceEngine object //6. Create the MaceEngine object
mace::MaceEngine engine(&net_def, device_type, input_names, output_names); mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
//7. If the device type is OPENCL or HEXAGON, model_data can be released here //7. If the device type is GPU or HEXAGON, model_data can be released here
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data); mace::MACE_MODEL_TAG::UnloadModelData(model_data);
} }
......
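The hunk above updates the engine-creation walkthrough in the documentation. For convenience, here is a consolidated sketch of that flow (steps 2 through 7) with the renamed enum. It is illustrative only: model_data_file and the input/output node names are hypothetical placeholders, step 5 (preparing the input/output buffers) stays elided as in the hunk, MACE_MODEL_TAG stands for the generated model namespace, and the snippet assumes the mace namespace is visible as in the original documentation example.

//2. Load the model data and create the NetDef (model_data_file is a placeholder path).
unsigned char *model_data =
    mace::MACE_MODEL_TAG::LoadModelData(model_data_file.c_str());
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
//3. Declare the device type (GPU replaces the old OPENCL value).
DeviceType device_type = DeviceType::GPU;
//4. Define the input and output name arrays (placeholder names).
std::vector<std::string> input_names = {"input_node0"};
std::vector<std::string> output_names = {"output_node0"};
//6. Create the MaceEngine object.
mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
//7. If the device type is GPU or HEXAGON, model_data can be released here.
if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
  mace::MACE_MODEL_TAG::UnloadModelData(model_data);
}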
...@@ -108,10 +108,8 @@ inline int64_t NowMicros() { ...@@ -108,10 +108,8 @@ inline int64_t NowMicros() {
DeviceType ParseDeviceType(const std::string &device_str) { DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) { if (device_str.compare("CPU") == 0) {
return DeviceType::CPU; return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) { } else if (device_str.compare("GPU") == 0) {
return DeviceType::NEON; return DeviceType::GPU;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else { } else {
...@@ -198,7 +196,7 @@ bool Run(MaceEngine *engine, ...@@ -198,7 +196,7 @@ bool Run(MaceEngine *engine,
return true; return true;
} }
DEFINE_string(device, "CPU", "Device [CPU|NEON|OPENCL]"); DEFINE_string(device, "CPU", "Device [CPU|GPU|DSP]");
DEFINE_string(input_node, "input_node0,input_node1", DEFINE_string(input_node, "input_node0,input_node1",
"input nodes, separated by comma"); "input nodes, separated by comma");
DEFINE_string(output_node, "output_node0,output_node1", DEFINE_string(output_node, "output_node0,output_node1",
...@@ -279,7 +277,7 @@ int Main(int argc, char **argv) { ...@@ -279,7 +277,7 @@ int Main(int argc, char **argv) {
mace::SetOpenMPThreadPolicy( mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
...@@ -347,7 +345,7 @@ int Main(int argc, char **argv) { ...@@ -347,7 +345,7 @@ int Main(int argc, char **argv) {
LOG(INFO) << "Run init"; LOG(INFO) << "Run init";
std::unique_ptr<mace::MaceEngine> engine_ptr( std::unique_ptr<mace::MaceEngine> engine_ptr(
new mace::MaceEngine(&net_def, device_type, input_names, output_names)); new mace::MaceEngine(&net_def, device_type, input_names, output_names));
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data); mace::MACE_MODEL_TAG::UnloadModelData(model_data);
} }
......
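The ParseDeviceType change in the hunks above, repeated in the other command-line tools further down, removes the NEON and OPENCL branches in favour of a single GPU branch. Reassembled, the parser after this commit reads roughly as follows; the closing else branch is cut off in the diff, so the CPU fallback shown here is an assumption rather than the commit's verbatim code.

DeviceType ParseDeviceType(const std::string &device_str) {
  if (device_str.compare("CPU") == 0) {
    return DeviceType::CPU;
  } else if (device_str.compare("GPU") == 0) {
    return DeviceType::GPU;
  } else if (device_str.compare("HEXAGON") == 0) {
    return DeviceType::HEXAGON;
  } else {
    // The diff cuts off here; assume unrecognized strings fall back to CPU.
    return DeviceType::CPU;
  }
}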
...@@ -141,10 +141,8 @@ std::string FormatName(const std::string input) { ...@@ -141,10 +141,8 @@ std::string FormatName(const std::string input) {
DeviceType ParseDeviceType(const std::string &device_str) { DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) { if (device_str.compare("CPU") == 0) {
return DeviceType::CPU; return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) { } else if (device_str.compare("GPU") == 0) {
return DeviceType::NEON; return DeviceType::GPU;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else { } else {
...@@ -277,7 +275,7 @@ int Main(int argc, char **argv) { ...@@ -277,7 +275,7 @@ int Main(int argc, char **argv) {
FLAGS_gpu_model_data_file.c_str()); FLAGS_gpu_model_data_file.c_str());
NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data); NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names, mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::GPU, input_names,
output_names); output_names);
mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data); mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
......
...@@ -34,9 +34,8 @@ Allocator *GetDeviceAllocator(DeviceType type) { ...@@ -34,9 +34,8 @@ Allocator *GetDeviceAllocator(DeviceType type) {
} }
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator()); MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator()); MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
#endif #endif
MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator()); MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());
......
...@@ -189,7 +189,7 @@ class Image : public BufferBase { ...@@ -189,7 +189,7 @@ class Image : public BufferBase {
public: public:
Image() Image()
: BufferBase(0), : BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)), allocator_(GetDeviceAllocator(GPU)),
buf_(nullptr), buf_(nullptr),
mapped_buf_(nullptr) {} mapped_buf_(nullptr) {}
...@@ -198,7 +198,7 @@ class Image : public BufferBase { ...@@ -198,7 +198,7 @@ class Image : public BufferBase {
std::accumulate( std::accumulate(
shape.begin(), shape.end(), 1, std::multiplies<index_t>()) * shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
GetEnumTypeSize(data_type)), GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)), allocator_(GetDeviceAllocator(GPU)),
mapped_buf_(nullptr) { mapped_buf_(nullptr) {
shape_ = shape; shape_ = shape;
data_type_ = data_type; data_type_ = data_type;
......
...@@ -193,7 +193,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -193,7 +193,7 @@ MaceStatus MaceEngine::Impl::Run(
input_tensors.push_back(input_tensor); input_tensors.push_back(input_tensor);
} }
for (auto &output : *outputs) { for (auto &output : *outputs) {
if (device_type_ == DeviceType::OPENCL) { if (device_type_ == DeviceType::GPU) {
MACE_CHECK(output.second.shape().size() == 4, MACE_CHECK(output.second.shape().size() == 4,
"The outputs' shape must be 4-dimension with NHWC format," "The outputs' shape must be 4-dimension with NHWC format,"
" please use 1 to fill missing dimensions"); " please use 1 to fill missing dimensions");
...@@ -217,7 +217,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -217,7 +217,7 @@ MaceStatus MaceEngine::Impl::Run(
#endif #endif
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (device_type_ == OPENCL) { if (device_type_ == GPU) {
OpenCLRuntime::Global()->SaveBuiltCLProgram(); OpenCLRuntime::Global()->SaveBuiltCLProgram();
} }
#endif #endif
......
...@@ -54,7 +54,7 @@ bool SerialNet::Run(RunMetadata *run_metadata) { ...@@ -54,7 +54,7 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
auto &op = *iter; auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(", MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), ")"); op->debug_def().type(), ")");
bool future_wait = (device_type_ == DeviceType::OPENCL && bool future_wait = (device_type_ == DeviceType::GPU &&
(run_metadata != nullptr || (run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1)); std::distance(iter, operators_.end()) == 1));
......
...@@ -88,7 +88,6 @@ extern void Register_Dequantize(OperatorRegistry *op_registry); ...@@ -88,7 +88,6 @@ extern void Register_Dequantize(OperatorRegistry *op_registry);
extern void Register_Eltwise(OperatorRegistry *op_registry); extern void Register_Eltwise(OperatorRegistry *op_registry);
extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry); extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
extern void Register_FullyConnected(OperatorRegistry *op_registry); extern void Register_FullyConnected(OperatorRegistry *op_registry);
extern void Register_FusedConv2D(OperatorRegistry *op_registry);
extern void Register_LocalResponseNorm(OperatorRegistry *op_registry); extern void Register_LocalResponseNorm(OperatorRegistry *op_registry);
extern void Register_MatMul(OperatorRegistry *op_registry); extern void Register_MatMul(OperatorRegistry *op_registry);
extern void Register_Pad(OperatorRegistry *op_registry); extern void Register_Pad(OperatorRegistry *op_registry);
...@@ -96,7 +95,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry); ...@@ -96,7 +95,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry);
extern void Register_Proposal(OperatorRegistry *op_registry); extern void Register_Proposal(OperatorRegistry *op_registry);
extern void Register_PSROIAlign(OperatorRegistry *op_registry); extern void Register_PSROIAlign(OperatorRegistry *op_registry);
extern void Register_Quantize(OperatorRegistry *op_registry); extern void Register_Quantize(OperatorRegistry *op_registry);
extern void Register_ReOrganize(OperatorRegistry *op_registry);
extern void Register_Requantize(OperatorRegistry *op_registry); extern void Register_Requantize(OperatorRegistry *op_registry);
extern void Register_Reshape(OperatorRegistry *op_registry); extern void Register_Reshape(OperatorRegistry *op_registry);
extern void Register_ResizeBilinear(OperatorRegistry *op_registry); extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
...@@ -130,7 +128,6 @@ OperatorRegistry::OperatorRegistry() { ...@@ -130,7 +128,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_Eltwise(this); ops::Register_Eltwise(this);
ops::Register_FoldedBatchNorm(this); ops::Register_FoldedBatchNorm(this);
ops::Register_FullyConnected(this); ops::Register_FullyConnected(this);
ops::Register_FusedConv2D(this);
ops::Register_LocalResponseNorm(this); ops::Register_LocalResponseNorm(this);
ops::Register_MatMul(this); ops::Register_MatMul(this);
ops::Register_Pad(this); ops::Register_Pad(this);
...@@ -139,7 +136,6 @@ OperatorRegistry::OperatorRegistry() { ...@@ -139,7 +136,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_PSROIAlign(this); ops::Register_PSROIAlign(this);
ops::Register_Quantize(this); ops::Register_Quantize(this);
ops::Register_Requantize(this); ops::Register_Requantize(this);
ops::Register_ReOrganize(this);
ops::Register_Reshape(this); ops::Register_Reshape(this);
ops::Register_ResizeBilinear(this); ops::Register_ResizeBilinear(this);
ops::Register_Slice(this); ops::Register_Slice(this);
......
...@@ -82,7 +82,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -82,7 +82,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
VLOG(3) << "Model data size: " << model_data_size; VLOG(3) << "Model data size: " << model_data_size;
if (model_data_size > 0) { if (model_data_size > 0) {
if (type == DeviceType::CPU || type == DeviceType::NEON) { if (type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>( tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), new Buffer(GetDeviceAllocator(type),
model_data_ptr, model_data_ptr,
...@@ -119,7 +119,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -119,7 +119,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
tensor_map_[const_tensor.name()] = std::move(tensor); tensor_map_[const_tensor.name()] = std::move(tensor);
} }
if (type == DeviceType::CPU || type == DeviceType::OPENCL) { if (type == DeviceType::CPU || type == DeviceType::GPU) {
CreateOutputTensorBuffer(net_def, type); CreateOutputTensorBuffer(net_def, type);
} }
} }
...@@ -149,7 +149,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -149,7 +149,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
} }
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block : net_def.mem_arena().mem_block()) { for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
std::unique_ptr<BufferBase> image_buf( std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype)); new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
...@@ -170,7 +170,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -170,7 +170,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::unique_ptr<Tensor> tensor std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype)); (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
tensor->SetSourceOpName(op.name()); tensor->SetSourceOpName(op.name());
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i] << " Mem: " << mem_ids[i]
<< " Image shape: " << " Image shape: "
...@@ -191,7 +191,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -191,7 +191,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
} }
ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
if (device_type == CPU || device_type == NEON) { if (device_type == CPU) {
return host_scratch_buffer_.get(); return host_scratch_buffer_.get();
} else { } else {
return nullptr; return nullptr;
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
* --input_file=input_data \ * --input_file=input_data \
* --output_file=mace.out \ * --output_file=mace.out \
* --model_data_file=model_data.data \ * --model_data_file=model_data.data \
* --device=OPENCL * --device=GPU
*/ */
#include <malloc.h> #include <malloc.h>
#include <stdint.h> #include <stdint.h>
...@@ -102,10 +102,8 @@ std::string FormatName(const std::string input) { ...@@ -102,10 +102,8 @@ std::string FormatName(const std::string input) {
DeviceType ParseDeviceType(const std::string &device_str) { DeviceType ParseDeviceType(const std::string &device_str) {
if (device_str.compare("CPU") == 0) { if (device_str.compare("CPU") == 0) {
return DeviceType::CPU; return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) { } else if (device_str.compare("GPU") == 0) {
return DeviceType::NEON; return DeviceType::GPU;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else { } else {
...@@ -135,7 +133,7 @@ DEFINE_string(output_file, ...@@ -135,7 +133,7 @@ DEFINE_string(output_file,
DEFINE_string(model_data_file, DEFINE_string(model_data_file,
"", "",
"model data file name, used when EMBED_MODEL_DATA set to 0"); "model data file name, used when EMBED_MODEL_DATA set to 0");
DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
DEFINE_int32(round, 1, "round"); DEFINE_int32(round, 1, "round");
DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
...@@ -160,7 +158,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -160,7 +158,7 @@ bool RunModel(const std::vector<std::string> &input_names,
MaceStatus res = mace::SetOpenMPThreadPolicy( MaceStatus res = mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (device_type == DeviceType::OPENCL) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
...@@ -178,7 +176,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -178,7 +176,7 @@ bool RunModel(const std::vector<std::string> &input_names,
// Init model // Init model
mace::MaceEngine engine(&net_def, device_type, input_names, mace::MaceEngine engine(&net_def, device_type, input_names,
output_names); output_names);
if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
mace::MACE_MODEL_TAG::UnloadModelData(model_data); mace::MACE_MODEL_TAG::UnloadModelData(model_data);
} }
......
...@@ -162,7 +162,7 @@ class ActivationFunctor<DeviceType::CPU, float> { ...@@ -162,7 +162,7 @@ class ActivationFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> { class ActivationFunctor<DeviceType::GPU, T> {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {} : activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {}
......
...@@ -93,7 +93,7 @@ struct AddNFunctor { ...@@ -93,7 +93,7 @@ struct AddNFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> { struct AddNFunctor<DeviceType::GPU, T> {
void operator()(const std::vector<const Tensor *> &input_tensors, void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future); StatsFuture *future);
......
...@@ -128,7 +128,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase { ...@@ -128,7 +128,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
......
...@@ -65,7 +65,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> { ...@@ -65,7 +65,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> { struct BiasAddFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *input, void operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
......
...@@ -44,7 +44,7 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { ...@@ -44,7 +44,7 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
}; };
template <typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase { struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
explicit BufferToImageFunctor(bool i2b = false) explicit BufferToImageFunctor(bool i2b = false)
: BufferToImageFunctorBase(i2b) {} : BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
......
...@@ -67,7 +67,7 @@ struct ChannelShuffleFunctor { ...@@ -67,7 +67,7 @@ struct ChannelShuffleFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct ChannelShuffleFunctor<DeviceType::OPENCL, T> { struct ChannelShuffleFunctor<DeviceType::GPU, T> {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
...@@ -93,7 +93,7 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -93,7 +93,7 @@ struct ConcatFunctor : ConcatFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
......
...@@ -615,7 +615,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -615,7 +615,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
......
...@@ -117,7 +117,7 @@ struct DepthToSpaceOpFunctor { ...@@ -117,7 +117,7 @@ struct DepthToSpaceOpFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> { struct DepthToSpaceOpFunctor<DeviceType::GPU, T> {
DepthToSpaceOpFunctor(const int block_size, bool d2s) DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {} : block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
...@@ -297,7 +297,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float> ...@@ -297,7 +297,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T> struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase { : DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(const int *strides,
const Padding padding_type, const Padding padding_type,
......
...@@ -363,7 +363,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase { ...@@ -363,7 +363,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float value) const float value)
......
...@@ -88,7 +88,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { ...@@ -88,7 +88,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(const int /*BufferType*/ weight_type, FullyConnectedFunctor(const int /*BufferType*/ weight_type,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
......
...@@ -75,7 +75,7 @@ struct MatMulFunctor { ...@@ -75,7 +75,7 @@ struct MatMulFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> { struct MatMulFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *A, void operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
......
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -46,7 +46,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -46,7 +46,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -124,7 +124,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -124,7 +124,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct ActivationFunctor<DeviceType::OPENCL, float>; template struct ActivationFunctor<DeviceType::GPU, float>;
template struct ActivationFunctor<DeviceType::OPENCL, half>; template struct ActivationFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()( void AddNFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_tensors, const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
...@@ -58,7 +58,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -58,7 +58,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -119,9 +119,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -119,9 +119,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct AddNFunctor<DeviceType::OPENCL, float>; template struct AddNFunctor<DeviceType::GPU, float>;
template struct AddNFunctor<DeviceType::OPENCL, half>; template struct AddNFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
const Tensor *mean, const Tensor *mean,
...@@ -56,7 +56,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -56,7 +56,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -130,7 +130,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -130,7 +130,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct BatchNormFunctor<DeviceType::OPENCL, float>; template struct BatchNormFunctor<DeviceType::GPU, float>;
template struct BatchNormFunctor<DeviceType::OPENCL, half>; template struct BatchNormFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -49,7 +49,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -49,7 +49,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -115,7 +115,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -115,7 +115,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct BiasAddFunctor<DeviceType::OPENCL, float>; template struct BiasAddFunctor<DeviceType::GPU, float>;
template struct BiasAddFunctor<DeviceType::OPENCL, half>; template struct BiasAddFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -20,7 +20,7 @@ namespace mace { ...@@ -20,7 +20,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
...@@ -95,7 +95,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -95,7 +95,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
if (!kernel_error_) { if (!kernel_error_) {
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -177,8 +177,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -177,8 +177,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct BufferToImageFunctor<DeviceType::OPENCL, float>; template struct BufferToImageFunctor<DeviceType::GPU, float>;
template struct BufferToImageFunctor<DeviceType::OPENCL, half>; template struct BufferToImageFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -56,7 +56,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -56,7 +56,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -108,8 +108,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -108,8 +108,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
} }
template template
struct ChannelShuffleFunctor<DeviceType::OPENCL, float>; struct ChannelShuffleFunctor<DeviceType::GPU, float>;
template template
struct ChannelShuffleFunctor<DeviceType::OPENCL, half>; struct ChannelShuffleFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -50,7 +50,7 @@ static void Concat2(cl::Kernel *kernel, ...@@ -50,7 +50,7 @@ static void Concat2(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -132,7 +132,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -132,7 +132,7 @@ static void ConcatN(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -216,7 +216,7 @@ static void ConcatN(cl::Kernel *kernel, ...@@ -216,7 +216,7 @@ static void ConcatN(cl::Kernel *kernel,
} }
template <typename T> template <typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()( void ConcatFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -264,8 +264,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -264,8 +264,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct ConcatFunctor<DeviceType::OPENCL, float>; template struct ConcatFunctor<DeviceType::GPU, float>;
template struct ConcatFunctor<DeviceType::OPENCL, half>; template struct ConcatFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
template <typename T> template <typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
...@@ -128,8 +128,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -128,8 +128,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct Conv2dFunctor<DeviceType::OPENCL, float>; template struct Conv2dFunctor<DeviceType::GPU, float>;
template struct Conv2dFunctor<DeviceType::OPENCL, half>; template struct Conv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -61,7 +61,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -61,7 +61,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
......
...@@ -58,7 +58,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -58,7 +58,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
......
...@@ -58,7 +58,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -58,7 +58,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
......
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) { const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t input_height = input->dim(1); const index_t input_height = input->dim(1);
...@@ -86,7 +86,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -86,7 +86,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -145,8 +145,8 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -145,8 +145,8 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>; template struct DepthToSpaceOpFunctor<DeviceType::GPU, float>;
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, half>; template struct DepthToSpaceOpFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -66,7 +66,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -66,7 +66,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -163,7 +163,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -163,7 +163,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
} }
template <typename T> template <typename T>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( void DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -215,8 +215,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -215,8 +215,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
&kwg_size_, &kernel_error_); &kwg_size_, &kernel_error_);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>; template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, half>; template struct DepthwiseConv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -74,7 +74,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -74,7 +74,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -129,7 +129,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -129,7 +129,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
} }
} }
template struct EltwiseFunctor<DeviceType::OPENCL, float>; template struct EltwiseFunctor<DeviceType::GPU, float>;
template struct EltwiseFunctor<DeviceType::OPENCL, half>; template struct EltwiseFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -76,7 +76,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -76,7 +76,7 @@ void FCWXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -202,7 +202,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -202,7 +202,7 @@ void FCWTXKernel(cl::Kernel *kernel,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
*kernel_error = std::move(std::unique_ptr<Buffer>( *kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
(*kernel_error)->Map(nullptr); (*kernel_error)->Map(nullptr);
*((*kernel_error)->mutable_data<char>()) = 0; *((*kernel_error)->mutable_data<char>()) = 0;
(*kernel_error)->UnMap(); (*kernel_error)->UnMap();
...@@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel,
} // namespace } // namespace
template <typename T> template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( void FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
...@@ -305,9 +305,9 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -305,9 +305,9 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct FullyConnectedFunctor<DeviceType::OPENCL, float>; template struct FullyConnectedFunctor<DeviceType::GPU, float>;
template struct FullyConnectedFunctor<DeviceType::OPENCL, half>; template struct FullyConnectedFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
StatsFuture *future) { StatsFuture *future) {
...@@ -53,7 +53,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -53,7 +53,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -98,9 +98,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -98,9 +98,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
} }
} }
template struct MatMulFunctor<DeviceType::OPENCL, float>; template struct MatMulFunctor<DeviceType::GPU, float>;
template struct MatMulFunctor<DeviceType::OPENCL, half>; template struct MatMulFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -56,7 +56,7 @@ const bool BufferToImageOpImpl(Tensor *buffer, ...@@ -56,7 +56,7 @@ const bool BufferToImageOpImpl(Tensor *buffer,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error = std::move(std::unique_ptr<Buffer>( kernel_error = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error->Map(nullptr); kernel_error->Map(nullptr);
*(kernel_error->mutable_data<char>()) = 0; *(kernel_error->mutable_data<char>()) = 0;
kernel_error->UnMap(); kernel_error->UnMap();
...@@ -136,13 +136,13 @@ TEST(OutOfRangeCheckTest, RandomTest) { ...@@ -136,13 +136,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std::vector<index_t> buffer_shape = {batch, height, width, channels}; std::vector<index_t> buffer_shape = {batch, height, width, channels};
Workspace ws; Workspace ws;
Tensor *buffer = ws.CreateTensor("Buffer", Tensor *buffer = ws.CreateTensor("Buffer",
GetDeviceAllocator(DeviceType::OPENCL), GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape); buffer->Resize(buffer_shape);
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", Tensor *image = ws.CreateTensor("Image",
GetDeviceAllocator(DeviceType::OPENCL), GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
......
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template<typename T> template<typename T>
void PadFunctor<DeviceType::OPENCL, T>::operator()( void PadFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -59,7 +59,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -59,7 +59,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -115,9 +115,9 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -115,9 +115,9 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
} }
template template
struct PadFunctor<DeviceType::OPENCL, float>; struct PadFunctor<DeviceType::GPU, float>;
template template
struct PadFunctor<DeviceType::OPENCL, half>; struct PadFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
...@@ -50,7 +50,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -50,7 +50,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -148,7 +148,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -148,7 +148,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
} }
template struct PoolingFunctor<DeviceType::OPENCL, float>; template struct PoolingFunctor<DeviceType::GPU, float>;
template struct PoolingFunctor<DeviceType::OPENCL, half>; template struct PoolingFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) { const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t in_height = input->dim(1); const index_t in_height = input->dim(1);
...@@ -50,7 +50,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -50,7 +50,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -113,8 +113,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -113,8 +113,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>; template struct ResizeBilinearFunctor<DeviceType::GPU, float>;
template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>; template struct ResizeBilinearFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -21,7 +21,7 @@ namespace mace { ...@@ -21,7 +21,7 @@ namespace mace {
namespace kernels { namespace kernels {
template<typename T> template<typename T>
void SliceFunctor<DeviceType::OPENCL, T>::operator()( void SliceFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future) { StatsFuture *future) {
...@@ -51,7 +51,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -51,7 +51,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -133,9 +133,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -133,9 +133,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
} }
template template
struct SliceFunctor<DeviceType::OPENCL, float>; struct SliceFunctor<DeviceType::GPU, float>;
template template
struct SliceFunctor<DeviceType::OPENCL, half>; struct SliceFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -23,7 +23,7 @@ namespace mace { ...@@ -23,7 +23,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = logits->dim(0); const index_t batch = logits->dim(0);
...@@ -49,7 +49,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -49,7 +49,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -95,7 +95,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -95,7 +95,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
} }
} }
template struct SoftmaxFunctor<DeviceType::OPENCL, float>; template struct SoftmaxFunctor<DeviceType::GPU, float>;
template struct SoftmaxFunctor<DeviceType::OPENCL, half>; template struct SoftmaxFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -25,7 +25,7 @@ namespace mace { ...@@ -25,7 +25,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *space_tensor,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
Tensor *batch_tensor, Tensor *batch_tensor,
...@@ -60,7 +60,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -60,7 +60,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -120,8 +120,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -120,8 +120,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>; template struct SpaceToBatchFunctor<DeviceType::GPU, float>;
template struct SpaceToBatchFunctor<DeviceType::OPENCL, half>; template struct SpaceToBatchFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -22,7 +22,7 @@ namespace mace { ...@@ -22,7 +22,7 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
...@@ -39,7 +39,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -39,7 +39,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -117,7 +117,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -117,7 +117,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
} }
template <typename T> template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, const Tensor *input_tensor,
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
...@@ -138,7 +138,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -138,7 +138,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (runtime->IsOutOfRangeCheckEnabled()) { if (runtime->IsOutOfRangeCheckEnabled()) {
built_options.emplace("-DOUT_OF_RANGE_CHECK"); built_options.emplace("-DOUT_OF_RANGE_CHECK");
kernel_error_ = std::move(std::unique_ptr<Buffer>( kernel_error_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1))); new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
kernel_error_->Map(nullptr); kernel_error_->Map(nullptr);
*(kernel_error_->mutable_data<char>()) = 0; *(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap(); kernel_error_->UnMap();
...@@ -231,11 +231,11 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -231,11 +231,11 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
} }
} }
template struct WinogradTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradTransformFunctor<DeviceType::GPU, float>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, half>; template struct WinogradTransformFunctor<DeviceType::GPU, half>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>; template struct WinogradInverseTransformFunctor<DeviceType::GPU, float>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>; template struct WinogradInverseTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -61,21 +61,21 @@ struct PadFunctor : public PadFunctorBase { ...@@ -61,21 +61,21 @@ struct PadFunctor : public PadFunctorBase {
std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); std::fill(output_ptr, output_ptr + output->size(), this->constant_value_);
const index_t batch = input->dim(0); const index_t batch = input->dim(0);
const index_t height = input->dim(1); const index_t channel = input->dim(1);
const index_t width = input->dim(2); const index_t height = input->dim(2);
const index_t channel = input->dim(3); const index_t width = input->dim(3);
#pragma omp parallel for collapse(3) #pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) { for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t h = 0; h < height; ++h) { for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) { const index_t in_offset = (((b * channel + c) * height) + h) * width;
const index_t in_offset = (((b * height + h) * width) + w) * channel;
const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1) const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1)
+ (h + this->paddings_[2])) * output->dim(2) + (c + this->paddings_[2])) * output->dim(2)
+ (w + this->paddings_[4])) * output->dim(3) + (h + this->paddings_[4])) * output->dim(3)
+ this->paddings_[6]; + this->paddings_[6];
memcpy(output_ptr + out_offset, memcpy(output_ptr + out_offset,
input_ptr + in_offset, input_ptr + in_offset,
channel * sizeof(T)); width * sizeof(T));
} }
} }
} }
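Note: the rewritten CPU Pad loop above follows the NCHW layout and copies one contiguous row of `width` elements per (batch, channel, height) index instead of one element per innermost iteration. The following is a minimal, self-contained sketch (not MACE code) of the same offset arithmetic; the concrete shapes and the padding layout {batch_begin, batch_end, channel_begin, channel_end, height_begin, height_end, width_begin, width_end} are assumptions made for illustration only.

// pad_nchw_sketch.cc -- standalone illustration of the NCHW pad offsets.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical 1x2x2x3 NCHW input, padded by 1 on each side of H and W.
  const int64_t batch = 1, channel = 2, height = 2, width = 3;
  const std::vector<int> paddings = {0, 0, 0, 0, 1, 1, 1, 1};  // assumed layout
  const int64_t out_c = channel + paddings[2] + paddings[3];
  const int64_t out_h = height + paddings[4] + paddings[5];
  const int64_t out_w = width + paddings[6] + paddings[7];

  std::vector<float> input(batch * channel * height * width);
  for (size_t i = 0; i < input.size(); ++i) input[i] = static_cast<float>(i);
  // Output is pre-filled with the constant pad value (0 here).
  std::vector<float> output(batch * out_c * out_h * out_w, 0.f);

  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t c = 0; c < channel; ++c) {
      for (int64_t h = 0; h < height; ++h) {
        // One contiguous row of `width` elements is copied per (b, c, h).
        const int64_t in_offset = ((b * channel + c) * height + h) * width;
        const int64_t out_offset =
            (((b + paddings[0]) * out_c + (c + paddings[2])) * out_h +
             (h + paddings[4])) * out_w + paddings[6];
        std::memcpy(output.data() + out_offset, input.data() + in_offset,
                    width * sizeof(float));
      }
    }
  }
  std::cout << "padded elements: " << output.size() << std::endl;
  return 0;
}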
...@@ -84,7 +84,7 @@ struct PadFunctor : public PadFunctorBase { ...@@ -84,7 +84,7 @@ struct PadFunctor : public PadFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PadFunctor<DeviceType::OPENCL, T> : PadFunctorBase { struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
PadFunctor(const std::vector<int> &paddings, PadFunctor(const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: PadFunctorBase(paddings, constant_value) {} : PadFunctorBase(paddings, constant_value) {}
......
...@@ -261,7 +261,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { ...@@ -261,7 +261,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_REORGANIZE_H_
#define MACE_KERNELS_REORGANIZE_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct ReOrganizeFunctor {
void operator()(const Tensor *input,
const std::vector<index_t> &out_shape,
Tensor *output,
StatsFuture *future) {
const bool w2c = out_shape[3] > input->dim(3);
const index_t height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_chan = input->dim(3);
const index_t output_width = output->dim(2);
const index_t output_chan = output->dim(3);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
if (w2c) {
MACE_CHECK((out_shape[3] % input->dim(3)) == 0);
const index_t multiplier = out_shape[3] / input->dim(3);
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < out_shape[0]; ++n) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
for (index_t c = 0; c < out_shape[3]; ++c) {
const index_t out_offset =
((n * height + h) * output_width + w)
* output_chan + c;
const index_t in_w_idx = w + (c % multiplier) * output_width;
const index_t in_chan_idx = c / multiplier;
const index_t in_offset =
((n * height + h) * input_width + in_w_idx)
* input_chan + in_chan_idx;
output_ptr[out_offset] = input_ptr[in_offset];
}
}
}
}
} else {
MACE_CHECK((input->dim(3) % out_shape[3]) == 0);
const index_t multiplier = input->dim(3) / out_shape[3];
#pragma omp parallel for collapse(4)
for (index_t n = 0; n < out_shape[0]; ++n) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
for (index_t c = 0; c < out_shape[3]; ++c) {
const index_t out_offset =
((n * height + h) * output_width + w)
* output_chan + c;
const index_t in_w_idx = w % input_width;
const index_t in_chan_idx = w / input_width + c * multiplier;
const index_t in_offset =
((n * height + h) * input_width + in_w_idx)
* input_chan + in_chan_idx;
output_ptr[out_offset] = input_ptr[in_offset];
}
}
}
}
}
}
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_REORGANIZE_H_
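Note: the new ReOrganizeFunctor above trades elements between the width and channel dimensions of an NHWC tensor. Below is a standalone sketch (not MACE code) of its width-to-channel branch, traced on a hypothetical {1, 1, 4, 1} tensor reorganized into {1, 1, 2, 2}; the shapes and values are assumptions for illustration.

// reorganize_sketch.cc -- standalone trace of the width-to-channel mapping.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> in_shape = {1, 1, 4, 1};   // NHWC
  const std::vector<int64_t> out_shape = {1, 1, 2, 2};  // NHWC
  const std::vector<float> input = {0.f, 1.f, 2.f, 3.f};
  std::vector<float> output(4, 0.f);

  const int64_t multiplier = out_shape[3] / in_shape[3];  // 2
  for (int64_t n = 0; n < out_shape[0]; ++n) {
    for (int64_t h = 0; h < out_shape[1]; ++h) {
      for (int64_t w = 0; w < out_shape[2]; ++w) {
        for (int64_t c = 0; c < out_shape[3]; ++c) {
          const int64_t out_offset =
              ((n * out_shape[1] + h) * out_shape[2] + w) * out_shape[3] + c;
          // Later output channels read from later slices of the input width.
          const int64_t in_w = w + (c % multiplier) * out_shape[2];
          const int64_t in_c = c / multiplier;
          const int64_t in_offset =
              ((n * in_shape[1] + h) * in_shape[2] + in_w) * in_shape[3] + in_c;
          output[out_offset] = input[in_offset];
        }
      }
    }
  }
  // Prints 0 2 1 3: the second half of the input width becomes the
  // second output channel.
  for (float v : output) std::cout << v << " ";
  std::cout << std::endl;
  return 0;
}

The else branch of the functor handles the opposite direction, folding surplus channels back into the width dimension.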
...@@ -179,7 +179,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float> ...@@ -179,7 +179,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T> struct ResizeBilinearFunctor<DeviceType::GPU, T>
: ResizeBilinearFunctorBase { : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {} : ResizeBilinearFunctorBase(size, align_corners) {}
......
...@@ -86,7 +86,7 @@ struct SliceFunctor : SliceFunctorBase { ...@@ -86,7 +86,7 @@ struct SliceFunctor : SliceFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase { struct SliceFunctor<DeviceType::GPU, T> : SliceFunctorBase {
explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {} explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
......
...@@ -94,7 +94,7 @@ struct SoftmaxFunctor<DeviceType::CPU, float> { ...@@ -94,7 +94,7 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SoftmaxFunctor<DeviceType::OPENCL, T> { struct SoftmaxFunctor<DeviceType::GPU, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
......
...@@ -59,7 +59,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { ...@@ -59,7 +59,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase { struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
......
...@@ -57,7 +57,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { ...@@ -57,7 +57,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T> struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase { : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(const Padding &padding_type,
const std::vector<int> &paddings) const std::vector<int> &paddings)
...@@ -111,7 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -111,7 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
: WinogradInverseTransformFunctorBase { : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const int batch, WinogradInverseTransformFunctor(const int batch,
const int height, const int height,
......
...@@ -26,16 +26,16 @@ void Register_Activation(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Activation(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ActivationOp<DeviceType::OPENCL, float>); ActivationOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ActivationOp<DeviceType::OPENCL, half>); ActivationOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -33,7 +33,7 @@ void ReluBenchmark( ...@@ -33,7 +33,7 @@ void ReluBenchmark(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -45,7 +45,7 @@ void ReluBenchmark( ...@@ -45,7 +45,7 @@ void ReluBenchmark(
.Output("Output") .Output("Output")
.AddStringArg("activation", "RELU") .AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -83,8 +83,8 @@ void ReluBenchmark( ...@@ -83,8 +83,8 @@ void ReluBenchmark(
#define BM_RELU(N, C, H, W) \ #define BM_RELU(N, C, H, W) \
BM_RELU_MACRO(N, C, H, W, float, CPU); \ BM_RELU_MACRO(N, C, H, W, float, CPU); \
BM_RELU_MACRO(N, C, H, W, float, OPENCL); \ BM_RELU_MACRO(N, C, H, W, float, GPU); \
BM_RELU_MACRO(N, C, H, W, half, OPENCL); BM_RELU_MACRO(N, C, H, W, half, GPU);
BM_RELU(1, 1, 512, 512); BM_RELU(1, 1, 512, 512);
BM_RELU(1, 3, 128, 128); BM_RELU(1, 3, 128, 128);
...@@ -107,7 +107,7 @@ void ReluxBenchmark( ...@@ -107,7 +107,7 @@ void ReluxBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -151,8 +151,8 @@ void ReluxBenchmark( ...@@ -151,8 +151,8 @@ void ReluxBenchmark(
#define BM_RELUX(N, C, H, W) \ #define BM_RELUX(N, C, H, W) \
BM_RELUX_MACRO(N, C, H, W, float, CPU); \ BM_RELUX_MACRO(N, C, H, W, float, CPU); \
BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \ BM_RELUX_MACRO(N, C, H, W, float, GPU); \
BM_RELUX_MACRO(N, C, H, W, half, OPENCL); BM_RELUX_MACRO(N, C, H, W, half, GPU);
BM_RELUX(1, 1, 512, 512); BM_RELUX(1, 1, 512, 512);
BM_RELUX(1, 3, 128, 128); BM_RELUX(1, 3, 128, 128);
...@@ -171,7 +171,7 @@ void PreluBenchmark( ...@@ -171,7 +171,7 @@ void PreluBenchmark(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -185,7 +185,7 @@ void PreluBenchmark( ...@@ -185,7 +185,7 @@ void PreluBenchmark(
.Output("Output") .Output("Output")
.AddStringArg("activation", "PRELU") .AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage", BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
...@@ -226,8 +226,8 @@ void PreluBenchmark( ...@@ -226,8 +226,8 @@ void PreluBenchmark(
#define BM_PRELU(N, C, H, W) \ #define BM_PRELU(N, C, H, W) \
BM_PRELU_MACRO(N, C, H, W, float, CPU); \ BM_PRELU_MACRO(N, C, H, W, float, CPU); \
BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \ BM_PRELU_MACRO(N, C, H, W, float, GPU); \
BM_PRELU_MACRO(N, C, H, W, half, OPENCL); BM_PRELU_MACRO(N, C, H, W, half, GPU);
BM_PRELU(1, 1, 512, 512); BM_PRELU(1, 1, 512, 512);
BM_PRELU(1, 3, 128, 128); BM_PRELU(1, 3, 128, 128);
...@@ -250,7 +250,7 @@ void TanhBenchmark( ...@@ -250,7 +250,7 @@ void TanhBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -292,8 +292,8 @@ void TanhBenchmark( ...@@ -292,8 +292,8 @@ void TanhBenchmark(
#define BM_TANH(N, C, H, W) \ #define BM_TANH(N, C, H, W) \
BM_TANH_MACRO(N, C, H, W, float, CPU); \ BM_TANH_MACRO(N, C, H, W, float, CPU); \
BM_TANH_MACRO(N, C, H, W, float, OPENCL); \ BM_TANH_MACRO(N, C, H, W, float, GPU); \
BM_TANH_MACRO(N, C, H, W, half, OPENCL); BM_TANH_MACRO(N, C, H, W, half, GPU);
BM_TANH(1, 1, 512, 512); BM_TANH(1, 1, 512, 512);
BM_TANH(1, 3, 128, 128); BM_TANH(1, 3, 128, 128);
...@@ -316,7 +316,7 @@ void SigmoidBenchmark( ...@@ -316,7 +316,7 @@ void SigmoidBenchmark(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -359,8 +359,8 @@ void SigmoidBenchmark( ...@@ -359,8 +359,8 @@ void SigmoidBenchmark(
#define BM_SIGMOID(N, C, H, W) \ #define BM_SIGMOID(N, C, H, W) \
BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \ BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
BM_SIGMOID_MACRO(N, C, H, W, float, OPENCL); \ BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
BM_SIGMOID_MACRO(N, C, H, W, half, OPENCL); BM_SIGMOID_MACRO(N, C, H, W, half, GPU);
BM_SIGMOID(1, 1, 512, 512); BM_SIGMOID(1, 1, 512, 512);
BM_SIGMOID(1, 3, 128, 128); BM_SIGMOID(1, 3, 128, 128);
......
...@@ -31,7 +31,7 @@ void TestSimpleRelu() { ...@@ -31,7 +31,7 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -68,7 +68,7 @@ void TestSimpleRelu() { ...@@ -68,7 +68,7 @@ void TestSimpleRelu() {
TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimpleRelu) { TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
TestSimpleRelu<DeviceType::OPENCL>(); TestSimpleRelu<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -79,7 +79,7 @@ void TestUnalignedSimpleRelu() { ...@@ -79,7 +79,7 @@ void TestUnalignedSimpleRelu() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -117,7 +117,7 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) { ...@@ -117,7 +117,7 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
} }
TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) { TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu<DeviceType::OPENCL>(); TestUnalignedSimpleRelu<DeviceType::GPU>();
} }
...@@ -131,7 +131,7 @@ void TestSimpleRelux() { ...@@ -131,7 +131,7 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -170,7 +170,7 @@ void TestSimpleRelux() { ...@@ -170,7 +170,7 @@ void TestSimpleRelux() {
TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimple) { TEST_F(ActivationOpTest, OPENCLSimple) {
TestSimpleRelux<DeviceType::OPENCL>(); TestSimpleRelux<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -183,7 +183,7 @@ void TestSimpleReluRelux() { ...@@ -183,7 +183,7 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -224,7 +224,7 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) { ...@@ -224,7 +224,7 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) {
} }
TEST_F(ActivationOpTest, OPENCLSimpleRelux) { TEST_F(ActivationOpTest, OPENCLSimpleRelux) {
TestSimpleReluRelux<DeviceType::OPENCL>(); TestSimpleReluRelux<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -238,7 +238,7 @@ void TestSimplePrelu() { ...@@ -238,7 +238,7 @@ void TestSimplePrelu() {
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}); net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage", BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
...@@ -283,7 +283,7 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) { ...@@ -283,7 +283,7 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) {
} }
TEST_F(ActivationOpTest, OPENCLSimplePrelu) { TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::OPENCL>(); TestSimplePrelu<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -296,7 +296,7 @@ void TestSimpleTanh() { ...@@ -296,7 +296,7 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -336,7 +336,7 @@ void TestSimpleTanh() { ...@@ -336,7 +336,7 @@ void TestSimpleTanh() {
TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); } TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); }
TEST_F(ActivationOpTest, OPENCLSimpleTanh) { TEST_F(ActivationOpTest, OPENCLSimpleTanh) {
TestSimpleTanh<DeviceType::OPENCL>(); TestSimpleTanh<DeviceType::GPU>();
} }
namespace { namespace {
...@@ -349,7 +349,7 @@ void TestSimpleSigmoid() { ...@@ -349,7 +349,7 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -392,7 +392,7 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) { ...@@ -392,7 +392,7 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
} }
TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) { TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) {
TestSimpleSigmoid<DeviceType::OPENCL>(); TestSimpleSigmoid<DeviceType::GPU>();
} }
} // namespace test } // namespace test
......
...@@ -26,16 +26,16 @@ void Register_AddN(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_AddN(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
AddNOp<DeviceType::OPENCL, float>); AddNOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
AddNOp<DeviceType::OPENCL, half>); AddNOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -33,7 +33,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -33,7 +33,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c}); net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(), BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(), MakeString("InputImage", i).c_str(),
...@@ -82,8 +82,8 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -82,8 +82,8 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
#define BM_ADDN(INPUTS, N, H, W, C) \ #define BM_ADDN(INPUTS, N, H, W, C) \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \ BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \ BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL); BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);
BM_ADDN(2, 1, 256, 256, 32); BM_ADDN(2, 1, 256, 256, 32);
BM_ADDN(2, 1, 128, 128, 32); BM_ADDN(2, 1, 128, 128, 32);
......
...@@ -64,7 +64,7 @@ void SimpleAdd3() { ...@@ -64,7 +64,7 @@ void SimpleAdd3() {
{-0.1582, 2, 3, 4, 5, 6}); {-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4; const int input_num = 4;
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
// run on gpu // run on gpu
for (int i = 0; i < input_num; ++i) { for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i), BufferToImage<D, half>(&net, MakeString("Input", i),
...@@ -105,7 +105,7 @@ void SimpleAdd3() { ...@@ -105,7 +105,7 @@ void SimpleAdd3() {
} // namespace } // namespace
TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); } TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::OPENCL>(); } TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::GPU>(); }
namespace { namespace {
template <DeviceType D> template <DeviceType D>
...@@ -166,7 +166,7 @@ void RandomTest() { ...@@ -166,7 +166,7 @@ void RandomTest() {
} }
} // namespace } // namespace
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); } TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::GPU>(); }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -26,16 +26,16 @@ void Register_BatchNorm(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BatchNormOp<DeviceType::OPENCL, float>); BatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BatchNormOp<DeviceType::OPENCL, half>); BatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -32,7 +32,7 @@ void BatchNorm( ...@@ -32,7 +32,7 @@ void BatchNorm(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -52,7 +52,7 @@ void BatchNorm( ...@@ -52,7 +52,7 @@ void BatchNorm(
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage", BufferToImage<D, float>(&net, "Scale", "ScaleImage",
...@@ -107,8 +107,8 @@ void BatchNorm( ...@@ -107,8 +107,8 @@ void BatchNorm(
#define BM_BATCH_NORM(N, C, H, W) \ #define BM_BATCH_NORM(N, C, H, W) \
BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \ BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \ BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL); BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);
BM_BATCH_NORM(1, 1, 512, 512); BM_BATCH_NORM(1, 1, 512, 512);
BM_BATCH_NORM(1, 3, 128, 128); BM_BATCH_NORM(1, 3, 128, 128);
......
...@@ -49,7 +49,7 @@ void Simple() { ...@@ -49,7 +49,7 @@ void Simple() {
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage", BufferToImage<D, float>(&net, "Scale", "ScaleImage",
...@@ -90,7 +90,7 @@ void Simple() { ...@@ -90,7 +90,7 @@ void Simple() {
TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); } TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); } TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// generate random input // generate random input
...@@ -103,12 +103,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -103,12 +103,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -139,15 +139,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -139,15 +139,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -162,14 +162,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -162,14 +162,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -186,12 +186,12 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -186,12 +186,12 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -221,15 +221,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -221,15 +221,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -245,14 +245,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -245,14 +245,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Tuning // Tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
} }
...@@ -269,12 +269,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -269,12 +269,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -304,15 +304,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -304,15 +304,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -327,14 +327,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -327,14 +327,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -351,12 +351,12 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -351,12 +351,12 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -386,15 +386,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -386,15 +386,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage", BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage", BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
...@@ -410,14 +410,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -410,14 +410,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
unsetenv("MACE_TUNING"); unsetenv("MACE_TUNING");
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
} }
......
...@@ -20,15 +20,15 @@ namespace ops { ...@@ -20,15 +20,15 @@ namespace ops {
void Register_BatchToSpaceND(OperatorRegistry *op_registry) { void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, float>); BatchToSpaceNDOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, half>); BatchToSpaceNDOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -63,7 +63,7 @@ void BMBatchToSpace( ...@@ -63,7 +63,7 @@ void BMBatchToSpace(
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE) BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \ #define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, OPENCL); BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);
BM_BATCH_TO_SPACE(128, 8, 8, 128, 2); BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
BM_BATCH_TO_SPACE(4, 128, 128, 32, 2); BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
......
...@@ -26,16 +26,16 @@ void Register_BiasAdd(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BiasAddOp<DeviceType::OPENCL, float>); BiasAddOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BiasAddOp<DeviceType::OPENCL, half>); BiasAddOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -31,7 +31,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -31,7 +31,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -44,7 +44,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -44,7 +44,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
.Input("Bias") .Input("Bias")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage", BufferToImage<D, T>(&net, "Bias", "BiasImage",
...@@ -84,8 +84,8 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -84,8 +84,8 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
#define BM_BIAS_ADD(N, C, H, W) \ #define BM_BIAS_ADD(N, C, H, W) \
BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \ BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, float, OPENCL); \ BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
BM_BIAS_ADD_MACRO(N, C, H, W, half, OPENCL); BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
BM_BIAS_ADD(1, 1, 512, 512); BM_BIAS_ADD(1, 1, 512, 512);
BM_BIAS_ADD(1, 3, 128, 128); BM_BIAS_ADD(1, 3, 128, 128);
......
...@@ -47,7 +47,7 @@ void BiasAddSimple() { ...@@ -47,7 +47,7 @@ void BiasAddSimple() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage", BufferToImage<D, float>(&net, "Bias", "BiasImage",
...@@ -80,7 +80,7 @@ void BiasAddSimple() { ...@@ -80,7 +80,7 @@ void BiasAddSimple() {
TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); } TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }
TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) { TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) {
BiasAddSimple<DeviceType::OPENCL>(); BiasAddSimple<DeviceType::GPU>();
} }
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
...@@ -94,9 +94,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -94,9 +94,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -123,9 +123,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -123,9 +123,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
...@@ -135,10 +135,10 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -135,10 +135,10 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
...@@ -154,9 +154,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -154,9 +154,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -182,9 +182,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -182,9 +182,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
...@@ -194,10 +194,10 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -194,10 +194,10 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
......
...@@ -19,16 +19,16 @@ namespace ops { ...@@ -19,16 +19,16 @@ namespace ops {
void Register_BufferToImage(OperatorRegistry *op_registry) { void Register_BufferToImage(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::OPENCL, float>); BufferToImageOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::OPENCL, half>); BufferToImageOp<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -54,73 +54,73 @@ void TestBidirectionTransform(const int type, ...@@ -54,73 +54,73 @@ void TestBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgSmall) { TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {1});
} }
TEST(BufferToImageTest, ArgHalfSmall) { TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT, {11});
} }
TEST(BufferToImageTest, ArgMedia) { TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {11});
} }
TEST(BufferToImageTest, ArgLarge) { TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {256}); TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {256});
} }
TEST(BufferToImageTest, InputSmallSingleChannel) { TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 1}); {1, 2, 3, 1});
} }
TEST(BufferToImageTest, InputSmallMultipleChannel) { TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{1, 2, 3, 3}); {1, 2, 3, 3});
} }
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 2, 3, 3}); {3, 2, 3, 3});
} }
TEST(BufferToImageTest, InputMedia) { TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 13, 17, 128}); {3, 13, 17, 128});
} }
TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
{3, 64, 64, 256}); {3, 64, 64, 256});
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 3, 5}); {1, 1, 3, 5});
} }
TEST(BufferToImageTest, Filter1x1Media) { TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 13, 17}); {1, 1, 13, 17});
} }
TEST(BufferToImageTest, Filter1x1Large) { TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{1, 1, 128, 512}); {1, 1, 128, 512});
} }
TEST(BufferToImageTest, Filter3x3Small) { TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 3, 5}); {3, 3, 3, 5});
} }
TEST(BufferToImageTest, Filter3x3Meida) { TEST(BufferToImageTest, Filter3x3Meida) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 13, 17}); {3, 3, 13, 17});
} }
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
{3, 3, 128, 256}); {3, 3, 128, 256});
} }
...@@ -158,7 +158,7 @@ void TestDiffTypeBidirectionTransform(const int type, ...@@ -158,7 +158,7 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) { TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
{11}); {11});
} }
...@@ -203,7 +203,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { ...@@ -203,7 +203,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = { const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40, 0xCD, 0x3C, 0x33, 0x40,
}; };
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>( TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
kernels::ARGUMENT, {2}, input_data); kernels::ARGUMENT, {2}, input_data);
} }
......
...@@ -26,16 +26,16 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ChannelShuffleOp<DeviceType::OPENCL, float>); ChannelShuffleOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ChannelShuffleOp<DeviceType::OPENCL, half>); ChannelShuffleOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -35,7 +35,7 @@ class ChannelShuffleOp : public Operator<D, T> { ...@@ -35,7 +35,7 @@ class ChannelShuffleOp : public Operator<D, T> {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
int channels; int channels;
if (D == OPENCL) { if (D == GPU) {
channels = input->dim(3); channels = input->dim(3);
} else if (D == CPU) { } else if (D == CPU) {
channels = input->dim(1); channels = input->dim(1);
......
...@@ -31,7 +31,7 @@ void ChannelShuffle( ...@@ -31,7 +31,7 @@ void ChannelShuffle(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, height, channels, width}); net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -42,7 +42,7 @@ void ChannelShuffle( ...@@ -42,7 +42,7 @@ void ChannelShuffle(
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -82,8 +82,8 @@ void ChannelShuffle( ...@@ -82,8 +82,8 @@ void ChannelShuffle(
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \ #define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \ BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, OPENCL); \ BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, OPENCL); BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);
BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8); BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8); BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
......
...@@ -60,11 +60,11 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -60,11 +60,11 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<DeviceType::OPENCL, float>( net.AddInputFromArray<DeviceType::GPU, float>(
"Input", {1, 1, 2, 16}, "Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
...@@ -74,10 +74,10 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -74,10 +74,10 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
// Transfer output // Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
......
...@@ -26,16 +26,16 @@ void Register_Concat(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Concat(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ConcatOp<DeviceType::OPENCL, float>); ConcatOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ConcatOp<DeviceType::OPENCL, half>); ConcatOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -74,12 +74,12 @@ void OpenclConcatHelper(int iters, ...@@ -74,12 +74,12 @@ void OpenclConcatHelper(int iters,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImage0", BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImage1", BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM") OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0") .Input("InputImage0")
...@@ -91,7 +91,7 @@ void OpenclConcatHelper(int iters, ...@@ -91,7 +91,7 @@ void OpenclConcatHelper(int iters,
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
} }
const int64_t tot = const int64_t tot =
...@@ -101,7 +101,7 @@ void OpenclConcatHelper(int iters, ...@@ -101,7 +101,7 @@ void OpenclConcatHelper(int iters,
testing::BytesProcessed(tot * sizeof(T)); testing::BytesProcessed(tot * sizeof(T));
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
} }
} }
} // namespace } // namespace
......
...@@ -171,9 +171,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -171,9 +171,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size += shapes[i][axis]; concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]); GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::OPENCL, float>(input_name, net.AddInputFromArray<DeviceType::GPU, float>(input_name,
shapes[i], inputs[i]); shapes[i], inputs[i]);
BufferToImage<DeviceType::OPENCL, T>(&net, input_name, image_name, BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
...@@ -188,9 +188,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -188,9 +188,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
......
...@@ -26,16 +26,16 @@ void Register_Conv2D(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
Conv2dOp<DeviceType::OPENCL, float>); Conv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
Conv2dOp<DeviceType::OPENCL, half>); Conv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#define MACE_OPS_CONV_2D_H_ #define MACE_OPS_CONV_2D_H_
#include <memory> #include <memory>
#include <string>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/conv_2d.h" #include "mace/kernels/conv_2d.h"
...@@ -33,8 +34,10 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -33,8 +34,10 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->padding_type_, this->padding_type_,
this->paddings_, this->paddings_,
this->dilations_.data(), this->dilations_.data(),
kernels::ActivationType::NOOP, kernels::StringToActivationType(
0.0f, OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetSingleArgument<int>( static_cast<bool>(OperatorBase::GetSingleArgument<int>(
"is_filter_transformed", false)), "is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {} ws->GetScratchBuffer(D)) {}
......
...@@ -47,7 +47,7 @@ void Conv2d(int iters, ...@@ -47,7 +47,7 @@ void Conv2d(int iters,
{output_channels, channels, kernel_h, {output_channels, channels, kernel_h,
kernel_w}); kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter", net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, output_channels, {kernel_h, kernel_w, output_channels,
...@@ -68,7 +68,7 @@ void Conv2d(int iters, ...@@ -68,7 +68,7 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation}) .AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -138,8 +138,8 @@ void Conv2d(int iters, ...@@ -138,8 +138,8 @@ void Conv2d(int iters,
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \ #define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL); BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);
......
...@@ -63,7 +63,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -63,7 +63,7 @@ void TestNHWCSimple3x3VALID() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -134,7 +134,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -134,7 +134,7 @@ void TestNHWCSimple3x3SAME() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -176,8 +176,8 @@ TEST_F(Conv2dOpTest, CPUSimple) { ...@@ -176,8 +176,8 @@ TEST_F(Conv2dOpTest, CPUSimple) {
} }
TEST_F(Conv2dOpTest, OPENCLSimple) { TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>(); TestNHWCSimple3x3VALID<DeviceType::GPU, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>(); TestNHWCSimple3x3SAME<DeviceType::GPU, float>();
} }
namespace { namespace {
...@@ -219,7 +219,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -219,7 +219,7 @@ void TestNHWCSimple3x3WithoutBias() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -255,7 +255,7 @@ TEST_F(Conv2dOpTest, CPUWithoutBias) { ...@@ -255,7 +255,7 @@ TEST_F(Conv2dOpTest, CPUWithoutBias) {
} }
TEST_F(Conv2dOpTest, OPENCLWithoutBias) { TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>(); TestNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
} }
namespace { namespace {
...@@ -301,7 +301,7 @@ void TestNHWCCombined3x3() { ...@@ -301,7 +301,7 @@ void TestNHWCCombined3x3() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -341,9 +341,164 @@ TEST_F(Conv2dOpTest, CPUStride2) { ...@@ -341,9 +341,164 @@ TEST_F(Conv2dOpTest, CPUStride2) {
} }
TEST_F(Conv2dOpTest, OPENCLStride2) { TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::OPENCL, float>(); TestNHWCCombined3x3<DeviceType::GPU, float>();
} }
namespace {
template<DeviceType D, typename T>
void TestFusedNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
template<DeviceType D, typename T>
void TestFusedNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, float>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
}
} // namespace
TEST_F(Conv2dOpTest, FusedCPUSimple) {
TestFusedNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestFusedNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, FusedOPENCLSimple) {
TestFusedNHWCSimple3x3VALID<DeviceType::GPU, float>();
TestFusedNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
}
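A quick check of the expected {1, 1, 1, 1} tensor used by both fused tests: every input element is -1 and every filter weight is 1, so the single VALID 3x3 output position sums 3 * 3 * 2 = 18 products of -1, giving -18 (or -18.1 once the -0.1 bias is added); the fused RELU then clamps this negative value to 0, which is the 0.0f the tests compare against.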
namespace { namespace {
template<DeviceType D> template<DeviceType D>
void TestConv1x1() { void TestConv1x1() {
...@@ -389,7 +544,7 @@ void TestConv1x1() { ...@@ -389,7 +544,7 @@ void TestConv1x1() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Filter", "FilterImage", BufferToImage<D, float>(&net, "Filter", "FilterImage",
...@@ -431,7 +586,7 @@ void TestConv1x1() { ...@@ -431,7 +586,7 @@ void TestConv1x1() {
TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); } TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); } TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); }
namespace { namespace {
template<DeviceType D, typename T> template<DeviceType D, typename T>
...@@ -524,18 +679,18 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -524,18 +679,18 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1); TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 2); TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 2);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 1); TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 1);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 2); TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 2);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, 3); TestComplexConvNxNS12<DeviceType::GPU, float>({31, 113, 13, 17}, 3);
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4); TestComplexConvNxNS12<DeviceType::GPU, float>({32, 32, 13, 17}, 4);
} }
namespace { namespace {
...@@ -644,52 +799,52 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -644,52 +799,52 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 1, 32, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {3, 3, 32, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {3, 3, 32, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {15, 1, 256, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 15, 256, 2}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 64}, TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 64},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {1, 1, 5, 7}, TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {1, 1, 5, 7},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {3, 3, 5, 7}, TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {3, 3, 5, 7},
{1, 1}); {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {5, 5, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {5, 5, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {7, 7, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {7, 7, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) { TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, {7, 7, 16, 16}, TestHalfComplexConvNxNS12<DeviceType::GPU>({63, 67}, {7, 7, 16, 16},
{4, 4}); {4, 4});
} }
...@@ -787,15 +942,115 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -787,15 +942,115 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) { TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2); TestDilationConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, 2);
} }
TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) { TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4); TestDilationConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, 4);
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) { TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4); TestDilationConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, 4);
}
namespace {
template<DeviceType D>
void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", dilations)
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Filter", "FilterImage",
kernels::BufferType::CONV2D_FILTER);
BufferToImage<D, half>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", dilations)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-1);
};
func(1, 1, VALID);
func(1, 1, SAME);
}
} // namespace
TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({32, 32}, {7, 7, 16, 3},
{2, 2});
}
TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) {
TestGeneralHalfAtrousConv<DeviceType::GPU>({63, 71}, {15, 15, 16, 16},
{2, 2});
} }
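For these atrous cases the effective kernel extent is k + (k - 1) * (d - 1): the 7x7 filter with dilation 2 covers 13x13 and the 15x15 filter with dilation 2 covers 29x29, both of which still fit inside the 32x32 and 63x71 inputs used above (note that the second test passes dilations {2, 2} even though its name carries a D4 suffix).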
namespace { namespace {
...@@ -887,16 +1142,16 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -887,16 +1142,16 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
} // namespace } // namespace
TEST_F(Conv2dOpTest, OPENCLAlignedPad1) { TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1}); TestArbitraryPadConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, {1, 1});
} }
TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16},
{2, 2}); {2, 2});
} }
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) { TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4}); TestArbitraryPadConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, {4, 4});
} }
} // namespace test } // namespace test
......
...@@ -32,7 +32,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -32,7 +32,7 @@ TEST(CoreTest, INIT_MODE) {
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = Tensor *input =
ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::OPENCL), ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3}); input->Resize({1, 3, 3, 3});
{ {
...@@ -54,13 +54,13 @@ TEST(CoreTest, INIT_MODE) { ...@@ -54,13 +54,13 @@ TEST(CoreTest, INIT_MODE) {
} }
std::shared_ptr<OperatorRegistry> op_registry(new OperatorRegistry()); std::shared_ptr<OperatorRegistry> op_registry(new OperatorRegistry());
auto net = auto net =
CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL, NetMode::INIT); CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL); net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("Output") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
......
...@@ -26,16 +26,16 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthToSpaceOp<DeviceType::OPENCL, float>); DepthToSpaceOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
DepthToSpaceOp<DeviceType::OPENCL, half>); DepthToSpaceOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -40,7 +40,7 @@ class DepthToSpaceOp : public Operator<D, T> { ...@@ -40,7 +40,7 @@ class DepthToSpaceOp : public Operator<D, T> {
int input_depth; int input_depth;
if (D == CPU) { if (D == CPU) {
input_depth = input->dim(1); input_depth = input->dim(1);
} else if (D == OPENCL) { } else if (D == GPU) {
input_depth = input->dim(3); input_depth = input->dim(3);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
......
...@@ -31,7 +31,7 @@ void DepthToSpace( ...@@ -31,7 +31,7 @@ void DepthToSpace(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -42,7 +42,7 @@ void DepthToSpace( ...@@ -42,7 +42,7 @@ void DepthToSpace(
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -82,8 +82,8 @@ void DepthToSpace( ...@@ -82,8 +82,8 @@ void DepthToSpace(
#define BM_DEPTH_TO_SPACE(N, C, H, W, G) \ #define BM_DEPTH_TO_SPACE(N, C, H, W, G) \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \ BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, OPENCL); \ BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, OPENCL); BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);
BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4); BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4); BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);
......
...@@ -65,8 +65,8 @@ void RunDepthToSpace(const bool d2s, ...@@ -65,8 +65,8 @@ void RunDepthToSpace(const bool d2s,
} }
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = CreateTensor<float>(expected_shape, expected_data);
...@@ -88,7 +88,7 @@ TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) { ...@@ -88,7 +88,7 @@ TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
} }
TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 4, 4}, RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 4, 4},
{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2,
...@@ -110,7 +110,7 @@ TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) { ...@@ -110,7 +110,7 @@ TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
} }
TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) { TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 2, 4}, RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 2, 4},
{1, 2, 3, 4, 5, 6, 7, 8, {1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16}, 9, 10, 11, 12, 13, 14, 15, 16},
2, 2,
...@@ -132,7 +132,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) { ...@@ -132,7 +132,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
} }
TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 2, 16}, RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
2, 2,
...@@ -152,7 +152,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) { ...@@ -152,7 +152,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
} }
TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 1, 16}, RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 1, 16},
{1, 2, 3, 4, 5, 6, 7, 8, {1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16}, 9, 10, 11, 12, 13, 14, 15, 16},
2, 2,
...@@ -165,7 +165,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) { ...@@ -165,7 +165,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) { TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
const std::vector<float > in = std::vector<float >(192 * 192 *128, 1.0); const std::vector<float > in = std::vector<float >(192 * 192 *128, 1.0);
RunDepthToSpace<DeviceType::OPENCL>(true, {1, 192, 192, 128}, RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128},
in, in,
2, 2,
{1, 384, 384, 32}, {1, 384, 384, 32},
...@@ -234,19 +234,19 @@ void RandomTest(const bool d2s, const int block_size, ...@@ -234,19 +234,19 @@ void RandomTest(const bool d2s, const int block_size,
} // namespace } // namespace
TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) { TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::OPENCL, float>(true, 2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, float>(true, 2, {1, 192, 192, 128});
} }
TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) { TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::OPENCL, half>(true, 2, {1, 192, 192, 128}); RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) { TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
RandomTest<DeviceType::OPENCL, float>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32});
} }
TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) { TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
RandomTest<DeviceType::OPENCL, half>(false, 2, {1, 384, 384, 32}); RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32});
} }
} // namespace test } // namespace test
......
...@@ -26,16 +26,16 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, float>); DepthwiseConv2dOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, half>); DepthwiseConv2dOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -46,7 +46,7 @@ void DepthwiseConv2d(int iters, ...@@ -46,7 +46,7 @@ void DepthwiseConv2d(int iters,
net.AddRandomInput<D, float>( net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w}); "Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier}); net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels}); {batch, height, width, input_channels});
net.AddRandomInput<D, float>( net.AddRandomInput<D, float>(
...@@ -67,7 +67,7 @@ void DepthwiseConv2d(int iters, ...@@ -67,7 +67,7 @@ void DepthwiseConv2d(int iters,
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -134,8 +134,8 @@ void DepthwiseConv2d(int iters, ...@@ -134,8 +134,8 @@ void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL); BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
......
...@@ -59,7 +59,7 @@ void SimpleValidTest() { ...@@ -59,7 +59,7 @@ void SimpleValidTest() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -105,11 +105,11 @@ TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { ...@@ -105,11 +105,11 @@ TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
} }
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) { TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) {
SimpleValidTest<DeviceType::OPENCL, float>(); SimpleValidTest<DeviceType::GPU, float>();
} }
TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) { TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
SimpleValidTest<DeviceType::OPENCL, half>(); SimpleValidTest<DeviceType::GPU, half>();
} }
namespace { namespace {
...@@ -184,7 +184,7 @@ void ComplexValidTest() { ...@@ -184,7 +184,7 @@ void ComplexValidTest() {
NCHW, NCHW,
"Output", "Output",
NHWC); NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage", BufferToImage<D, T>(&net, "Filter", "FilterImage",
...@@ -245,11 +245,11 @@ TEST_F(DepthwiseConv2dOpTest, ComplexCPU) { ...@@ -245,11 +245,11 @@ TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
} }
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) { TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
ComplexValidTest<DeviceType::OPENCL, float>(); ComplexValidTest<DeviceType::GPU, float>();
} }
TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) { TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
ComplexValidTest<DeviceType::OPENCL, half>(); ComplexValidTest<DeviceType::GPU, half>();
} }
namespace { namespace {
...@@ -267,12 +267,12 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -267,12 +267,12 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, {batch, height, width,
input_channels}); input_channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier}); "Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier {multiplier
* input_channels}); * input_channels});
...@@ -307,11 +307,11 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -307,11 +307,11 @@ void TestNxNS12(const index_t height, const index_t width) {
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Filter", "FilterImage", BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER); kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage") .Input("InputImage")
...@@ -324,10 +324,10 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -324,10 +324,10 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
// Transfer output // Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net, ImageToBuffer<DeviceType::GPU, float>(&net,
"OutputImage", "OutputImage",
"DeviceOutput", "DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
......
...@@ -26,16 +26,16 @@ void Register_Eltwise(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
EltwiseOp<DeviceType::OPENCL, float>); EltwiseOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
EltwiseOp<DeviceType::OPENCL, half>); EltwiseOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -34,10 +34,10 @@ void EltwiseBenchmark( ...@@ -34,10 +34,10 @@ void EltwiseBenchmark(
net.AddRandomInput<D, T>("Input0", {n, h, w, c}); net.AddRandomInput<D, T>("Input0", {n, h, w, c});
net.AddRandomInput<D, T>("Input1", {n, h, w, c}); net.AddRandomInput<D, T>("Input1", {n, h, w, c});
if (D == DeviceType::OPENCL) { if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input0", "InputImg0", BufferToImage<D, half>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Input1", "InputImg1", BufferToImage<D, half>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0") .Input("InputImg0")
...@@ -90,8 +90,8 @@ void EltwiseBenchmark( ...@@ -90,8 +90,8 @@ void EltwiseBenchmark(
#define BM_ELTWISE(ELT_TYPE, N, H, W, C) \ #define BM_ELTWISE(ELT_TYPE, N, H, W, C) \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \ BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, OPENCL); \ BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, OPENCL); BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);
BM_ELTWISE(2, 1, 128, 128, 32); BM_ELTWISE(2, 1, 128, 128, 32);
BM_ELTWISE(2, 1, 240, 240, 256); BM_ELTWISE(2, 1, 240, 240, 256);
......
...@@ -183,45 +183,45 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) { ...@@ -183,45 +183,45 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) { TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUM, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUM,
{1, 1, 1, 1}, {1}, 1, {1, 1, 1, 1}, {1}, 1,
{2}); {2});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUB, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUB,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
1, 1,
{0, 1, 2, 3, 4, 5}); {0, 1, 2, 3, 4, 5});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::PROD, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::PROD,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
2, 2,
{2, 4, 6, 8, 10, 12}); {2, 4, 6, 8, 10, 12});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::DIV, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::DIV,
{1, 1, 2, 3}, {1, 1, 2, 3},
{2, 4, 6, 8, 10, 12}, {2, 4, 6, 8, 10, 12},
2, 2,
{1, 2, 3, 4, 5, 6}); {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MIN, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MIN,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
1, 1,
{1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MAX, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MAX,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
3, 3,
{3, 3, 3, 4, 5, 6}); {3, 3, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::NEG, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::NEG,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
3, 3,
{-1, -2, -3, -4, -5, -6}); {-1, -2, -3, -4, -5, -6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::ABS, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::ABS,
{1, 1, 2, 3}, {1, 1, 2, 3},
{-1, -2, -3, -4, -5, -6}, {-1, -2, -3, -4, -5, -6},
3, 3,
{1, 2, 3, 4, 5, 6}); {1, 2, 3, 4, 5, 6});
SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SQR_DIFF, SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SQR_DIFF,
{1, 1, 2, 3}, {1, 1, 2, 3},
{1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
1, 1,
...@@ -277,49 +277,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) { ...@@ -277,49 +277,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) { TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 1, 3}, {1, 2, 3}, {1, 1, 1, 3}, {1, 2, 3},
{2, 4, 6, 5, 7, 9}); {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, kernels::EltwiseType::SUB,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{0, 0, 0, 0, 0, 5, 5, 5, 5, 5}); {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, kernels::EltwiseType::SUB,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, -5, -5, -5, -5, -5}); {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD, kernels::EltwiseType::PROD,
{1, 1, 1, 3}, {1, 2, 3}, {1, 1, 1, 3}, {1, 2, 3},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 4, 10, 18}); {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, kernels::EltwiseType::DIV,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 1, 1, 1, 5}, {1, 1, 1, 5}, {1, 1, 1, 1, 5},
{1, 2, 3, 4, 1, 6, 7, 8, 9, 2}); {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, kernels::EltwiseType::DIV,
{1, 1, 1, 5}, {1, 1, 1, 2, 4}, {1, 1, 1, 5}, {1, 1, 1, 2, 4},
{1, 2, 1, 5}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 2, 2, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 2, 1, 1, 1, 2, 4}); {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN, kernels::EltwiseType::MIN,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX, kernels::EltwiseType::MAX,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF, kernels::EltwiseType::SQR_DIFF,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
...@@ -369,43 +369,43 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) { ...@@ -369,43 +369,43 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
25}); 25});
} }
TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) { TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{2, 4, 6, 8, 10, 12}); {2, 4, 6, 8, 10, 12});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUM, kernels::EltwiseType::SUM,
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
{0.2, 0.4, 0.6, 0.8, 1, 1.2}, {0.2, 0.4, 0.6, 0.8, 1, 1.2},
{0.1, 0.1}); {0.1, 0.1});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SUB, kernels::EltwiseType::SUB,
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 1, 1, 5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{0, 0, 0, 0, 0}); {0, 0, 0, 0, 0});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::PROD, kernels::EltwiseType::PROD,
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 4, 9, 16, 25, 36}); {1, 4, 9, 16, 25, 36});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::DIV, kernels::EltwiseType::DIV,
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 2, 1, 3}, {1, 2, 3, 4, 5, 6}, {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
{1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MIN, kernels::EltwiseType::MIN,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 3, 4, 5, 1, 2, 3, 4, 5}); {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::MAX, kernels::EltwiseType::MAX,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::OPENCL, float>( SimpleTensorEltwise<DeviceType::GPU, float>(
kernels::EltwiseType::SQR_DIFF, kernels::EltwiseType::SQR_DIFF,
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
...@@ -420,7 +420,7 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -420,7 +420,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input", shape, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input", shape, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -441,7 +441,7 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -441,7 +441,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImg", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg") .Input("InputImg")
...@@ -452,15 +452,15 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -452,15 +452,15 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
...@@ -474,8 +474,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -474,8 +474,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0, true, true);
net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1, true, true); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC,
"TInput0", NCHW); "TInput0", NCHW);
...@@ -496,9 +496,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -496,9 +496,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
Tensor expected; Tensor expected;
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImg0", BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImg1", BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest") OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0") .Input("InputImg0")
...@@ -510,15 +510,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -510,15 +510,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
......
...@@ -26,16 +26,16 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::OPENCL, float>); FoldedBatchNormOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>); FoldedBatchNormOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -60,7 +60,7 @@ void Simple() { ...@@ -60,7 +60,7 @@ void Simple() {
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage", BufferToImage<D, float>(&net, "Scale", "ScaleImage",
...@@ -94,7 +94,7 @@ void Simple() { ...@@ -94,7 +94,7 @@ void Simple() {
TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); } TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); } TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }
TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// generate random input // generate random input
...@@ -108,10 +108,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -108,10 +108,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -138,11 +138,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -138,11 +138,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -153,10 +153,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -153,10 +153,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -173,10 +173,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -173,10 +173,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -203,11 +203,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -203,11 +203,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -219,10 +219,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -219,10 +219,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
} }
...@@ -239,10 +239,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -239,10 +239,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -269,11 +269,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -269,11 +269,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -284,9 +284,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -284,9 +284,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
} }
...@@ -303,10 +303,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -303,10 +303,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC, NHWC,
...@@ -333,11 +333,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -333,11 +333,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage", BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage", BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
...@@ -349,9 +349,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -349,9 +349,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
} }
......
...@@ -26,16 +26,16 @@ void Register_FullyConnected(OperatorRegistry *op_registry) { ...@@ -26,16 +26,16 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
FullyConnectedOp<DeviceType::OPENCL, float>); FullyConnectedOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::OPENCL) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
FullyConnectedOp<DeviceType::OPENCL, half>); FullyConnectedOp<DeviceType::GPU, half>);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -43,7 +43,7 @@ void FCBenchmark( ...@@ -43,7 +43,7 @@ void FCBenchmark(
.Input("Bias") .Input("Bias")
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH; kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH;
BufferToImage<D, T>(&net, "Weight", "WeightImage", BufferToImage<D, T>(&net, "Weight", "WeightImage",
weight_type); weight_type);
...@@ -93,8 +93,8 @@ void FCBenchmark( ...@@ -93,8 +93,8 @@ void FCBenchmark(
#define BM_FC(N, H, W, C, OC) \ #define BM_FC(N, H, W, C, OC) \
BM_FC_MACRO(N, H, W, C, OC, float, CPU); \ BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
BM_FC_MACRO(N, H, W, C, OC, float, OPENCL); \ BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
BM_FC_MACRO(N, H, W, C, OC, half, OPENCL); BM_FC_MACRO(N, H, W, C, OC, half, GPU);
BM_FC(1, 16, 16, 32, 32); BM_FC(1, 16, 16, 32, 32);
BM_FC(1, 8, 8, 32, 1000); BM_FC(1, 8, 8, 32, 1000);
......
...@@ -51,7 +51,7 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -51,7 +51,7 @@ void Simple(const std::vector<index_t> &input_shape,
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Weight", "WeightImage", BufferToImage<D, float>(&net, "Weight", "WeightImage",
...@@ -104,14 +104,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) { ...@@ -104,14 +104,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
} }
TEST_F(FullyConnectedOpTest, SimpleOPENCL) { TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
Simple<DeviceType::OPENCL>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8}, Simple<DeviceType::GPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
{1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
{206}); {206});
Simple<DeviceType::OPENCL>( Simple<DeviceType::GPU>(
{1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10}, {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
{2}, {2, 3}, {1, 1, 1, 2}, {387, 3853}); {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
Simple<DeviceType::OPENCL>( Simple<DeviceType::GPU>(
{1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6}, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
{1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6}, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
...@@ -119,7 +119,7 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) { ...@@ -119,7 +119,7 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
} }
TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) { TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
Simple<DeviceType::OPENCL>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4}, Simple<DeviceType::GPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
{1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72}); {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
} }
...@@ -136,11 +136,11 @@ void Complex(const index_t batch, ...@@ -136,11 +136,11 @@ void Complex(const index_t batch,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, height * width * channels}); "Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel}); net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input") .Input("Input")
...@@ -159,11 +159,11 @@ void Complex(const index_t batch, ...@@ -159,11 +159,11 @@ void Complex(const index_t batch,
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage", BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
kernels::BufferType::WEIGHT_HEIGHT); kernels::BufferType::WEIGHT_HEIGHT);
BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
...@@ -176,9 +176,9 @@ void Complex(const index_t batch, ...@@ -176,9 +176,9 @@ void Complex(const index_t batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
...@@ -225,11 +225,11 @@ void TestWXFormat(const index_t batch, ...@@ -225,11 +225,11 @@ void TestWXFormat(const index_t batch,
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, height * width * channels}); "Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel}); net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input") .Input("Input")
...@@ -248,11 +248,11 @@ void TestWXFormat(const index_t batch, ...@@ -248,11 +248,11 @@ void TestWXFormat(const index_t batch,
expected.Copy(*net.GetOutput("Output")); expected.Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage", BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
kernels::BufferType::WEIGHT_WIDTH); kernels::BufferType::WEIGHT_WIDTH);
BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage", BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT); kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest") OpDefBuilder("FC", "FullyConnectedTest")
...@@ -264,9 +264,9 @@ void TestWXFormat(const index_t batch, ...@@ -264,9 +264,9 @@ void TestWXFormat(const index_t batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::OPENCL); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/fused_conv_2d.h"
namespace mace {
namespace ops {
void Register_FusedConv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
} // namespace mace
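The fused Conv2D file above still registers its OpenCL kernels with DeviceType::OPENCL. A minimal sketch of the same registration after the rename applied throughout this commit, assuming FusedConv2dOp keeps its float and half GPU specializations, would read:

// Sketch only: GPU-side registration for FusedConv2D, following the same
// OpKeyBuilder pattern this commit applies to Eltwise, FoldedBatchNorm and FC.
// Assumes FusedConv2dOp<DeviceType::GPU, T> specializations exist.
#ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
                        .Device(DeviceType::GPU)
                        .TypeConstraint<float>("T")
                        .Build(),
                    FusedConv2dOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
                        .Device(DeviceType::GPU)
                        .TypeConstraint<half>("T")
                        .Build(),
                    FusedConv2dOp<DeviceType::GPU, half>);
#endif  // MACE_ENABLE_OPENCL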
(Diffs for the remaining changed files are collapsed.)