Change DeviceType::OPENCL to DeviceType::GPU and fix some bugs.

3ba07e0b · liuqi · d98386d6 · 3ba07e0b · 3ba07e0b · 3ba07e0b
142 changed files
--- a/docs/developer/adding_a_new_op.md
+++ b/docs/developer/adding_a_new_op.md
@@ -24,16 +24,16 @@ void Register_Custom_Op(OperatorRegistry *op_registry) {
                    Custom_Op<DeviceType::CPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    Custom_Op<DeviceType::OPENCL, float>);
+                    Custom_Op<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("op_name")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    Custom_Op<DeviceType::OPENCL, half>);
+                    Custom_Op<DeviceType::GPU, half>);
 }

 }  // namespace ops

--- a/docs/user/introduction.md
+++ b/docs/user/introduction.md
@@ -316,7 +316,7 @@ unsigned char *model_data = mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data
 NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);

 //3. 声明设备类型
-DeviceType device_type = DeviceType::OPENCL;
+DeviceType device_type = DeviceType::GPU;

 //4. 定义输入输出名称数组
 std::vector<std::string> input_names = {...};
@@ -350,8 +350,8 @@ for (size_t i = 0; i < output_count; ++i) {
 //6. 创建MaceEngine对象
 mace::MaceEngine engine(&net_def, device_type, input_names, output_names);

-//7. 如果设备类型是OPENCL或HEXAGON，可以在此释放model_data
-if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
+//7. 如果设备类型是GPU或者HEXAGON，可以在此释放model_data
+if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
  mace::MACE_MODEL_TAG::UnloadModelData(model_data);
 }


--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -108,10 +108,8 @@ inline int64_t NowMicros() {
 DeviceType ParseDeviceType(const std::string &device_str) {
  if (device_str.compare("CPU") == 0) {
    return DeviceType::CPU;
-  } else if (device_str.compare("NEON") == 0) {
-    return DeviceType::NEON;
-  } else if (device_str.compare("OPENCL") == 0) {
-    return DeviceType::OPENCL;
+  } else if (device_str.compare("GPU") == 0) {
+    return DeviceType::GPU;
  } else if (device_str.compare("HEXAGON") == 0) {
    return DeviceType::HEXAGON;
  } else {
@@ -198,7 +196,7 @@ bool Run(MaceEngine *engine,
  return true;
 }

-DEFINE_string(device, "CPU", "Device [CPU|NEON|OPENCL]");
+DEFINE_string(device, "CPU", "Device [CPU|GPU|HEXAGON]");
 DEFINE_string(input_node, "input_node0,input_node1",
              "input nodes, separated by comma");
 DEFINE_string(output_node, "output_node0,output_node1",
@@ -279,7 +277,7 @@ int Main(int argc, char **argv) {
  mace::SetOpenMPThreadPolicy(
      FLAGS_omp_num_threads,
      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
-  if (device_type == DeviceType::OPENCL) {
+  if (device_type == DeviceType::GPU) {
    mace::SetGPUHints(
        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
@@ -347,7 +345,7 @@ int Main(int argc, char **argv) {
  LOG(INFO) << "Run init";
  std::unique_ptr<mace::MaceEngine> engine_ptr(
      new mace::MaceEngine(&net_def, device_type, input_names, output_names));
-  if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
+  if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
  }


--- a/mace/benchmark/model_throughput_test.cc
+++ b/mace/benchmark/model_throughput_test.cc
@@ -141,10 +141,8 @@ std::string FormatName(const std::string input) {
 DeviceType ParseDeviceType(const std::string &device_str) {
  if (device_str.compare("CPU") == 0) {
    return DeviceType::CPU;
-  } else if (device_str.compare("NEON") == 0) {
-    return DeviceType::NEON;
-  } else if (device_str.compare("OPENCL") == 0) {
-    return DeviceType::OPENCL;
+  } else if (device_str.compare("GPU") == 0) {
+    return DeviceType::GPU;
  } else if (device_str.compare("HEXAGON") == 0) {
    return DeviceType::HEXAGON;
  } else {
@@ -277,7 +275,7 @@ int Main(int argc, char **argv) {
      FLAGS_gpu_model_data_file.c_str());
  NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);

-  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names,
+  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::GPU, input_names,
                              output_names);
  mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);


--- a/mace/core/allocator.cc
+++ b/mace/core/allocator.cc
@@ -34,9 +34,8 @@ Allocator *GetDeviceAllocator(DeviceType type) {
 }

 MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
-MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
 #ifdef MACE_ENABLE_OPENCL
-MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
+MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
 #endif
 MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());


--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -189,7 +189,7 @@ class Image : public BufferBase {
 public:
  Image()
      : BufferBase(0),
-        allocator_(GetDeviceAllocator(OPENCL)),
+        allocator_(GetDeviceAllocator(GPU)),
        buf_(nullptr),
        mapped_buf_(nullptr) {}

@@ -198,7 +198,7 @@ class Image : public BufferBase {
            std::accumulate(
                shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
            GetEnumTypeSize(data_type)),
-        allocator_(GetDeviceAllocator(OPENCL)),
+        allocator_(GetDeviceAllocator(GPU)),
        mapped_buf_(nullptr) {
    shape_ = shape;
    data_type_ = data_type;

--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -193,7 +193,7 @@ MaceStatus MaceEngine::Impl::Run(
    input_tensors.push_back(input_tensor);
  }
  for (auto &output : *outputs) {
-    if (device_type_ == DeviceType::OPENCL) {
+    if (device_type_ == DeviceType::GPU) {
      MACE_CHECK(output.second.shape().size() == 4,
                 "The outputs' shape must be 4-dimension with NHWC format,"
                     " please use 1 to fill missing dimensions");
@@ -217,7 +217,7 @@ MaceStatus MaceEngine::Impl::Run(
 #endif

 #ifdef MACE_ENABLE_OPENCL
-  if (device_type_ == OPENCL) {
+  if (device_type_ == GPU) {
    OpenCLRuntime::Global()->SaveBuiltCLProgram();
  }
 #endif

--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -54,7 +54,7 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
    auto &op = *iter;
    MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
                        op->debug_def().type(), ")");
-    bool future_wait = (device_type_ == DeviceType::OPENCL &&
+    bool future_wait = (device_type_ == DeviceType::GPU &&
                        (run_metadata != nullptr ||
                         std::distance(iter, operators_.end()) == 1));


--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -88,7 +88,6 @@ extern void Register_Dequantize(OperatorRegistry *op_registry);
 extern void Register_Eltwise(OperatorRegistry *op_registry);
 extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
 extern void Register_FullyConnected(OperatorRegistry *op_registry);
-extern void Register_FusedConv2D(OperatorRegistry *op_registry);
 extern void Register_LocalResponseNorm(OperatorRegistry *op_registry);
 extern void Register_MatMul(OperatorRegistry *op_registry);
 extern void Register_Pad(OperatorRegistry *op_registry);
@@ -96,7 +95,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry);
 extern void Register_Proposal(OperatorRegistry *op_registry);
 extern void Register_PSROIAlign(OperatorRegistry *op_registry);
 extern void Register_Quantize(OperatorRegistry *op_registry);
-extern void Register_ReOrganize(OperatorRegistry *op_registry);
 extern void Register_Requantize(OperatorRegistry *op_registry);
 extern void Register_Reshape(OperatorRegistry *op_registry);
 extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
@@ -130,7 +128,6 @@ OperatorRegistry::OperatorRegistry() {
  ops::Register_Eltwise(this);
  ops::Register_FoldedBatchNorm(this);
  ops::Register_FullyConnected(this);
-  ops::Register_FusedConv2D(this);
  ops::Register_LocalResponseNorm(this);
  ops::Register_MatMul(this);
  ops::Register_Pad(this);
@@ -139,7 +136,6 @@ OperatorRegistry::OperatorRegistry() {
  ops::Register_PSROIAlign(this);
  ops::Register_Quantize(this);
  ops::Register_Requantize(this);
-  ops::Register_ReOrganize(this);
  ops::Register_Reshape(this);
  ops::Register_ResizeBilinear(this);
  ops::Register_Slice(this);

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -82,7 +82,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
  VLOG(3) << "Model data size: " << model_data_size;

  if (model_data_size > 0) {
-    if (type == DeviceType::CPU || type == DeviceType::NEON) {
+    if (type == DeviceType::CPU) {
      tensor_buffer_ = std::unique_ptr<Buffer>(
          new Buffer(GetDeviceAllocator(type),
                     model_data_ptr,
@@ -119,7 +119,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
    tensor_map_[const_tensor.name()] = std::move(tensor);
  }

-  if (type == DeviceType::CPU || type == DeviceType::OPENCL) {
+  if (type == DeviceType::CPU || type == DeviceType::GPU) {
    CreateOutputTensorBuffer(net_def, type);
  }
 }
@@ -149,7 +149,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
  }
  MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
  for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    if (device_type == DeviceType::OPENCL) {
+    if (device_type == DeviceType::GPU) {
      std::unique_ptr<BufferBase> image_buf(
          new Image({mem_block.x(), mem_block.y()}, dtype));
      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
@@ -170,7 +170,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
        std::unique_ptr<Tensor> tensor
            (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
        tensor->SetSourceOpName(op.name());
-        if (device_type == DeviceType::OPENCL) {
+        if (device_type == DeviceType::GPU) {
          VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
                  << " Mem: "  << mem_ids[i]
                  << " Image shape: "
@@ -191,7 +191,7 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
 }

 ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
-  if (device_type == CPU || device_type == NEON) {
+  if (device_type == CPU) {
    return host_scratch_buffer_.get();
  } else {
    return nullptr;

--- a/mace/examples/example.cc
+++ b/mace/examples/example.cc
@@ -22,7 +22,7 @@
 *          --input_file=input_data \
 *          --output_file=mace.out  \
 *          --model_data_file=model_data.data \
- *          --device=OPENCL
+ *          --device=GPU
 */
 #include <malloc.h>
 #include <stdint.h>
@@ -102,10 +102,8 @@ std::string FormatName(const std::string input) {
 DeviceType ParseDeviceType(const std::string &device_str) {
  if (device_str.compare("CPU") == 0) {
    return DeviceType::CPU;
-  } else if (device_str.compare("NEON") == 0) {
-    return DeviceType::NEON;
-  } else if (device_str.compare("OPENCL") == 0) {
-    return DeviceType::OPENCL;
+  } else if (device_str.compare("GPU") == 0) {
+    return DeviceType::GPU;
  } else if (device_str.compare("HEXAGON") == 0) {
    return DeviceType::HEXAGON;
  } else {
@@ -135,7 +133,7 @@ DEFINE_string(output_file,
 DEFINE_string(model_data_file,
              "",
              "model data file name, used when EMBED_MODEL_DATA set to 0");
-DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
+DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
 DEFINE_int32(round, 1, "round");
 DEFINE_int32(restart_round, 1, "restart round");
 DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
@@ -160,7 +158,7 @@ bool RunModel(const std::vector<std::string> &input_names,
  MaceStatus res = mace::SetOpenMPThreadPolicy(
      FLAGS_omp_num_threads,
      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
-  if (device_type == DeviceType::OPENCL) {
+  if (device_type == DeviceType::GPU) {
    mace::SetGPUHints(
        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
@@ -178,7 +176,7 @@ bool RunModel(const std::vector<std::string> &input_names,
  // Init model
  mace::MaceEngine engine(&net_def, device_type, input_names,
                          output_names);
-  if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
+  if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
  }


--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -162,7 +162,7 @@ class ActivationFunctor<DeviceType::CPU, float> {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-class ActivationFunctor<DeviceType::OPENCL, T> {
+class ActivationFunctor<DeviceType::GPU, T> {
 public:
  ActivationFunctor(ActivationType type, T relux_max_limit)
      : activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {}

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -93,7 +93,7 @@ struct AddNFunctor {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct AddNFunctor<DeviceType::OPENCL, T> {
+struct AddNFunctor<DeviceType::GPU, T> {
  void operator()(const std::vector<const Tensor *> &input_tensors,
                  Tensor *output_tensor,
                  StatsFuture *future);

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -128,7 +128,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
+struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
  BatchNormFunctor(const bool folded_constant,
                   const ActivationType activation,
                   const float relux_max_limit)

--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -65,7 +65,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct BiasAddFunctor<DeviceType::OPENCL, T> {
+struct BiasAddFunctor<DeviceType::GPU, T> {
  void operator()(const Tensor *input,
                  const Tensor *bias,
                  Tensor *output,

--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -44,7 +44,7 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
 };

 template <typename T>
-struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
+struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
  explicit BufferToImageFunctor(bool i2b = false)
      : BufferToImageFunctorBase(i2b) {}
  void operator()(Tensor *input,

--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -67,7 +67,7 @@ struct ChannelShuffleFunctor {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
+struct ChannelShuffleFunctor<DeviceType::GPU, T> {
  explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}

  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);

--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -93,7 +93,7 @@ struct ConcatFunctor : ConcatFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
+struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
  explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}

  void operator()(const std::vector<const Tensor *> &input_list,

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -615,7 +615,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
+struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
  Conv2dFunctor(const int *strides,
                const Padding &padding_type,
                const std::vector<int> &paddings,

--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -117,7 +117,7 @@ struct DepthToSpaceOpFunctor {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
+struct DepthToSpaceOpFunctor<DeviceType::GPU, T> {
  DepthToSpaceOpFunctor(const int block_size, bool d2s)
    : block_size_(block_size), d2s_(d2s) {}
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -297,7 +297,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
+struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
  : DepthwiseConv2dFunctorBase {
  DepthwiseConv2dFunctor(const int *strides,
                         const Padding padding_type,

--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -363,7 +363,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
+struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float value)

--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -88,7 +88,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
+struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
  FullyConnectedFunctor(const int /*BufferType*/ weight_type,
                        const ActivationType activation,
                        const float relux_max_limit)

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -75,7 +75,7 @@ struct MatMulFunctor {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct MatMulFunctor<DeviceType::OPENCL, T> {
+struct MatMulFunctor<DeviceType::GPU, T> {
  void operator()(const Tensor *A,
                  const Tensor *B,
                  Tensor *C,

--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          const Tensor *alpha,
                                                          Tensor *output,
                                                          StatsFuture *future) {
@@ -46,7 +46,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -124,7 +124,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
 }

-template struct ActivationFunctor<DeviceType::OPENCL, float>;
-template struct ActivationFunctor<DeviceType::OPENCL, half>;
+template struct ActivationFunctor<DeviceType::GPU, float>;
+template struct ActivationFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -22,7 +22,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void AddNFunctor<DeviceType::OPENCL, T>::operator()(
+void AddNFunctor<DeviceType::GPU, T>::operator()(
    const std::vector<const Tensor *> &input_tensors,
    Tensor *output_tensor,
    StatsFuture *future) {
@@ -58,7 +58,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -119,9 +119,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct AddNFunctor<DeviceType::OPENCL, float>;
+template struct AddNFunctor<DeviceType::GPU, float>;

-template struct AddNFunctor<DeviceType::OPENCL, half>;
+template struct AddNFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                         const Tensor *scale,
                                                         const Tensor *offset,
                                                         const Tensor *mean,
@@ -56,7 +56,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -130,7 +130,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
 }

-template struct BatchNormFunctor<DeviceType::OPENCL, float>;
-template struct BatchNormFunctor<DeviceType::OPENCL, half>;
+template struct BatchNormFunctor<DeviceType::GPU, float>;
+template struct BatchNormFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -22,7 +22,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                       const Tensor *bias,
                                                       Tensor *output,
                                                       StatsFuture *future) {
@@ -49,7 +49,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -115,7 +115,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
 }

-template struct BiasAddFunctor<DeviceType::OPENCL, float>;
-template struct BiasAddFunctor<DeviceType::OPENCL, half>;
+template struct BiasAddFunctor<DeviceType::GPU, float>;
+template struct BiasAddFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -20,7 +20,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
+void BufferToImageFunctor<DeviceType::GPU, T>::operator()(
    Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
  std::vector<size_t> image_shape;

@@ -95,7 +95,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DOUT_OF_RANGE_CHECK");
    if (!kernel_error_) {
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -177,8 +177,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
-template struct BufferToImageFunctor<DeviceType::OPENCL, half>;
+template struct BufferToImageFunctor<DeviceType::GPU, float>;
+template struct BufferToImageFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
+void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input,
    Tensor *output,
    StatsFuture *future) {
@@ -56,7 +56,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -108,8 +108,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
 }

 template
-struct ChannelShuffleFunctor<DeviceType::OPENCL, float>;
+struct ChannelShuffleFunctor<DeviceType::GPU, float>;
 template
-struct ChannelShuffleFunctor<DeviceType::OPENCL, half>;
+struct ChannelShuffleFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -50,7 +50,7 @@ static void Concat2(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();
@@ -132,7 +132,7 @@ static void ConcatN(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();
@@ -216,7 +216,7 @@ static void ConcatN(cl::Kernel *kernel,
 }

 template <typename T>
-void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
+void ConcatFunctor<DeviceType::GPU, T>::operator()(
    const std::vector<const Tensor *> &input_list,
    Tensor *output,
    StatsFuture *future) {
@@ -264,8 +264,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct ConcatFunctor<DeviceType::OPENCL, float>;
-template struct ConcatFunctor<DeviceType::OPENCL, half>;
+template struct ConcatFunctor<DeviceType::GPU, float>;
+template struct ConcatFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -67,7 +67,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         std::unique_ptr<BufferBase> *kernel_error);

 template <typename T>
-void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+void Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                      const Tensor *filter,
                                                      const Tensor *bias,
                                                      Tensor *output,
@@ -128,8 +128,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
 }

-template struct Conv2dFunctor<DeviceType::OPENCL, float>;
-template struct Conv2dFunctor<DeviceType::OPENCL, half>;
+template struct Conv2dFunctor<DeviceType::GPU, float>;
+template struct Conv2dFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -61,7 +61,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -58,7 +58,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -58,7 +58,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();

--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
+void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input, Tensor *output, StatsFuture *future) {
  const index_t batch = input->dim(0);
  const index_t input_height = input->dim(1);
@@ -86,7 +86,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -145,8 +145,8 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;
-template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, half>;
+template struct DepthToSpaceOpFunctor<DeviceType::GPU, float>;
+template struct DepthToSpaceOpFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -66,7 +66,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();
@@ -163,7 +163,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
 }

 template <typename T>
-void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
+void DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
@@ -215,8 +215,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
                  &kwg_size_, &kernel_error_);
 }

-template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
-template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, half>;
+template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
+template struct DepthwiseConv2dFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
+void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                                                       const Tensor *input1,
                                                       Tensor *output,
                                                       StatsFuture *future) {
@@ -74,7 +74,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -129,7 +129,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
  }
 }

-template struct EltwiseFunctor<DeviceType::OPENCL, float>;
-template struct EltwiseFunctor<DeviceType::OPENCL, half>;
+template struct EltwiseFunctor<DeviceType::GPU, float>;
+template struct EltwiseFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -76,7 +76,7 @@ void FCWXKernel(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();
@@ -202,7 +202,7 @@ void FCWTXKernel(cl::Kernel *kernel,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      *kernel_error = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      (*kernel_error)->Map(nullptr);
      *((*kernel_error)->mutable_data<char>()) = 0;
      (*kernel_error)->UnMap();
@@ -282,7 +282,7 @@ void FCWTXKernel(cl::Kernel *kernel,
 }  // namespace

 template <typename T>
-void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
+void FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input,
    const Tensor *weight,
    const Tensor *bias,
@@ -305,9 +305,9 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
+template struct FullyConnectedFunctor<DeviceType::GPU, float>;

-template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
+template struct FullyConnectedFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
+void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
                                                      const Tensor *B,
                                                      Tensor *C,
                                                      StatsFuture *future) {
@@ -53,7 +53,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -98,9 +98,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  }
 }

-template struct MatMulFunctor<DeviceType::OPENCL, float>;
+template struct MatMulFunctor<DeviceType::GPU, float>;

-template struct MatMulFunctor<DeviceType::OPENCL, half>;
+template struct MatMulFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
@@ -56,7 +56,7 @@ const bool BufferToImageOpImpl(Tensor *buffer,
  if (runtime->IsOutOfRangeCheckEnabled()) {
    built_options.emplace("-DOUT_OF_RANGE_CHECK");
    kernel_error = std::move(std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
    kernel_error->Map(nullptr);
    *(kernel_error->mutable_data<char>()) = 0;
    kernel_error->UnMap();
@@ -136,13 +136,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
  std::vector<index_t> buffer_shape = {batch, height, width, channels};
  Workspace ws;
  Tensor *buffer = ws.CreateTensor("Buffer",
-                                   GetDeviceAllocator(DeviceType::OPENCL),
+                                   GetDeviceAllocator(DeviceType::GPU),
                                   DataTypeToEnum<float>::v());
  buffer->Resize(buffer_shape);

  std::vector<size_t> image_shape;
  Tensor *image = ws.CreateTensor("Image",
-                                  GetDeviceAllocator(DeviceType::OPENCL),
+                                  GetDeviceAllocator(DeviceType::GPU),
                                  DataTypeToEnum<float>::v());
  CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
  image->ResizeImage(buffer->shape(), image_shape);

--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {

 template<typename T>
-void PadFunctor<DeviceType::OPENCL, T>::operator()(
+void PadFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input,
    Tensor *output,
    StatsFuture *future) {
@@ -59,7 +59,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+          new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -115,9 +115,9 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
 }

 template
-struct PadFunctor<DeviceType::OPENCL, float>;
+struct PadFunctor<DeviceType::GPU, float>;
 template
-struct PadFunctor<DeviceType::OPENCL, half>;
+struct PadFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -22,7 +22,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                       Tensor *output,
                                                       StatsFuture *future) {
  MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
@@ -50,7 +50,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -148,7 +148,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
 }

-template struct PoolingFunctor<DeviceType::OPENCL, float>;
-template struct PoolingFunctor<DeviceType::OPENCL, half>;
+template struct PoolingFunctor<DeviceType::GPU, float>;
+template struct PoolingFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
+void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input, Tensor *output, StatsFuture *future) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
@@ -50,7 +50,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -113,8 +113,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
-template struct ResizeBilinearFunctor<DeviceType::OPENCL, half>;
+template struct ResizeBilinearFunctor<DeviceType::GPU, float>;
+template struct ResizeBilinearFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -21,7 +21,7 @@ namespace mace {
 namespace kernels {

 template<typename T>
-void SliceFunctor<DeviceType::OPENCL, T>::operator()(
+void SliceFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input,
    const std::vector<Tensor *> &output_list,
    StatsFuture *future) {
@@ -51,7 +51,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -133,9 +133,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
 }

 template
-struct SliceFunctor<DeviceType::OPENCL, float>;
+struct SliceFunctor<DeviceType::GPU, float>;
 template
-struct SliceFunctor<DeviceType::OPENCL, half>;
+struct SliceFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -23,7 +23,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
+void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
                                                       Tensor *output,
                                                       StatsFuture *future) {
  const index_t batch = logits->dim(0);
@@ -49,7 +49,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -95,7 +95,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
  }
 }

-template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
-template struct SoftmaxFunctor<DeviceType::OPENCL, half>;
+template struct SoftmaxFunctor<DeviceType::GPU, float>;
+template struct SoftmaxFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -25,7 +25,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
+void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
    Tensor *space_tensor,
    const std::vector<index_t> &output_shape,
    Tensor *batch_tensor,
@@ -60,7 +60,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -120,8 +120,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
-template struct SpaceToBatchFunctor<DeviceType::OPENCL, half>;
+template struct SpaceToBatchFunctor<DeviceType::GPU, float>;
+template struct SpaceToBatchFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -22,7 +22,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
-void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
+void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {

  auto runtime = OpenCLRuntime::Global();
@@ -39,7 +39,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -117,7 +117,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
 }

 template <typename T>
-void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
+void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input_tensor,
    const Tensor *bias,
    Tensor *output_tensor,
@@ -138,7 +138,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    if (runtime->IsOutOfRangeCheckEnabled()) {
      built_options.emplace("-DOUT_OF_RANGE_CHECK");
      kernel_error_ = std::move(std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1)));
+            new Buffer(GetDeviceAllocator(DeviceType::GPU), 1)));
      kernel_error_->Map(nullptr);
      *(kernel_error_->mutable_data<char>()) = 0;
      kernel_error_->UnMap();
@@ -231,11 +231,11 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
  }
 }

-template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
-template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
+template struct WinogradTransformFunctor<DeviceType::GPU, float>;
+template struct WinogradTransformFunctor<DeviceType::GPU, half>;

-template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
-template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
+template struct WinogradInverseTransformFunctor<DeviceType::GPU, float>;
+template struct WinogradInverseTransformFunctor<DeviceType::GPU, half>;

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/pad.h
+++ b/mace/kernels/pad.h
@@ -61,21 +61,21 @@ struct PadFunctor : public PadFunctorBase {
    std::fill(output_ptr, output_ptr + output->size(), this->constant_value_);

    const index_t batch = input->dim(0);
-    const index_t height = input->dim(1);
-    const index_t width = input->dim(2);
-    const index_t channel = input->dim(3);
+    const index_t channel = input->dim(1);
+    const index_t height = input->dim(2);
+    const index_t width = input->dim(3);
 #pragma omp parallel for collapse(3)
    for (index_t b = 0; b < batch; ++b) {
-      for (index_t h = 0; h < height; ++h) {
-        for (index_t w = 0; w < width; ++w) {
-          const index_t in_offset = (((b * height + h) * width) + w) * channel;
+      for (index_t c = 0; c < channel; ++c) {
+        for (index_t h = 0; h < height; ++h) {
+          const index_t in_offset = (((b * channel + c) * height) + h) * width;
          const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1)
-              + (h + this->paddings_[2])) * output->dim(2)
-              + (w + this->paddings_[4])) * output->dim(3)
+              + (c + this->paddings_[2])) * output->dim(2)
+              + (h + this->paddings_[4])) * output->dim(3)
              + this->paddings_[6];
          memcpy(output_ptr + out_offset,
                 input_ptr + in_offset,
-                 channel * sizeof(T));
+                 width * sizeof(T));
        }
      }
    }
@@ -84,7 +84,7 @@ struct PadFunctor : public PadFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct PadFunctor<DeviceType::OPENCL, T> : PadFunctorBase {
+struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
  PadFunctor(const std::vector<int> &paddings,
             const float constant_value)
      : PadFunctorBase(paddings, constant_value) {}

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -261,7 +261,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
+struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
  PoolingFunctor(const PoolingType pooling_type,
                 const int *kernels,
                 const int *strides,

--- a/mace/kernels/reorganize.h
+++ b/mace/kernels/reorganize.h
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_KERNELS_REORGANIZE_H_
-#define MACE_KERNELS_REORGANIZE_H_
-
-#include <vector>
-
-#include "mace/core/future.h"
-#include "mace/core/tensor.h"
-
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
-namespace mace {
-namespace kernels {
-
-template <DeviceType D, typename T>
-struct ReOrganizeFunctor {
-  void operator()(const Tensor *input,
-                  const std::vector<index_t> &out_shape,
-                  Tensor *output,
-                  StatsFuture *future) {
-    const bool w2c = out_shape[3] > input->dim(3);
-
-    const index_t height = input->dim(1);
-    const index_t input_width = input->dim(2);
-    const index_t input_chan = input->dim(3);
-    const index_t output_width = output->dim(2);
-    const index_t output_chan = output->dim(3);
-
-    const T *input_ptr = input->data<T>();
-    T *output_ptr = output->mutable_data<T>();
-
-    if (w2c) {
-      MACE_CHECK((out_shape[3] % input->dim(3)) == 0);
-      const index_t multiplier = out_shape[3] / input->dim(3);
-#pragma omp parallel for collapse(4)
-      for (index_t n = 0; n < out_shape[0]; ++n) {
-        for (index_t h = 0; h < out_shape[1]; ++h) {
-          for (index_t w = 0; w < out_shape[2]; ++w) {
-            for (index_t c = 0; c < out_shape[3]; ++c) {
-              const index_t out_offset =
-                  ((n * height + h) * output_width + w)
-                      * output_chan + c;
-              const index_t in_w_idx = w + (c % multiplier) * output_width;
-              const index_t in_chan_idx = c / multiplier;
-              const index_t in_offset =
-                  ((n * height + h) * input_width + in_w_idx)
-                      * input_chan + in_chan_idx;
-              output_ptr[out_offset] = input_ptr[in_offset];
-            }
-          }
-        }
-      }
-    } else {
-      MACE_CHECK((input->dim(3) % out_shape[3]) == 0);
-      const index_t multiplier = input->dim(3) / out_shape[3];
-
-#pragma omp parallel for collapse(4)
-      for (index_t n = 0; n < out_shape[0]; ++n) {
-        for (index_t h = 0; h < out_shape[1]; ++h) {
-          for (index_t w = 0; w < out_shape[2]; ++w) {
-            for (index_t c = 0; c < out_shape[3]; ++c) {
-              const index_t out_offset =
-                  ((n * height + h) * output_width + w)
-                      * output_chan + c;
-              const index_t in_w_idx = w % input_width;
-              const index_t in_chan_idx = w / input_width + c * multiplier;
-              const index_t in_offset =
-                  ((n * height + h) * input_width + in_w_idx)
-                      * input_chan + in_chan_idx;
-              output_ptr[out_offset] = input_ptr[in_offset];
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace kernels
-}  // namespace mace
-
-#endif  // MACE_KERNELS_REORGANIZE_H_
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -179,7 +179,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
+struct ResizeBilinearFunctor<DeviceType::GPU, T>
  : ResizeBilinearFunctorBase {
  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
    : ResizeBilinearFunctorBase(size, align_corners) {}

--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -86,7 +86,7 @@ struct SliceFunctor : SliceFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
+struct SliceFunctor<DeviceType::GPU, T> : SliceFunctorBase {
  explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}

  void operator()(const Tensor *input,

--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -94,7 +94,7 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {

 #ifdef MACE_ENABLE_OPENCL
 template<typename T>
-struct SoftmaxFunctor<DeviceType::OPENCL, T> {
+struct SoftmaxFunctor<DeviceType::GPU, T> {
  void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);

  cl::Kernel kernel_;

--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -59,7 +59,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
+struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
  SpaceToBatchFunctor(const std::vector<int> &paddings,
                      const std::vector<int> &block_shape,
                      bool b2s)

--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -57,7 +57,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct WinogradTransformFunctor<DeviceType::OPENCL, T>
+struct WinogradTransformFunctor<DeviceType::GPU, T>
    : WinogradTransformFunctorBase {
  WinogradTransformFunctor(const Padding &padding_type,
                           const std::vector<int> &paddings)
@@ -111,7 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {

 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
-struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
+struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
    : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(const int batch,
                                  const int height,

--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -26,16 +26,16 @@ void Register_Activation(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    ActivationOp<DeviceType::OPENCL, float>);
+                    ActivationOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    ActivationOp<DeviceType::OPENCL, half>);
+                    ActivationOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -33,7 +33,7 @@ void ReluBenchmark(
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -45,7 +45,7 @@ void ReluBenchmark(
      .Output("Output")
      .AddStringArg("activation", "RELU")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -83,8 +83,8 @@ void ReluBenchmark(

 #define BM_RELU(N, C, H, W)                 \
  BM_RELU_MACRO(N, C, H, W, float, CPU);    \
-  BM_RELU_MACRO(N, C, H, W, float, OPENCL); \
-  BM_RELU_MACRO(N, C, H, W, half, OPENCL);
+  BM_RELU_MACRO(N, C, H, W, float, GPU); \
+  BM_RELU_MACRO(N, C, H, W, half, GPU);

 BM_RELU(1, 1, 512, 512);
 BM_RELU(1, 3, 128, 128);
@@ -107,7 +107,7 @@ void ReluxBenchmark(
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  }

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -151,8 +151,8 @@ void ReluxBenchmark(

 #define BM_RELUX(N, C, H, W)                 \
  BM_RELUX_MACRO(N, C, H, W, float, CPU);    \
-  BM_RELUX_MACRO(N, C, H, W, float, OPENCL); \
-  BM_RELUX_MACRO(N, C, H, W, half, OPENCL);
+  BM_RELUX_MACRO(N, C, H, W, float, GPU); \
+  BM_RELUX_MACRO(N, C, H, W, half, GPU);

 BM_RELUX(1, 1, 512, 512);
 BM_RELUX(1, 3, 128, 128);
@@ -171,7 +171,7 @@ void PreluBenchmark(
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -185,7 +185,7 @@ void PreluBenchmark(
      .Output("Output")
      .AddStringArg("activation", "PRELU")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
@@ -226,8 +226,8 @@ void PreluBenchmark(

 #define BM_PRELU(N, C, H, W)                 \
  BM_PRELU_MACRO(N, C, H, W, float, CPU);    \
-  BM_PRELU_MACRO(N, C, H, W, float, OPENCL); \
-  BM_PRELU_MACRO(N, C, H, W, half, OPENCL);
+  BM_PRELU_MACRO(N, C, H, W, float, GPU); \
+  BM_PRELU_MACRO(N, C, H, W, half, GPU);

 BM_PRELU(1, 1, 512, 512);
 BM_PRELU(1, 3, 128, 128);
@@ -250,7 +250,7 @@ void TanhBenchmark(
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  }

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -292,8 +292,8 @@ void TanhBenchmark(

 #define BM_TANH(N, C, H, W)                 \
  BM_TANH_MACRO(N, C, H, W, float, CPU);    \
-  BM_TANH_MACRO(N, C, H, W, float, OPENCL); \
-  BM_TANH_MACRO(N, C, H, W, half, OPENCL);
+  BM_TANH_MACRO(N, C, H, W, float, GPU); \
+  BM_TANH_MACRO(N, C, H, W, half, GPU);

 BM_TANH(1, 1, 512, 512);
 BM_TANH(1, 3, 128, 128);
@@ -316,7 +316,7 @@ void SigmoidBenchmark(
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  }

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -359,8 +359,8 @@ void SigmoidBenchmark(

 #define BM_SIGMOID(N, C, H, W)                 \
  BM_SIGMOID_MACRO(N, C, H, W, float, CPU);    \
-  BM_SIGMOID_MACRO(N, C, H, W, float, OPENCL); \
-  BM_SIGMOID_MACRO(N, C, H, W, half, OPENCL);
+  BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
+  BM_SIGMOID_MACRO(N, C, H, W, half, GPU);

 BM_SIGMOID(1, 1, 512, 512);
 BM_SIGMOID(1, 3, 128, 128);

--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -31,7 +31,7 @@ void TestSimpleRelu() {
      "Input", {2, 2, 2, 2},
      {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -68,7 +68,7 @@ void TestSimpleRelu() {
 TEST_F(ActivationOpTest, CPUSimpleRelu) { TestSimpleRelu<DeviceType::CPU>(); }

 TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
-  TestSimpleRelu<DeviceType::OPENCL>();
+  TestSimpleRelu<DeviceType::GPU>();
 }

 namespace {
@@ -79,7 +79,7 @@ void TestUnalignedSimpleRelu() {
  // Add input data
  net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -117,7 +117,7 @@ TEST_F(ActivationOpTest, CPUUnalignedSimpleRelu) {
 }

 TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
-  TestUnalignedSimpleRelu<DeviceType::OPENCL>();
+  TestUnalignedSimpleRelu<DeviceType::GPU>();
 }


@@ -131,7 +131,7 @@ void TestSimpleRelux() {
      "Input", {2, 2, 2, 2},
      {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -170,7 +170,7 @@ void TestSimpleRelux() {
 TEST_F(ActivationOpTest, CPUSimple) { TestSimpleRelux<DeviceType::CPU>(); }

 TEST_F(ActivationOpTest, OPENCLSimple) {
-  TestSimpleRelux<DeviceType::OPENCL>();
+  TestSimpleRelux<DeviceType::GPU>();
 }

 namespace {
@@ -183,7 +183,7 @@ void TestSimpleReluRelux() {
      "Input", {2, 2, 2, 2},
      {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -224,7 +224,7 @@ TEST_F(ActivationOpTest, CPUSimpleRelux) {
 }

 TEST_F(ActivationOpTest, OPENCLSimpleRelux) {
-  TestSimpleReluRelux<DeviceType::OPENCL>();
+  TestSimpleReluRelux<DeviceType::GPU>();
 }

 namespace {
@@ -238,7 +238,7 @@ void TestSimplePrelu() {
      {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
  net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
@@ -283,7 +283,7 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) {
 }

 TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
-  TestSimplePrelu<DeviceType::OPENCL>();
+  TestSimplePrelu<DeviceType::GPU>();
 }

 namespace {
@@ -296,7 +296,7 @@ void TestSimpleTanh() {
      "Input", {2, 2, 2, 2},
      {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -336,7 +336,7 @@ void TestSimpleTanh() {
 TEST_F(ActivationOpTest, CPUSimpleTanh) { TestSimpleTanh<DeviceType::CPU>(); }

 TEST_F(ActivationOpTest, OPENCLSimpleTanh) {
-  TestSimpleTanh<DeviceType::OPENCL>();
+  TestSimpleTanh<DeviceType::GPU>();
 }

 namespace {
@@ -349,7 +349,7 @@ void TestSimpleSigmoid() {
      "Input", {2, 2, 2, 2},
      {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -392,7 +392,7 @@ TEST_F(ActivationOpTest, CPUSimpleSigmoid) {
 }

 TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) {
-  TestSimpleSigmoid<DeviceType::OPENCL>();
+  TestSimpleSigmoid<DeviceType::GPU>();
 }

 }  // namespace test

--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -26,16 +26,16 @@ void Register_AddN(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    AddNOp<DeviceType::OPENCL, float>);
+                    AddNOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    AddNOp<DeviceType::OPENCL, half>);
+                    AddNOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -33,7 +33,7 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
    net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
  }

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    for (int i = 0; i < inputs; ++i) {
      BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
                          MakeString("InputImage", i).c_str(),
@@ -82,8 +82,8 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {

 #define BM_ADDN(INPUTS, N, H, W, C)                 \
  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, CPU);    \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, OPENCL); \
-  BM_ADDN_MACRO(INPUTS, N, H, W, C, half, OPENCL);
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, float, GPU); \
+  BM_ADDN_MACRO(INPUTS, N, H, W, C, half, GPU);

 BM_ADDN(2, 1, 256, 256, 32);
 BM_ADDN(2, 1, 128, 128, 32);

--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
@@ -64,7 +64,7 @@ void SimpleAdd3() {
                                  {-0.1582, 2, 3, 4, 5, 6});

  const int input_num = 4;
-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    // run on gpu
    for (int i = 0; i < input_num; ++i) {
      BufferToImage<D, half>(&net, MakeString("Input", i),
@@ -105,7 +105,7 @@ void SimpleAdd3() {
 }  // namespace

 TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
-TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::OPENCL>(); }
+TEST_F(AddnOpTest, GPUSimpleAdd3) { SimpleAdd3<DeviceType::GPU>(); }

 namespace {
 template <DeviceType D>
@@ -166,7 +166,7 @@ void RandomTest() {
 }
 }  // namespace

-TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
+TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::GPU>(); }

 }  // namespace test
 }  // namespace ops

--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -26,16 +26,16 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    BatchNormOp<DeviceType::OPENCL, float>);
+                    BatchNormOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    BatchNormOp<DeviceType::OPENCL, half>);
+                    BatchNormOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -32,7 +32,7 @@ void BatchNorm(
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -52,7 +52,7 @@ void BatchNorm(
      .AddFloatArg("epsilon", 1e-3)
      .Output("Output")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Scale", "ScaleImage",
@@ -107,8 +107,8 @@ void BatchNorm(

 #define BM_BATCH_NORM(N, C, H, W)                 \
  BM_BATCH_NORM_MACRO(N, C, H, W, float, CPU);    \
-  BM_BATCH_NORM_MACRO(N, C, H, W, float, OPENCL); \
-  BM_BATCH_NORM_MACRO(N, C, H, W, half, OPENCL);
+  BM_BATCH_NORM_MACRO(N, C, H, W, float, GPU); \
+  BM_BATCH_NORM_MACRO(N, C, H, W, half, GPU);

 BM_BATCH_NORM(1, 1, 512, 512);
 BM_BATCH_NORM(1, 3, 128, 128);

--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -49,7 +49,7 @@ void Simple() {

    net.RunOp(D);
    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Scale", "ScaleImage",
@@ -90,7 +90,7 @@ void Simple() {

 TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }

-TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
+TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }

 TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
  // generate random input
@@ -103,12 +103,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -139,15 +139,15 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                           kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("BatchNorm", "BatchNormTest")
@@ -162,14 +162,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {

  // Tuning
  setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  unsetenv("MACE_TUNING");

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
 }
@@ -186,12 +186,12 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -221,15 +221,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
                                          kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
                                          kernels::BufferType::ARGUMENT);

  OpDefBuilder("BatchNorm", "BatchNormTest")
@@ -245,14 +245,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {

  // Tuning
  setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  unsetenv("MACE_TUNING");

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
 }
@@ -269,12 +269,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -304,15 +304,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                           kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Mean", "MeanImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Var", "VarImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("BatchNorm", "BatchNormTest")
@@ -327,14 +327,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {

  // tuning
  setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  unsetenv("MACE_TUNING");

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
 }
@@ -351,12 +351,12 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -386,15 +386,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
                                          kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Mean", "MeanImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Var", "VarImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
                                          kernels::BufferType::ARGUMENT);

  OpDefBuilder("BatchNorm", "BatchNormTest")
@@ -410,14 +410,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {

  // tuning
  setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  unsetenv("MACE_TUNING");

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
 }

--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -20,15 +20,15 @@ namespace ops {
 void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    BatchToSpaceNDOp<DeviceType::OPENCL, float>);
+                    BatchToSpaceNDOp<DeviceType::GPU, float>);
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    BatchToSpaceNDOp<DeviceType::OPENCL, half>);
+                    BatchToSpaceNDOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -63,7 +63,7 @@ void BMBatchToSpace(
  BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)

 #define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
-  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, OPENCL);
+  BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);

 BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
 BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);

--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -26,16 +26,16 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    BiasAddOp<DeviceType::OPENCL, float>);
+                    BiasAddOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    BiasAddOp<DeviceType::OPENCL, half>);
+                    BiasAddOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -31,7 +31,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -44,7 +44,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
      .Input("Bias")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Bias", "BiasImage",
@@ -84,8 +84,8 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {

 #define BM_BIAS_ADD(N, C, H, W)                 \
  BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU);    \
-  BM_BIAS_ADD_MACRO(N, C, H, W, float, OPENCL); \
-  BM_BIAS_ADD_MACRO(N, C, H, W, half, OPENCL);
+  BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
+  BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);

 BM_BIAS_ADD(1, 1, 512, 512);
 BM_BIAS_ADD(1, 3, 128, 128);

--- a/mace/ops/bias_add_test.cc
+++ b/mace/ops/bias_add_test.cc
@@ -47,7 +47,7 @@ void BiasAddSimple() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Bias", "BiasImage",
@@ -80,7 +80,7 @@ void BiasAddSimple() {
 TEST_F(BiasAddOpTest, BiasAddSimpleCPU) { BiasAddSimple<DeviceType::CPU>(); }

 TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) {
-  BiasAddSimple<DeviceType::OPENCL>();
+  BiasAddSimple<DeviceType::GPU>();
 }

 TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
@@ -94,9 +94,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true);
+  net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -123,9 +123,9 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                           kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("BiasAdd", "BiasAddTest")
@@ -135,10 +135,10 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
      .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
 }
@@ -154,9 +154,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true);
+  net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -182,9 +182,9 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                           kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("BiasAdd", "BiasAddTest")
@@ -194,10 +194,10 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
      .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
 }

--- a/mace/ops/buffer_to_image.cc
+++ b/mace/ops/buffer_to_image.cc
@@ -19,16 +19,16 @@ namespace ops {

 void Register_BufferToImage(OperatorRegistry *op_registry) {
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    BufferToImageOp<DeviceType::OPENCL, float>);
+                    BufferToImageOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    BufferToImageOp<DeviceType::OPENCL, half>);
+                    BufferToImageOp<DeviceType::GPU, half>);
 }

 }  // namespace ops

--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -54,73 +54,73 @@ void TestBidirectionTransform(const int type,
 }  // namespace

 TEST(BufferToImageTest, ArgSmall) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {1});
 }

 TEST(BufferToImageTest, ArgHalfSmall) {
-  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+  TestBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT, {11});
 }

 TEST(BufferToImageTest, ArgMedia) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {11});
 }

 TEST(BufferToImageTest, ArgLarge) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {256});
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::ARGUMENT, {256});
 }

 TEST(BufferToImageTest, InputSmallSingleChannel) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                      {1, 2, 3, 1});
 }

 TEST(BufferToImageTest, InputSmallMultipleChannel) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                      {1, 2, 3, 3});
 }

 TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                      {3, 2, 3, 3});
 }

 TEST(BufferToImageTest, InputMedia) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                      {3, 13, 17, 128});
 }

 TEST(BufferToImageTest, InputLarge) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::IN_OUT_CHANNEL,
                                                      {3, 64, 64, 256});
 }

 TEST(BufferToImageTest, Filter1x1Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
                                                      {1, 1, 3, 5});
 }

 TEST(BufferToImageTest, Filter1x1Media) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
                                                      {1, 1, 13, 17});
 }

 TEST(BufferToImageTest, Filter1x1Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
                                                      {1, 1, 128, 512});
 }

 TEST(BufferToImageTest, Filter3x3Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
                                                      {3, 3, 3, 5});
 }

 TEST(BufferToImageTest, Filter3x3Meida) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
                                                      {3, 3, 13, 17});
 }

 TEST(BufferToImageTest, Filter3x3Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+  TestBidirectionTransform<DeviceType::GPU, float>(kernels::CONV2D_FILTER,
                                                      {3, 3, 128, 256});
 }

@@ -158,7 +158,7 @@ void TestDiffTypeBidirectionTransform(const int type,
 }  // namespace

 TEST(BufferToImageTest, ArgFloatToHalfSmall) {
-  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
+  TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(kernels::ARGUMENT,
                                                             {11});
 }

@@ -203,7 +203,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
  const unsigned char input_data[] = {
      0xCD, 0x3C, 0x33, 0x40,
  };
-  TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
+  TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
      kernels::ARGUMENT, {2}, input_data);
 }


--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -26,16 +26,16 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    ChannelShuffleOp<DeviceType::OPENCL, float>);
+                    ChannelShuffleOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    ChannelShuffleOp<DeviceType::OPENCL, half>);
+                    ChannelShuffleOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/channel_shuffle.h
+++ b/mace/ops/channel_shuffle.h
@@ -35,7 +35,7 @@ class ChannelShuffleOp : public Operator<D, T> {
    const Tensor *input = this->Input(INPUT);
    Tensor *output = this->Output(OUTPUT);
    int channels;
-    if (D == OPENCL) {
+    if (D == GPU) {
      channels = input->dim(3);
    } else if (D == CPU) {
      channels = input->dim(1);

--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -31,7 +31,7 @@ void ChannelShuffle(
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -42,7 +42,7 @@ void ChannelShuffle(
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -82,8 +82,8 @@ void ChannelShuffle(

 #define BM_CHANNEL_SHUFFLE(N, C, H, W, G)                 \
  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU);    \
-  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, OPENCL); \
-  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, OPENCL);
+  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, GPU);    \
+  BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, GPU);

 BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
 BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);

--- a/mace/ops/channel_shuffle_test.cc
+++ b/mace/ops/channel_shuffle_test.cc
@@ -60,11 +60,11 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddInputFromArray<DeviceType::OPENCL, float>(
+  net.AddInputFromArray<DeviceType::GPU, float>(
      "Input", {1, 1, 2, 16},
      {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                           kernels::BufferType::IN_OUT_CHANNEL);

  OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
@@ -74,10 +74,10 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
      .Finalize(net.NewOperatorDef());

  // Run
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

  // Transfer output
-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
                                           kernels::BufferType::IN_OUT_CHANNEL);

  // Check

--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -26,16 +26,16 @@ void Register_Concat(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    ConcatOp<DeviceType::OPENCL, float>);
+                    ConcatOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    ConcatOp<DeviceType::OPENCL, half>);
+                    ConcatOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -74,12 +74,12 @@ void OpenclConcatHelper(int iters,
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0);
-  net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1);
+  net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
+  net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);

-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImage0",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
                                       kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImage1",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
                                       kernels::BufferType::IN_OUT_CHANNEL);
  OpDefBuilder("Concat", "ConcatBM")
      .Input("InputImage0")
@@ -91,7 +91,7 @@ void OpenclConcatHelper(int iters,

  // Warm-up
  for (int i = 0; i < 5; ++i) {
-    net.RunOp(DeviceType::OPENCL);
+    net.RunOp(DeviceType::GPU);
  }

  const int64_t tot =
@@ -101,7 +101,7 @@ void OpenclConcatHelper(int iters,
  testing::BytesProcessed(tot * sizeof(T));
  mace::testing::StartTiming();
  while (iters--) {
-    net.RunOp(DeviceType::OPENCL);
+    net.RunOp(DeviceType::GPU);
  }
 }
 }  // namespace

--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -171,9 +171,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
    concat_axis_size += shapes[i][axis];
    GenerateRandomRealTypeData(shapes[i], &inputs[i]);
    input_ptrs[i] = inputs[i].data();
-    net.AddInputFromArray<DeviceType::OPENCL, float>(input_name,
+    net.AddInputFromArray<DeviceType::GPU, float>(input_name,
                                                     shapes[i], inputs[i]);
-    BufferToImage<DeviceType::OPENCL, T>(&net, input_name, image_name,
+    BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
                                         kernels::BufferType::IN_OUT_CHANNEL);
  }

@@ -188,9 +188,9 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
      .Finalize(net.NewOperatorDef());

  // Run
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
                                           kernels::BufferType::IN_OUT_CHANNEL);

  // Check

--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -26,16 +26,16 @@ void Register_Conv2D(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    Conv2dOp<DeviceType::OPENCL, float>);
+                    Conv2dOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    Conv2dOp<DeviceType::OPENCL, half>);
+                    Conv2dOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -16,6 +16,7 @@
 #define MACE_OPS_CONV_2D_H_

 #include <memory>
+#include <string>

 #include "mace/core/operator.h"
 #include "mace/kernels/conv_2d.h"
@@ -33,8 +34,10 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
                 this->padding_type_,
                 this->paddings_,
                 this->dilations_.data(),
-                 kernels::ActivationType::NOOP,
-                 0.0f,
+                 kernels::StringToActivationType(
+                     OperatorBase::GetSingleArgument<std::string>("activation",
+                                                                  "NOOP")),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
                 static_cast<bool>(OperatorBase::GetSingleArgument<int>(
                     "is_filter_transformed", false)),
                 ws->GetScratchBuffer(D)) {}

--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -47,7 +47,7 @@ void Conv2d(int iters,
                                 {output_channels, channels, kernel_h,
                                  kernel_w});
    net.AddRandomInput<D, float>("Bias", {output_channels});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
    net.AddRandomInput<D, float>("Filter",
                                 {kernel_h, kernel_w, output_channels,
@@ -68,7 +68,7 @@ void Conv2d(int iters,
      .AddIntsArg("dilations", {dilation, dilation})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -138,8 +138,8 @@ void Conv2d(int iters,

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC)                 \
  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU);    \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU);    \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU);




--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -63,7 +63,7 @@ void TestNHWCSimple3x3VALID() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -134,7 +134,7 @@ void TestNHWCSimple3x3SAME() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -176,8 +176,8 @@ TEST_F(Conv2dOpTest, CPUSimple) {
 }

 TEST_F(Conv2dOpTest, OPENCLSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
-  TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
+  TestNHWCSimple3x3VALID<DeviceType::GPU, float>();
+  TestNHWCSimple3x3SAME<DeviceType::GPU, float>();
 }

 namespace {
@@ -219,7 +219,7 @@ void TestNHWCSimple3x3WithoutBias() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -255,7 +255,7 @@ TEST_F(Conv2dOpTest, CPUWithoutBias) {
 }

 TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
+  TestNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
 }

 namespace {
@@ -301,7 +301,7 @@ void TestNHWCCombined3x3() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -341,9 +341,164 @@ TEST_F(Conv2dOpTest, CPUStride2) {
 }

 TEST_F(Conv2dOpTest, OPENCLStride2) {
-  TestNHWCCombined3x3<DeviceType::OPENCL, float>();
+  TestNHWCCombined3x3<DeviceType::GPU, float>();
 }

+namespace {
+template<DeviceType D, typename T>
+void TestFusedNHWCSimple3x3VALID() {
+  OpsTestNet net;
+  // Add input data
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 3, 3, 2},
+      {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
+  net.AddInputFromArray<D, float>(
+      "Filter", {3, 3, 1, 2},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
+  net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
+
+  if (D == DeviceType::CPU) {
+    net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                    NHWC,
+                                                    "InputNCHW",
+                                                    NCHW);
+    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
+                                                    HWOI,
+                                                    "FilterOIHW",
+                                                    OIHW);
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputNCHW")
+        .Input("FilterOIHW")
+        .Input("Bias")
+        .Output("OutputNCHW")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .AddStringArg("activation", "RELU")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
+                                                    NCHW,
+                                                    "Output",
+                                                    NHWC);
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, T>(&net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, T>(&net, "Filter", "FilterImage",
+                        kernels::BufferType::CONV2D_FILTER);
+    BufferToImage<D, T>(&net, "Bias", "BiasImage",
+                        kernels::BufferType::ARGUMENT);
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .AddStringArg("activation", "RELU")
+        .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
+}
+template<DeviceType D, typename T>
+void TestFusedNHWCSimple3x3WithoutBias() {
+  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 3, 3, 2},
+      {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
+  net.AddInputFromArray<D, float>(
+      "Filter", {3, 3, 1, 2},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
+
+  if (D == DeviceType::CPU) {
+    net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                    NHWC,
+                                                    "InputNCHW",
+                                                    NCHW);
+    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
+                                                    HWOI,
+                                                    "FilterOIHW",
+                                                    OIHW);
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputNCHW")
+        .Input("FilterOIHW")
+        .Output("OutputNCHW")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .AddStringArg("activation", "RELU")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
+                                                    NCHW,
+                                                    "Output",
+                                                    NHWC);
+  } else if (D == DeviceType::GPU) {
+    BufferToImage<D, T>(&net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, T>(&net, "Filter", "FilterImage",
+                        kernels::BufferType::CONV2D_FILTER);
+
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .AddStringArg("activation", "RELU")
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    // Transfer output
+    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  // Check
+  auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
+}
+
+}  // namespace
+
+TEST_F(Conv2dOpTest, FusedCPUSimple) {
+  TestFusedNHWCSimple3x3VALID<DeviceType::CPU, float>();
+  TestFusedNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
+}
+
+TEST_F(Conv2dOpTest, FusedOPENCLSimple) {
+  TestFusedNHWCSimple3x3VALID<DeviceType::GPU, float>();
+  TestFusedNHWCSimple3x3WithoutBias<DeviceType::GPU, float>();
+}
+
+
 namespace {
 template<DeviceType D>
 void TestConv1x1() {
@@ -389,7 +544,7 @@ void TestConv1x1() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Filter", "FilterImage",
@@ -431,7 +586,7 @@ void TestConv1x1() {

 TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }

-TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
+TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); }

 namespace {
 template<DeviceType D, typename T>
@@ -524,18 +679,18 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
 }  // namespace

 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1);
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 2);
+  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 1);
+  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 2);
 }

 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 1);
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 2);
+  TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 1);
+  TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 2);
 }

 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, 3);
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
+  TestComplexConvNxNS12<DeviceType::GPU, float>({31, 113, 13, 17}, 3);
+  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 32, 13, 17}, 4);
 }

 namespace {
@@ -644,52 +799,52 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
 }  // namespace

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 1, 32, 64},
                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {3, 3, 32, 64},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {3, 3, 32, 64},
                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {15, 1, 256, 2},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2},
                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 15, 256, 2},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2},
                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 64},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {7, 7, 3, 64},
                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {1, 1, 5, 7},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {1, 1, 5, 7},
                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {3, 3, 5, 7},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({107, 113}, {3, 3, 5, 7},
                                                {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {5, 5, 16, 16},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {5, 5, 16, 16},
                                                {2, 2});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {7, 7, 16, 16},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {7, 7, 16, 16},
                                                {2, 2});
 }

 TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, {7, 7, 16, 16},
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({63, 67}, {7, 7, 16, 16},
                                                {4, 4});
 }

@@ -787,15 +942,115 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
 }  // namespace

 TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
-  TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2);
+  TestDilationConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, 2);
 }

 TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
-  TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
+  TestDilationConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, 4);
 }

 TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
-  TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
+  TestDilationConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, 4);
+}
+
+namespace {
+template<DeviceType D>
+void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
+                               const std::vector<index_t> &filter_shape,
+                               const std::vector<int> &dilations) {
+  testing::internal::LogToStderr();
+  auto func = [&](int stride_h, int stride_w, Padding type) {
+    srand(time(NULL));
+
+    // Derive tensor dimensions; random inputs are generated below
+    index_t batch = 1;
+    index_t height = image_shape[0];
+    index_t width = image_shape[1];
+    index_t kernel_h = filter_shape[0];
+    index_t kernel_w = filter_shape[1];
+    index_t output_channels = filter_shape[2];
+    index_t input_channels = filter_shape[3];
+
+    OpsTestNet net;
+
+    // Add input data
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, height, width, input_channels});
+    net.AddRandomInput<D, float>(
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
+    net.AddRandomInput<D, float>("Bias", {output_channels});
+
+    net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                    NHWC,
+                                                    "InputNCHW",
+                                                    NCHW);
+    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
+                                                    HWOI,
+                                                    "FilterOIHW",
+                                                    OIHW);
+
+    // Construct graph
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputNCHW")
+        .Input("FilterOIHW")
+        .Input("Bias")
+        .Output("OutputNCHW")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", dilations)
+        .Finalize(net.NewOperatorDef());
+
+    // run on cpu
+    net.RunOp();
+
+    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
+                                                    NCHW,
+                                                    "Output",
+                                                    NHWC);
+    // Snapshot the CPU result as the reference for the GPU run
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // run on gpu
+    BufferToImage<D, half>(&net, "Input", "InputImage",
+                           kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, half>(&net, "Filter", "FilterImage",
+                           kernels::BufferType::CONV2D_FILTER);
+    BufferToImage<D, half>(&net, "Bias", "BiasImage",
+                           kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", dilations)
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run on device
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
+                            kernels::BufferType::IN_OUT_CHANNEL);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
+                            1e-2, 1e-1);
+  };
+
+  func(1, 1, VALID);
+  func(1, 1, SAME);
+}
+}  // namespace
+
+TEST_F(Conv2dOpTest, OPENCLHalf7X7AtrousConvD2) {
+  TestGeneralHalfAtrousConv<DeviceType::GPU>({32, 32}, {7, 7, 16, 3},
+                                             {2, 2});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) {
+  TestGeneralHalfAtrousConv<DeviceType::GPU>({63, 71}, {15, 15, 16, 16},
+                                             {4, 4});  // D4 => dilation 4
 }

 namespace {
@@ -887,16 +1142,16 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
 }  // namespace

 TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
-  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});
+  TestArbitraryPadConvNxN<DeviceType::GPU, float>({32, 32, 32, 64}, {1, 1});
 }

 TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
-  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
+  TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16},
                                                     {2, 2});
 }

 TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
-  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
+  TestArbitraryPadConvNxN<DeviceType::GPU, float>({107, 113, 5, 7}, {4, 4});
 }

 }  // namespace test

--- a/mace/ops/core_test.cc
+++ b/mace/ops/core_test.cc
@@ -32,7 +32,7 @@ TEST(CoreTest, INIT_MODE) {
      .Finalize(&op_defs[op_defs.size() - 1]);

  Tensor *input =
-      ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::OPENCL),
+      ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU),
                      DataTypeToEnum<float>::v());
  input->Resize({1, 3, 3, 3});
  {
@@ -54,13 +54,13 @@ TEST(CoreTest, INIT_MODE) {
  }
  std::shared_ptr<OperatorRegistry> op_registry(new OperatorRegistry());
  auto net =
-      CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL, NetMode::INIT);
+      CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT);
  net->Run();

  EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
  EXPECT_TRUE(ws.GetTensor("Output") == nullptr);

-  net = CreateNet(op_registry, net_def, &ws, DeviceType::OPENCL);
+  net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU);
  net->Run();
  EXPECT_TRUE(ws.GetTensor("Output") != nullptr);


--- a/mace/ops/depth_to_space.cc
+++ b/mace/ops/depth_to_space.cc
@@ -26,16 +26,16 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    DepthToSpaceOp<DeviceType::OPENCL, float>);
+                    DepthToSpaceOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    DepthToSpaceOp<DeviceType::OPENCL, half>);
+                    DepthToSpaceOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/depth_to_space.h
+++ b/mace/ops/depth_to_space.h
@@ -40,7 +40,7 @@ class DepthToSpaceOp : public Operator<D, T> {
    int input_depth;
    if (D == CPU) {
      input_depth = input->dim(1);
-    } else if (D == OPENCL) {
+    } else if (D == GPU) {
      input_depth = input->dim(3);
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/ops/depth_to_space_benchmark.cc
+++ b/mace/ops/depth_to_space_benchmark.cc
@@ -31,7 +31,7 @@ void DepthToSpace(
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -42,7 +42,7 @@ void DepthToSpace(
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -82,8 +82,8 @@ void DepthToSpace(

 #define BM_DEPTH_TO_SPACE(N, C, H, W, G)                 \
  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, CPU);    \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, OPENCL); \
-  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, OPENCL);
+  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, float, GPU); \
+  BM_DEPTH_TO_SPACE_MACRO(N, C, H, W, G, half, GPU);

 BM_DEPTH_TO_SPACE(1, 64, 64, 64, 4);
 BM_DEPTH_TO_SPACE(1, 64, 128, 128, 4);

--- a/mace/ops/depth_to_space_test.cc
+++ b/mace/ops/depth_to_space_test.cc
@@ -65,8 +65,8 @@ void RunDepthToSpace(const bool d2s,
  }


-  if (D == DeviceType::OPENCL) {
-    ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
+  if (D == DeviceType::GPU) {
+    ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
        kernels::BufferType::IN_OUT_CHANNEL);
  }
  auto expected = CreateTensor<float>(expected_shape, expected_data);
@@ -88,7 +88,7 @@ TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_CPU) {
 }

 TEST_F(SpaceToDepthOpTest, Input2x4x4_B2_OPENCL) {
-  RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 4, 4},
+  RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 4, 4},
      {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
       8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
      2,
@@ -110,7 +110,7 @@ TEST_F(SpaceToDepthOpTest, Input2x2x4_B2_CPU) {
 }

 TEST_F(SpaceToDepthOpTest, Input4x4x1_B2_OPENCL) {
-  RunDepthToSpace<DeviceType::OPENCL>(false, {1, 2, 2, 4},
+  RunDepthToSpace<DeviceType::GPU>(false, {1, 2, 2, 4},
      {1, 2, 3, 4, 5, 6, 7, 8,
       9, 10, 11, 12, 13, 14, 15, 16},
      2,
@@ -132,7 +132,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
 }

 TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
-  RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 2, 16},
+  RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 2, 16},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      2,
@@ -152,7 +152,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_CPU) {
 }

 TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
-  RunDepthToSpace<DeviceType::OPENCL>(true, {1, 1, 1, 16},
+  RunDepthToSpace<DeviceType::GPU>(true, {1, 1, 1, 16},
      {1,  2,  3,  4,  5,  6,  7,  8,
       9,  10, 11, 12, 13, 14, 15, 16},
      2,
@@ -165,7 +165,7 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
 TEST_F(DepthToSpaceOpTest, InputLarger_B2_OPENCL) {
  const std::vector<float > in = std::vector<float >(192 * 192 *128, 1.0);

-  RunDepthToSpace<DeviceType::OPENCL>(true, {1, 192, 192, 128},
+  RunDepthToSpace<DeviceType::GPU>(true, {1, 192, 192, 128},
                                      in,
                                      2,
                                      {1, 384, 384, 32},
@@ -234,19 +234,19 @@ void RandomTest(const bool d2s, const int block_size,
 }  // namespace

 TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
-  RandomTest<DeviceType::OPENCL, float>(true, 2, {1, 192, 192, 128});
+  RandomTest<DeviceType::GPU, float>(true, 2, {1, 192, 192, 128});
 }

 TEST_F(DepthToSpaceOpTest, OPENCLRandomHalf) {
-RandomTest<DeviceType::OPENCL, half>(true, 2, {1, 192, 192, 128});
+RandomTest<DeviceType::GPU, half>(true, 2, {1, 192, 192, 128});
 }

 TEST_F(SpaceToDepthOpTest, OPENCLRandomFloat) {
-RandomTest<DeviceType::OPENCL, float>(false, 2, {1, 384, 384, 32});
+RandomTest<DeviceType::GPU, float>(false, 2, {1, 384, 384, 32});
 }

 TEST_F(SpaceToDepthOpTest, OPENCLRandomHalf) {
-RandomTest<DeviceType::OPENCL, half>(false, 2, {1, 384, 384, 32});
+RandomTest<DeviceType::GPU, half>(false, 2, {1, 384, 384, 32});
 }

 }  // namespace test

--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -26,16 +26,16 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    DepthwiseConv2dOp<DeviceType::OPENCL, float>);
+                    DepthwiseConv2dOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    DepthwiseConv2dOp<DeviceType::OPENCL, half>);
+                    DepthwiseConv2dOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/depthwise_conv2d_benchmark.cc
+++ b/mace/ops/depthwise_conv2d_benchmark.cc
@@ -46,7 +46,7 @@ void DepthwiseConv2d(int iters,
    net.AddRandomInput<D, float>(
      "Filter", {multiplier, input_channels, kernel_h, kernel_w});
    net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input",
                                 {batch, height, width, input_channels});
    net.AddRandomInput<D, float>(
@@ -67,7 +67,7 @@ void DepthwiseConv2d(int iters,
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -134,8 +134,8 @@ void DepthwiseConv2d(int iters,

 #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M)                 \
  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU);    \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU);

 BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
 BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);

--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -59,7 +59,7 @@ void SimpleValidTest() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -105,11 +105,11 @@ TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
 }

 TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) {
-  SimpleValidTest<DeviceType::OPENCL, float>();
+  SimpleValidTest<DeviceType::GPU, float>();
 }

 TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
-  SimpleValidTest<DeviceType::OPENCL, half>();
+  SimpleValidTest<DeviceType::GPU, half>();
 }

 namespace {
@@ -184,7 +184,7 @@ void ComplexValidTest() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(&net, "Filter", "FilterImage",
@@ -245,11 +245,11 @@ TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
 }

 TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
-  ComplexValidTest<DeviceType::OPENCL, float>();
+  ComplexValidTest<DeviceType::GPU, float>();
 }

 TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
-  ComplexValidTest<DeviceType::OPENCL, half>();
+  ComplexValidTest<DeviceType::GPU, half>();
 }

 namespace {
@@ -267,12 +267,12 @@ void TestNxNS12(const index_t height, const index_t width) {
    OpsTestNet net;

    // Add input data
-    net.AddRandomInput<DeviceType::OPENCL, float>("Input",
+    net.AddRandomInput<DeviceType::GPU, float>("Input",
                                                  {batch, height, width,
                                                   input_channels});
-    net.AddRandomInput<DeviceType::OPENCL, float>(
+    net.AddRandomInput<DeviceType::GPU, float>(
      "Filter", {kernel_h, kernel_w, input_channels, multiplier});
-    net.AddRandomInput<DeviceType::OPENCL, float>("Bias",
+    net.AddRandomInput<DeviceType::GPU, float>("Bias",
                                                  {multiplier
                                                     * input_channels});

@@ -307,11 +307,11 @@ void TestNxNS12(const index_t height, const index_t width) {
    Tensor expected;
    expected.Copy(*net.GetOutput("Output"));

-    BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
+    BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
                                         kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<DeviceType::OPENCL, T>(&net, "Filter", "FilterImage",
+    BufferToImage<DeviceType::GPU, T>(&net, "Filter", "FilterImage",
                                         kernels::BufferType::DW_CONV2D_FILTER);
-    BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage",
+    BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
                                         kernels::BufferType::ARGUMENT);
    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
      .Input("InputImage")
@@ -324,10 +324,10 @@ void TestNxNS12(const index_t height, const index_t width) {
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

-    net.RunOp(DeviceType::OPENCL);
+    net.RunOp(DeviceType::GPU);

    // Transfer output
-    ImageToBuffer<DeviceType::OPENCL, float>(&net,
+    ImageToBuffer<DeviceType::GPU, float>(&net,
                                         "OutputImage",
                                         "DeviceOutput",
                                         kernels::BufferType::IN_OUT_CHANNEL);

--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -26,16 +26,16 @@ void Register_Eltwise(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    EltwiseOp<DeviceType::OPENCL, float>);
+                    EltwiseOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    EltwiseOp<DeviceType::OPENCL, half>);
+                    EltwiseOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -34,11 +34,11 @@ void EltwiseBenchmark(
  net.AddRandomInput<D, T>("Input0", {n, h, w, c});
  net.AddRandomInput<D, T>("Input1", {n, h, w, c});

-  if (D == DeviceType::OPENCL) {
-    BufferToImage<D, T>(&net, "Input0", "InputImg0",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(&net, "Input1", "InputImg1",
-                        kernels::BufferType::IN_OUT_CHANNEL);
+  if (D == DeviceType::GPU) {
+    BufferToImage<D, half>(&net, "Input0", "InputImg0",
+                           kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, half>(&net, "Input1", "InputImg1",
+                           kernels::BufferType::IN_OUT_CHANNEL);
    OpDefBuilder("Eltwise", "EltwiseTest")
        .Input("InputImg0")
        .Input("InputImg1")
@@ -90,8 +90,8 @@ void EltwiseBenchmark(

 #define BM_ELTWISE(ELT_TYPE, N, H, W, C)                 \
  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, CPU);    \
-  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, OPENCL); \
-  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, OPENCL);
+  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, float, GPU); \
+  BM_ELTWISE_MACRO(ELT_TYPE, N, H, W, C, half, GPU);

 BM_ELTWISE(2, 1, 128, 128, 32);
 BM_ELTWISE(2, 1, 240, 240, 256);

--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -183,49 +183,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorScalar) {
 }

 TEST_F(EltwiseOpTest, GPUSimpleTensorScalar) {
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUM,
-                                                {1, 1, 1, 1}, {1}, 1,
-                                                {2});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SUB,
-                                                {1, 1, 2, 3},
-                                                {1, 2, 3, 4, 5, 6},
-                                                1,
-                                                {0, 1, 2, 3, 4, 5});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::PROD,
-                                                {1, 1, 2, 3},
-                                                {1, 2, 3, 4, 5, 6},
-                                                2,
-                                                {2, 4, 6, 8, 10, 12});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::DIV,
-                                                {1, 1, 2, 3},
-                                                {2, 4, 6, 8, 10, 12},
-                                                2,
-                                                {1, 2, 3, 4, 5, 6});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MIN,
-                                                {1, 1, 2, 3},
-                                                {1, 2, 3, 4, 5, 6},
-                                                1,
-                                                {1, 1, 1, 1, 1, 1});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::MAX,
-                                                {1, 1, 2, 3},
-                                                {1, 2, 3, 4, 5, 6},
-                                                3,
-                                                {3, 3, 3, 4, 5, 6});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::NEG,
-                                                {1, 1, 2, 3},
-                                                {1, 2, 3, 4, 5, 6},
-                                                3,
-                                                {-1, -2, -3, -4, -5, -6});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::ABS,
-                                                {1, 1, 2, 3},
-                                                {-1, -2, -3, -4, -5, -6},
-                                                3,
-                                                {1, 2, 3, 4, 5, 6});
-  SimpleTensorScalar<DeviceType::OPENCL, float>(kernels::EltwiseType::SQR_DIFF,
-                                                {1, 1, 2, 3},
-                                                {1, 2, 3, 4, 5, 6},
-                                                1,
-                                                {0, 1, 4, 9, 16, 25});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUM,
+                                             {1, 1, 1, 1}, {1}, 1,
+                                             {2});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SUB,
+                                             {1, 1, 2, 3},
+                                             {1, 2, 3, 4, 5, 6},
+                                             1,
+                                             {0, 1, 2, 3, 4, 5});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::PROD,
+                                             {1, 1, 2, 3},
+                                             {1, 2, 3, 4, 5, 6},
+                                             2,
+                                             {2, 4, 6, 8, 10, 12});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::DIV,
+                                             {1, 1, 2, 3},
+                                             {2, 4, 6, 8, 10, 12},
+                                             2,
+                                             {1, 2, 3, 4, 5, 6});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MIN,
+                                             {1, 1, 2, 3},
+                                             {1, 2, 3, 4, 5, 6},
+                                             1,
+                                             {1, 1, 1, 1, 1, 1});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::MAX,
+                                             {1, 1, 2, 3},
+                                             {1, 2, 3, 4, 5, 6},
+                                             3,
+                                             {3, 3, 3, 4, 5, 6});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::NEG,
+                                             {1, 1, 2, 3},
+                                             {1, 2, 3, 4, 5, 6},
+                                             3,
+                                             {-1, -2, -3, -4, -5, -6});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::ABS,
+                                             {1, 1, 2, 3},
+                                             {-1, -2, -3, -4, -5, -6},
+                                             3,
+                                             {1, 2, 3, 4, 5, 6});
+  SimpleTensorScalar<DeviceType::GPU, float>(kernels::EltwiseType::SQR_DIFF,
+                                             {1, 1, 2, 3},
+                                             {1, 2, 3, 4, 5, 6},
+                                             1,
+                                             {0, 1, 4, 9, 16, 25});
 }

 TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
@@ -277,49 +277,49 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
 }

 TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUM,
      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
      {1, 1, 1, 3}, {1, 2, 3},
      {2, 4, 6, 5, 7, 9});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUB,
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUB,
      {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::PROD,
      {1, 1, 1, 3}, {1, 2, 3},
      {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
      {1, 4, 9, 4, 10, 18});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::DIV,
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 1, 1, 1, 5},
      {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::DIV,
      {1, 1, 1, 5}, {1, 1, 1, 2, 4},
      {1, 2, 1, 5},
      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1},
      {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MIN,
      {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MAX,
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SQR_DIFF,
      {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
@@ -369,43 +369,43 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorTensor) {
                                               25});
 }
 TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUM,
      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
      {2, 4, 6, 8, 10, 12});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUM,
      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6},
      {0.2, 0.4, 0.6, 0.8, 1, 1.2},
      {0.1, 0.1});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SUB,
      {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {0, 0, 0, 0, 0});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::PROD,
      {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
      {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
      {1, 4, 9, 16, 25, 36});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::DIV,
      {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
      {1, 2, 1, 3}, {1, 2, 3, 4, 5, 6},
      {1, 1, 1, 1, 1, 1});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MIN,
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::MAX,
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
-  SimpleTensorEltwise<DeviceType::OPENCL, float>(
+  SimpleTensorEltwise<DeviceType::GPU, float>(
      kernels::EltwiseType::SQR_DIFF,
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
@@ -420,7 +420,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>("Input", shape, true, true);
+  net.AddRandomInput<DeviceType::GPU, float>("Input", shape, true, true);

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -441,7 +441,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
  Tensor expected;
  expected.Copy(*net.GetOutput("Output"));

-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImg",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
                                       kernels::BufferType::IN_OUT_CHANNEL);
  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("InputImg")
@@ -452,15 +452,15 @@ void RandomTensorScalar(const kernels::EltwiseType type,
      .Finalize(net.NewOperatorDef());

  // Run
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);

  if (DataTypeToEnum<T>::value == DT_FLOAT) {
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
+    ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
  } else {
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
+    ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
                            1e-2);
  }
 }
@@ -474,8 +474,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0, true, true);
-  net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1, true, true);
+  net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0, true, true);
+  net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1, true, true);

  net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC,
                                                  "TInput0", NCHW);
@@ -496,9 +496,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
  Tensor expected;
  expected.Copy(*net.GetOutput("Output"));

-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input0", "InputImg0",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
                                       kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input1", "InputImg1",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImg1",
                                       kernels::BufferType::IN_OUT_CHANNEL);
  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("InputImg0")
@@ -510,15 +510,15 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
      .Finalize(net.NewOperatorDef());

  // Run
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImg", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImg", "GPUOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);

  if (DataTypeToEnum<T>::value == DT_FLOAT) {
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5);
+    ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5);
  } else {
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2,
+    ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2,
                            1e-2);
  }
 }
@@ -609,19 +609,19 @@ TEST_F(EltwiseOpTest, RandomTensorTensorFloat) {

 TEST_F(EltwiseOpTest, RandomTensorTensorHalf) {
  RandomTensorEltwise<half>(kernels::EltwiseType::SUM,
-                             {1, 32, 32, 16}, {1, 32, 32, 16});
+                            {1, 32, 32, 16}, {1, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::SUB,
-                             {3, 32, 32, 16}, {3, 32, 32, 16});
+                            {3, 32, 32, 16}, {3, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::PROD,
-                             {1, 31, 37, 17}, {1, 31, 37, 17});
+                            {1, 31, 37, 17}, {1, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::DIV,
-                             {5, 31, 37, 17}, {5, 31, 37, 17});
+                            {5, 31, 37, 17}, {5, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::MIN,
-                             {1, 32, 32, 16}, {1, 32, 32, 16});
+                            {1, 32, 32, 16}, {1, 32, 32, 16});
  RandomTensorEltwise<half>(kernels::EltwiseType::MAX,
-                             {3, 31, 37, 17}, {3, 31, 37, 17});
+                            {3, 31, 37, 17}, {3, 31, 37, 17});
  RandomTensorEltwise<half>(kernels::EltwiseType::SQR_DIFF,
-                             {3, 31, 37, 17}, {3, 31, 37, 17});
+                            {3, 31, 37, 17}, {3, 31, 37, 17});
 }



--- a/mace/ops/folded_batch_norm.cc
+++ b/mace/ops/folded_batch_norm.cc
@@ -26,16 +26,16 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    FoldedBatchNormOp<DeviceType::OPENCL, float>);
+                    FoldedBatchNormOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    FoldedBatchNormOp<DeviceType::OPENCL, half>);
+                    FoldedBatchNormOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
@@ -60,7 +60,7 @@ void Simple() {
    // Run
    net.RunOp(D);
    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Scale", "ScaleImage",
@@ -94,7 +94,7 @@ void Simple() {

 TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }

-TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
+TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::GPU>(); }

 TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
  // generate random input
@@ -108,10 +108,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -138,11 +138,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                           kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
@@ -153,10 +153,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
    .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
 }
@@ -173,10 +173,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -203,11 +203,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
                                          kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
                                          kernels::BufferType::ARGUMENT);

  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
@@ -219,10 +219,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
    .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);
  net.Sync();

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
 }
@@ -239,10 +239,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -269,11 +269,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
                                           kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
                                           kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
@@ -284,9 +284,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
    .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);
 }
@@ -303,10 +303,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});

  net.TransformDataFormat<DeviceType::CPU, float>("Input",
                                                  NHWC,
@@ -333,11 +333,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
                                          kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Scale", "ScaleImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
                                          kernels::BufferType::ARGUMENT);
-  BufferToImage<DeviceType::OPENCL, half>(&net, "Offset", "OffsetImage",
+  BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
                                          kernels::BufferType::ARGUMENT);

  OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
@@ -349,9 +349,9 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
    .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2);
 }

--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -26,16 +26,16 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    FullyConnectedOp<DeviceType::OPENCL, float>);
+                    FullyConnectedOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    FullyConnectedOp<DeviceType::OPENCL, half>);
+                    FullyConnectedOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -43,7 +43,7 @@ void FCBenchmark(
      .Input("Bias")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH;
    BufferToImage<D, T>(&net, "Weight", "WeightImage",
                        weight_type);
@@ -93,8 +93,8 @@ void FCBenchmark(

 #define BM_FC(N, H, W, C, OC)                 \
  BM_FC_MACRO(N, H, W, C, OC, float, CPU);    \
-  BM_FC_MACRO(N, H, W, C, OC, float, OPENCL); \
-  BM_FC_MACRO(N, H, W, C, OC, half, OPENCL);
+  BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
+  BM_FC_MACRO(N, H, W, C, OC, half, GPU);

 BM_FC(1, 16, 16, 32, 32);
 BM_FC(1, 8, 8, 32, 1000);

--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -51,7 +51,7 @@ void Simple(const std::vector<index_t> &input_shape,
    // Run
    net.RunOp(D);
    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, float>(&net, "Weight", "WeightImage",
@@ -104,14 +104,14 @@ TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
 }

 TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
-  Simple<DeviceType::OPENCL>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
+  Simple<DeviceType::GPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
                             {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
                             {206});
-  Simple<DeviceType::OPENCL>(
+  Simple<DeviceType::GPU>(
    {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
    {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
-  Simple<DeviceType::OPENCL>(
+  Simple<DeviceType::GPU>(
    {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
    {1, 2, 3, 4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3,
     4, 5, 6, 10, 20, 30, 40, 50, 60, 1, 2, 3, 4, 5, 6},
@@ -119,7 +119,7 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
 }

 TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
-  Simple<DeviceType::OPENCL>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
+  Simple<DeviceType::GPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
                             {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
 }

@@ -136,11 +136,11 @@ void Complex(const index_t batch,
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Weight", {out_channel, height * width * channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
+  net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});

  OpDefBuilder("FC", "FullyConnectedTest")
    .Input("Input")
@@ -159,11 +159,11 @@ void Complex(const index_t batch,
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
                                       kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
                                       kernels::BufferType::WEIGHT_HEIGHT);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
+  BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("FC", "FullyConnectedTest")
@@ -176,9 +176,9 @@ void Complex(const index_t batch,
    .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
@@ -225,11 +225,11 @@ void TestWXFormat(const index_t batch,
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
    "Weight", {out_channel, height * width * channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
+  net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});

  OpDefBuilder("FC", "FullyConnectedTest")
    .Input("Input")
@@ -248,11 +248,11 @@ void TestWXFormat(const index_t batch,
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
                                       kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "Weight", "WeightImage",
                                       kernels::BufferType::WEIGHT_WIDTH);
-  BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("FC", "FullyConnectedTest")
@@ -264,9 +264,9 @@ void TestWXFormat(const index_t batch,
    .Finalize(net.NewOperatorDef());

  // Run
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);
  if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),

--- a/mace/ops/fused_conv_2d.cc
+++ b/mace/ops/fused_conv_2d.cc
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/ops/fused_conv_2d.h"
-
-namespace mace {
-namespace ops {
-
-void Register_FusedConv2D(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
-                                     .Device(DeviceType::CPU)
-                                     .TypeConstraint<float>("T")
-                                     .Build(),
-                    FusedConv2dOp<DeviceType::CPU, float>);
-
-#ifdef MACE_ENABLE_OPENCL
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
-                                     .Device(DeviceType::OPENCL)
-                                     .TypeConstraint<float>("T")
-                                     .Build(),
-                    FusedConv2dOp<DeviceType::OPENCL, float>);
-
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
-                                     .Device(DeviceType::OPENCL)
-                                     .TypeConstraint<half>("T")
-                                     .Build(),
-                    FusedConv2dOp<DeviceType::OPENCL, half>);
-#endif  // MACE_ENABLE_OPENCL
-}
-
-}  // namespace ops
-}  // namespace mace
--- a/mace/ops/fused_conv_2d.h
+++ b/mace/ops/fused_conv_2d.h
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_OPS_FUSED_CONV_2D_H_
-#define MACE_OPS_FUSED_CONV_2D_H_
-
-#include <memory>
-#include <string>
-
-#include "mace/core/operator.h"
-#include "mace/kernels/conv_2d.h"
-#include "mace/ops/conv_pool_2d_base.h"
-
-namespace mace {
-namespace ops {
-
-template <DeviceType D, typename T>
-class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
- public:
-  FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
-      : ConvPool2dOpBase<D, T>(op_def, ws),
-        functor_(this->strides_.data(),
-                 this->padding_type_,
-                 this->paddings_,
-                 this->dilations_.data(),
-                 kernels::StringToActivationType(
-                     OperatorBase::GetSingleArgument<std::string>("activation",
-                                                                  "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
-                 static_cast<bool>(OperatorBase::GetSingleArgument<int>(
-                     "is_filter_transformed", false)),
-                 ws->GetScratchBuffer(D)) {}
-
-  bool Run(StatsFuture *future) override {
-    const Tensor *input = this->Input(INPUT);
-    const Tensor *filter = this->Input(FILTER);
-    const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
-    Tensor *output = this->Output(OUTPUT);
-
-    functor_(input, filter, bias, output, future);
-
-    return true;
-  }
-
- private:
-  kernels::Conv2dFunctor<D, T> functor_;
-
- protected:
-  OP_INPUT_TAGS(INPUT, FILTER, BIAS);
-  OP_OUTPUT_TAGS(OUTPUT);
-};
-
-}  // namespace ops
-}  // namespace mace
-
-#endif  // MACE_OPS_FUSED_CONV_2D_H_
--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "mace/ops/fused_conv_2d.h"
-#include "mace/ops/ops_test_util.h"
-
-namespace mace {
-namespace ops {
-namespace test {
-
-class FusedConv2dOpTest : public OpsTestBase {};
-
-namespace {
-template<DeviceType D, typename T>
-void TestNHWCSimple3x3VALID() {
-  OpsTestNet net;
-  // Add input data
-  net.AddInputFromArray<D, float>(
-    "Input", {1, 3, 3, 2},
-    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
-  net.AddInputFromArray<D, float>(
-    "Filter", {3, 3, 1, 2},
-    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-     1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
-
-  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Input("Bias")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .AddStringArg("activation", "RELU")
-      .Finalize(net.NewOperatorDef());
-    // Run
-    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
-    BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .AddStringArg("activation", "RELU")
-      .Finalize(net.NewOperatorDef());
-
-    net.RunOp(D);
-
-    // Transfer output
-    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
-
-  } else {
-    MACE_NOT_IMPLEMENTED;
-  }
-
-  auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
-}
-
-template<DeviceType D, typename T>
-void TestNHWCSimple3x3SAME() {
-  OpsTestNet net;
-
-  // Add input data
-  net.AddInputFromArray<D, float>(
-    "Input", {1, 3, 3, 2},
-    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
-  net.AddInputFromArray<D, float>(
-    "Filter", {3, 3, 1, 2},
-    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-     1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
-
-  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Input("Bias")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::SAME)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .AddStringArg("activation", "RELU")
-      .Finalize(net.NewOperatorDef());
-    // Run
-    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
-    BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::SAME)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .AddStringArg("activation", "RELU")
-      .Finalize(net.NewOperatorDef());
-    // Run
-    net.RunOp(D);
-
-    // Transfer output
-    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
-
-  } else {
-    MACE_NOT_IMPLEMENTED;
-  }
-
-  auto expected = CreateTensor<float>(
-    {1, 3, 3, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, CPUSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
-  TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
-}
-
-TEST_F(FusedConv2dOpTest, OPENCLSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
-  TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
-}
-
-namespace {
-template<DeviceType D, typename T>
-void TestNHWCSimple3x3WithoutBias() {
-  OpsTestNet net;
-
-  // Add input data
-  net.AddInputFromArray<D, float>(
-    "Input", {1, 3, 3, 2},
-    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
-  net.AddInputFromArray<D, float>(
-    "Filter", {3, 3, 1, 2},
-    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-     1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-
-  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .AddStringArg("activation", "RELU")
-      .Finalize(net.NewOperatorDef());
-
-    // Run
-    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
-    BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
-
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .AddStringArg("activation", "RELU")
-      .Finalize(net.NewOperatorDef());
-    // Run
-    net.RunOp(D);
-    // Transfer output
-    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
-  } else {
-    MACE_NOT_IMPLEMENTED;
-  }
-
-  // Check
-  auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, CPUWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
-}
-
-TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
-}
-
-namespace {
-template<DeviceType D>
-void TestConv1x1() {
-  // Construct graph
-  OpsTestNet net;
-
-  // Add input data
-  net.AddInputFromArray<D, float>(
-    "Input", {1, 3, 10, 5},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
-    "Filter", {1, 1, 2, 5},
-    {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
-  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
-
-  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Input("Bias")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
-    // Run
-    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
-    BufferToImage<D, float>(&net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, float>(&net, "Filter", "FilterImage",
-                            kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, float>(&net, "Bias", "BiasImage",
-                            kernels::BufferType::ARGUMENT);
-
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
-    // Run
-    net.RunOp(D);
-
-    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT_CHANNEL);
-  } else {
-    MACE_NOT_IMPLEMENTED;
-  }
-
-  // Check
-  auto expected = CreateTensor<float>(
-    {1, 3, 10, 2},
-    {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
-     5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
-     5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
-     5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
-     5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
-     5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
-
-TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
-
-namespace {
-template<DeviceType D, typename T>
-void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
-  testing::internal::LogToStderr();
-  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
-                  Padding type) {
-    // generate random input
-    static unsigned int seed = time(NULL);
-    index_t batch = 3 + (rand_r(&seed) % 10);
-    index_t height = shape[0];
-    index_t width = shape[1];
-    index_t input_channels = shape[2] + (rand_r(&seed) % 10);
-    index_t output_channels = shape[3] + (rand_r(&seed) % 10);
-
-    OpsTestNet net;
-
-    // Add input data
-    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
-    net.AddRandomInput<D, T>(
-      "Filter", {kernel_h, kernel_w, output_channels, input_channels});
-    net.AddRandomInput<D, T>("Bias", {output_channels});
-
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-
-    // Construct graph
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Input("Bias")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .Finalize(net.NewOperatorDef());
-
-    // run on cpu
-    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-
-    // Check
-    Tensor expected;
-    expected.Copy(*net.GetOutput("Output"));
-
-    // run on gpu
-    BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
-
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .Finalize(net.NewOperatorDef());
-    // Run on device
-    net.RunOp(D);
-
-    ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
-                            1e-5, 1e-4);
-  };
-
-  for (int kernel_size : {1, 3}) {
-    for (int stride : {1, 2}) {
-      func(kernel_size, kernel_size, stride, stride, VALID);
-      func(kernel_size, kernel_size, stride, stride, SAME);
-    }
-  }
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
-}
-
-namespace {
-template<DeviceType D>
-void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape,
-                               const int kernel, const int stride,
-                               Padding type) {
-  testing::internal::LogToStderr();
-  // generate random input
-  srand(time(NULL));
-  index_t batch = 3;
-  index_t height = shape[0];
-  index_t width = shape[1];
-  index_t input_channels = shape[2];
-  index_t output_channels = shape[3];
-
-  OpsTestNet net;
-
-  std::vector<float> float_input_data;
-  GenerateRandomRealTypeData({batch, height, width, input_channels},
-                             &float_input_data);
-  std::vector<float> float_filter_data;
-  GenerateRandomRealTypeData(
-      {kernel, kernel, output_channels, input_channels},
-      &float_filter_data);
-  std::vector<float> float_bias_data;
-  GenerateRandomRealTypeData({output_channels}, &float_bias_data);
-  // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {batch, height, width, input_channels}, float_input_data);
-  net.AddInputFromArray<D, float>(
-      "Filter", {kernel, kernel, output_channels, input_channels},
-      float_filter_data);
-  net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
-
-  net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                  NHWC,
-                                                  "InputNCHW",
-                                                  NCHW);
-  net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                  HWOI,
-                                                  "FilterOIHW",
-                                                  OIHW);
-
-  // Construct graph
-  OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-    .Input("InputNCHW")
-    .Input("FilterOIHW")
-    .Input("Bias")
-    .Output("OutputNCHW")
-    .AddIntsArg("strides", {stride, stride})
-    .AddIntArg("padding", type)
-    .AddIntsArg("dilations", {1, 1})
-    .Finalize(net.NewOperatorDef());
-
-  // run on cpu
-  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                  NCHW,
-                                                  "Output",
-                                                  NHWC);
-
-  // Check
-  Tensor expected;
-  expected.Copy(*net.GetOutput("Output"));
-
-  // run on gpu
-  BufferToImage<D, half>(&net, "Input", "InputImage",
-                         kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<D, half>(&net, "Filter", "FilterImage",
-                         kernels::BufferType::CONV2D_FILTER);
-  BufferToImage<D, half>(&net, "Bias", "BiasImage",
-                         kernels::BufferType::ARGUMENT);
-
-  OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {stride, stride})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
-      .Finalize(net.NewOperatorDef());
-  // Run on device
-  net.RunOp(D);
-
-  ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                          kernels::BufferType::IN_OUT_CHANNEL);
-
-  ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
-                          1e-2, 1e-1);
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 1, 1, VALID);
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 1, 1, SAME);
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 1, 2, VALID);
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 1, 2, SAME);
-}
-TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 3, 1, VALID);
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 3, 1, SAME);
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64}, 3, 2, VALID);
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({31, 37, 31, 37}, 3, 2, SAME);
-}
-
-namespace {
-template<DeviceType D, typename T>
-void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
-                           const std::vector<index_t> &filter_shape) {
-  testing::internal::LogToStderr();
-  auto func = [&](int stride_h, int stride_w, Padding type) {
-    srand(time(NULL));
-
-    // generate random input
-    index_t batch = 1;
-    index_t height = image_shape[0];
-    index_t width = image_shape[1];
-    index_t kernel_h = filter_shape[0];
-    index_t kernel_w = filter_shape[1];
-    index_t output_channels = filter_shape[2];
-    index_t input_channels = filter_shape[3];
-
-    OpsTestNet net;
-
-    // Add input data
-    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
-    net.AddRandomInput<D, T>(
-      "Filter", {kernel_h, kernel_w, output_channels, input_channels});
-    net.AddRandomInput<D, T>("Bias", {output_channels});
-
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-
-    // Construct graph
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Input("Bias")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .Finalize(net.NewOperatorDef());
-
-    // run on cpu
-    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-    // Check
-    Tensor expected;
-    expected.Copy(*net.GetOutput("Output"));
-
-    // run on gpu
-    BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
-
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .Finalize(net.NewOperatorDef());
-    // Run on device
-    net.RunOp(D);
-
-    ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
-                            1e-5, 1e-4);
-  };
-
-  for (int stride : {1, 2}) {
-    func(stride, stride, VALID);
-    func(stride, stride, SAME);
-  }
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, OPENCL7X7ConvNxNS12) {
-  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({32, 32}, {7, 7, 64, 3});
-}
-
-TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
-  TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 64, 32});
-}
-
-namespace {
-template<DeviceType D, typename T>
-void TestAtrousConvNxN(const std::vector<index_t> &shape,
-                       const int dilation) {
-  testing::internal::LogToStderr();
-  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
-                  Padding type) {
-    srand(time(NULL));
-
-    // generate random input
-    index_t batch = 1;
-    index_t height = shape[0];
-    index_t width = shape[1];
-    index_t output_channels = shape[2];
-    index_t input_channels = shape[3];
-
-    OpsTestNet net;
-
-    // Add input data
-    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
-    net.AddRandomInput<D, T>(
-      "Filter", {kernel_h, kernel_w, output_channels, input_channels});
-    net.AddRandomInput<D, T>("Bias", {output_channels});
-
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-
-    // Construct graph
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Input("Bias")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {dilation, dilation})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .Finalize(net.NewOperatorDef());
-
-    // run on cpu
-    net.RunOp();
-
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-
-    // Check
-    Tensor expected;
-    expected.Copy(*net.GetOutput("Output"));
-
-    // run on gpu
-    BufferToImage<D, T>(&net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(&net, "Filter", "FilterImage",
-                        kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(&net, "Bias", "BiasImage",
-                        kernels::BufferType::ARGUMENT);
-
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {dilation, dilation})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .Finalize(net.NewOperatorDef());
-    // Run on device
-    net.RunOp(D);
-
-    ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
-                        kernels::BufferType::IN_OUT_CHANNEL);
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
-                            1e-5, 1e-4);
-  };
-
-  for (int kernel_size : {3}) {
-    for (int stride : {1}) {
-      func(kernel_size, kernel_size, stride, stride, VALID);
-      func(kernel_size, kernel_size, stride, stride, SAME);
-    }
-  }
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN2) {
-  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 2);
-}
-
-TEST_F(FusedConv2dOpTest, OPENCLalignedAtrousConvNxN4) {
-  TestAtrousConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
-}
-
-TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
-  TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
-}
-
-namespace {
-template<DeviceType D>
-void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
-                               const std::vector<index_t> &filter_shape,
-                               const std::vector<int> &dilations) {
-  testing::internal::LogToStderr();
-  auto func = [&](int stride_h, int stride_w, Padding type) {
-    srand(time(NULL));
-
-    // generate random input
-    index_t batch = 1;
-    index_t height = image_shape[0];
-    index_t width = image_shape[1];
-    index_t kernel_h = filter_shape[0];
-    index_t kernel_w = filter_shape[1];
-    index_t output_channels = filter_shape[2];
-    index_t input_channels = filter_shape[3];
-
-    OpsTestNet net;
-
-    // Add input data
-    net.AddRandomInput<D, float>("Input",
-                                 {batch, height, width, input_channels});
-    net.AddRandomInput<D, float>(
-      "Filter", {kernel_h, kernel_w, output_channels, input_channels});
-    net.AddRandomInput<D, float>("Bias", {output_channels});
-
-    net.TransformDataFormat<DeviceType::CPU, float>("Input",
-                                                    NHWC,
-                                                    "InputNCHW",
-                                                    NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Filter",
-                                                    HWOI,
-                                                    "FilterOIHW",
-                                                    OIHW);
-
-    // Construct graph
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputNCHW")
-      .Input("FilterOIHW")
-      .Input("Bias")
-      .Output("OutputNCHW")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
-
-    // run on cpu
-    net.RunOp();
-
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
-                                                    "Output",
-                                                    NHWC);
-    // Check
-    Tensor expected;
-    expected.Copy(*net.GetOutput("Output"));
-
-    // run on gpu
-    BufferToImage<D, half>(&net, "Input", "InputImage",
-                           kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, half>(&net, "Filter", "FilterImage",
-                           kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, half>(&net, "Bias", "BiasImage",
-                           kernels::BufferType::ARGUMENT);
-
-    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
-      .Input("InputImage")
-      .Input("FilterImage")
-      .Input("BiasImage")
-      .Output("OutputImage")
-      .AddIntsArg("strides", {stride_h, stride_w})
-      .AddIntArg("padding", type)
-      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
-      .Finalize(net.NewOperatorDef());
-    // Run on device
-    net.RunOp(D);
-
-    ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
-                            kernels::BufferType::IN_OUT_CHANNEL);
-    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
-                            1e-2, 1e-1);
-  };
-
-  func(1, 1, VALID);
-  func(1, 1, SAME);
-}
-}  // namespace
-
-TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
-  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 16, 3},
-                                                {2, 2});
-}
-
-TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
-  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71}, {15, 15, 16, 16},
-                                                {2, 2});
-}
-
-}  // namespace test
-}  // namespace ops
-}  // namespace mace
--- a/mace/ops/image_to_buffer.cc
+++ b/mace/ops/image_to_buffer.cc
@@ -19,16 +19,16 @@ namespace ops {

 void Register_ImageToBuffer(OperatorRegistry *op_registry) {
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    ImageToBufferOp<DeviceType::OPENCL, float>);
+                    ImageToBufferOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    ImageToBufferOp<DeviceType::OPENCL, half>);
+                    ImageToBufferOp<DeviceType::GPU, half>);
 }

 }  // namespace ops

--- a/mace/ops/matmul.cc
+++ b/mace/ops/matmul.cc
@@ -26,16 +26,16 @@ void Register_MatMul(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    MatMulOp<DeviceType::OPENCL, float>);
+                    MatMulOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    MatMulOp<DeviceType::OPENCL, half>);
+                    MatMulOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/matmul_benchmark.cc
+++ b/mace/ops/matmul_benchmark.cc
@@ -34,7 +34,7 @@ void MatMulBenchmark(
  net.AddRandomInput<D, float>("A", {batch, height, channels, 1});
  net.AddRandomInput<D, float>("B", {batch, channels, out_width, 1});

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH);
    BufferToImage<D, T>(&net, "B", "BImage",
                        kernels::BufferType::IN_OUT_HEIGHT);
@@ -79,8 +79,8 @@ void MatMulBenchmark(

 #define BM_MATMUL(N, H, C, W)                 \
  BM_MATMUL_MACRO(N, H, C, W, float, CPU);    \
-  BM_MATMUL_MACRO(N, H, C, W, float, OPENCL); \
-  BM_MATMUL_MACRO(N, H, C, W, half, OPENCL);
+  BM_MATMUL_MACRO(N, H, C, W, float, GPU); \
+  BM_MATMUL_MACRO(N, H, C, W, half, GPU);

 BM_MATMUL(16, 32, 128, 49);
 BM_MATMUL(16, 32, 128, 961);

--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -37,7 +37,7 @@ void Simple(const std::vector<index_t> &A_shape,
  net.AddInputFromArray<D, float>("A", A_shape, A_value);
  net.AddInputFromArray<D, float>("B", B_shape, B_value);

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "A", "AImage",
                            kernels::BufferType::IN_OUT_WIDTH);
    BufferToImage<D, float>(&net, "B", "BImage",
@@ -91,10 +91,10 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
 }

 TEST_F(MatMulOpTest, SimpleOPENCL) {
-  Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
+  Simple<DeviceType::GPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
                             {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1},
                             {22, 28, 49, 64});
-  Simple<DeviceType::OPENCL>(
+  Simple<DeviceType::GPU>(
      {1, 5, 5, 1}, {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
      {1, 5, 5, 1}, {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
@@ -127,9 +127,9 @@ void Complex(const index_t batch,
      .Finalize(net.NewOperatorDef());

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>("A",
+  net.AddRandomInput<DeviceType::GPU, float>("A",
                                                {batch, height, channels, 1});
-  net.AddRandomInput<DeviceType::OPENCL, float>(
+  net.AddRandomInput<DeviceType::GPU, float>(
      "B", {batch, channels, out_width, 1});

  // run cpu
@@ -140,9 +140,9 @@ void Complex(const index_t batch,
  expected.Copy(*net.GetOutput("Output"));

  // Run on opencl
-  BufferToImage<DeviceType::OPENCL, T>(&net, "A", "AImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "A", "AImage",
                                       kernels::BufferType::IN_OUT_WIDTH);
-  BufferToImage<DeviceType::OPENCL, T>(&net, "B", "BImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "B", "BImage",
                                       kernels::BufferType::IN_OUT_HEIGHT);

  OpDefBuilder("MatMul", "MatMulTest")
@@ -153,9 +153,9 @@ void Complex(const index_t batch,
      .Finalize(net.NewOperatorDef());

  // Run on opencl
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
                                           kernels::BufferType::IN_OUT_HEIGHT);
  if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),

--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -403,7 +403,7 @@ class OpsTestNet {
  }

  void Sync() {
-    if (net_ && device_ == DeviceType::OPENCL) {
+    if (net_ && device_ == DeviceType::GPU) {
      OpenCLRuntime::Global()->command_queue().finish();
    }
  }

--- a/mace/ops/pad.cc
+++ b/mace/ops/pad.cc
@@ -26,15 +26,15 @@ void Register_Pad(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    PadOp<DeviceType::OPENCL, float>);
+                    PadOp<DeviceType::GPU, float>);
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    PadOp<DeviceType::OPENCL, half>);
+                    PadOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/pad_benchmark.cc
+++ b/mace/ops/pad_benchmark.cc
@@ -33,7 +33,7 @@ void Pad(int iters, int batch, int height,
  net.AddRandomInput<D, T>("Input", {batch, height, width, channels});

  const std::vector<int> paddings = {0, 0, pad, pad, pad, pad, 0, 0};
-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    OpDefBuilder("Pad", "PadTest")
@@ -77,8 +77,8 @@ void Pad(int iters, int batch, int height,

 #define BM_PAD(N, H, W, C, PAD)                 \
  BM_PAD_MACRO(N, H, W, C, PAD, float, CPU);    \
-  BM_PAD_MACRO(N, H, W, C, PAD, float, OPENCL); \
-  BM_PAD_MACRO(N, H, W, C, PAD, half, OPENCL);
+  BM_PAD_MACRO(N, H, W, C, PAD, float, GPU); \
+  BM_PAD_MACRO(N, H, W, C, PAD, half, GPU);

 BM_PAD(1, 512, 512, 1, 2);
 BM_PAD(1, 112, 112, 64, 1);

--- a/mace/ops/pad_test.cc
+++ b/mace/ops/pad_test.cc
@@ -29,7 +29,7 @@ void Simple() {

  // Add input data
  net.AddRepeatedInput<D, float>("Input", {1, 2, 3, 1}, 2);
-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    OpDefBuilder("Pad", "PadTest")
@@ -45,15 +45,24 @@ void Simple() {
    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
                            kernels::BufferType::IN_OUT_CHANNEL);
  } else {
+    net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                    NHWC,
+                                                    "TInput",
+                                                    NCHW);
    OpDefBuilder("Pad", "PadTest")
-        .Input("Input")
-        .Output("Output")
-        .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0})
+        .Input("TInput")
+        .Output("TOutput")
+        .AddIntsArg("paddings", {0, 0, 0, 0, 1, 2, 1, 2})
        .AddFloatArg("constant_value", 1.0)
        .Finalize(net.NewOperatorDef());

    // Run
    net.RunOp();
+
+    net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
+                                                    NCHW,
+                                                    "Output",
+                                                    NHWC);
  }

  auto output = net.GetTensor("Output");
@@ -75,7 +84,7 @@ TEST_F(PadTest, SimpleCPU) {
 }

 TEST_F(PadTest, SimpleGPU) {
-  Simple<DeviceType::OPENCL>();
+  Simple<DeviceType::GPU>();
 }

 TEST_F(PadTest, ComplexCPU) {
@@ -84,15 +93,23 @@ TEST_F(PadTest, ComplexCPU) {

  // Add input data
  net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2);
+  net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                  NHWC,
+                                                  "TInput",
+                                                  NCHW);
  OpDefBuilder("Pad", "PadTest")
-      .Input("Input")
-      .Output("Output")
+      .Input("TInput")
+      .Output("TOutput")
      .AddIntsArg("paddings", {0, 0, 1, 1, 1, 1, 1, 1})
      .AddFloatArg("constant_value", 1.0)
      .Finalize(net.NewOperatorDef());

  // Run
  net.RunOp();
+  net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
+                                                  NCHW,
+                                                  "Output",
+                                                  NHWC);

  auto output = net.GetTensor("Output");

@@ -109,39 +126,48 @@ TEST_F(PadTest, ComplexCPU) {
 namespace {
 template <typename T>
 void Complex(const std::vector<index_t> &input_shape,
-             const std::vector<int> &paddings) {
+             const std::vector<int> &cpu_paddings,
+             const std::vector<int> &gpu_paddings) {
  // Construct graph
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>("Input", input_shape);
+  net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);

+  net.TransformDataFormat<DeviceType::CPU, float>("Input",
+                                                  NHWC,
+                                                  "TInput",
+                                                  NCHW);
  OpDefBuilder("Pad", "PadTest")
-      .Input("Input")
-      .Output("Output")
-      .AddIntsArg("paddings", paddings)
+      .Input("TInput")
+      .Output("TOutput")
+      .AddIntsArg("paddings", cpu_paddings)
      .AddFloatArg("constant_value", 1.0)
      .Finalize(net.NewOperatorDef());

  // Run
  net.RunOp();
+  net.TransformDataFormat<DeviceType::CPU, float>("TOutput",
+                                                  NCHW,
+                                                  "Output",
+                                                  NHWC);

  Tensor expected;
  expected.Copy(*net.GetOutput("Output"));

-  BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
+  BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
                                       kernels::BufferType::IN_OUT_CHANNEL);
  OpDefBuilder("Pad", "PadTest")
      .Input("InputImage")
      .Output("OutputImage")
-      .AddIntsArg("paddings", paddings)
+      .AddIntsArg("paddings", gpu_paddings)
      .AddFloatArg("constant_value", 1.0)
      .Finalize(net.NewOperatorDef());

  // Run
-  net.RunOp(DeviceType::OPENCL);
+  net.RunOp(DeviceType::GPU);

-  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OpenCLOutput",
+  ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OpenCLOutput",
                                           kernels::BufferType::IN_OUT_CHANNEL);

  auto output = net.GetTensor("OpenCLOutput");
@@ -155,15 +181,21 @@ void Complex(const std::vector<index_t> &input_shape,
 }  // namespace

 TEST_F(PadTest, ComplexFloat) {
-  Complex<float>({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0});
-  Complex<float>({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0});
-  Complex<float>({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0});
+  Complex<float>({1, 32, 32, 4},
+                 {0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0});
+  Complex<float>({1, 31, 37, 16},
+                 {0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0});
+  Complex<float>({1, 128, 128, 32},
+                 {0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0});
 }

 TEST_F(PadTest, ComplexHalf) {
-  Complex<half>({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0});
-  Complex<half>({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0});
-  Complex<half>({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0});
+  Complex<half>({1, 32, 32, 4},
+                {0, 0, 0, 0, 2, 2, 1, 1}, {0, 0, 2, 2, 1, 1, 0, 0});
+  Complex<half>({1, 31, 37, 16},
+                {0, 0, 0, 0, 2, 0, 1, 0}, {0, 0, 2, 0, 1, 0, 0, 0});
+  Complex<half>({1, 128, 128, 32},
+                {0, 0, 0, 0, 0, 1, 0, 2}, {0, 0, 0, 1, 0, 2, 0, 0});
 }

 }  // namespace test

--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -26,16 +26,16 @@ void Register_Pooling(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    PoolingOp<DeviceType::OPENCL, float>);
+                    PoolingOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    PoolingOp<DeviceType::OPENCL, half>);
+                    PoolingOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -41,7 +41,7 @@ void Pooling(int iters,
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input",
                                 {batch, channels, height, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input",
                                 {batch, height, width, channels});
  } else {
@@ -58,7 +58,7 @@ void Pooling(int iters,
      .AddIntArg("padding", padding)
      .AddIntsArg("dilations", {1, 1})
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -104,7 +104,7 @@ void Pooling(int iters,

 #define BM_POOLING(N, C, H, W, K, S, PA, PO) \
  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
-  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, OPENCL);
+  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, GPU);

 BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
 BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);

--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -211,7 +211,7 @@ void SimpleMaxPooling3S2() {
                                                    NCHW,
                                                    "Output",
                                                    NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
    OpDefBuilder("Pooling", "PoolingTest")
@@ -238,7 +238,7 @@ void SimpleMaxPooling3S2() {
 TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2<CPU>(); }

 TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
-  SimpleMaxPooling3S2<OPENCL>();
+  SimpleMaxPooling3S2<GPU>();
 }

 namespace {
@@ -304,24 +304,24 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
 }  // namespace

 TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
-  MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
-  MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
-  MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
-  MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
+  MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
+  MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
+  MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
+  MaxPooling3S2<GPU, float>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) {
-  MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
-  MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
-  MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
-  MaxPooling3S2<OPENCL, half>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
+  MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
+  MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
+  MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {1, 1}, Padding::SAME);
+  MaxPooling3S2<GPU, half>({3, 64, 32, 32}, {2, 2}, Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
-  MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
-  MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
-  MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
-  MaxPooling3S2<OPENCL, half>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
+  MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {1, 1}, Padding::VALID);
+  MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {2, 2}, Padding::VALID);
+  MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {1, 1}, Padding::SAME);
+  MaxPooling3S2<GPU, half>({3, 41, 43, 47}, {2, 2}, Padding::SAME);
 }

 TEST_F(PoolingOpTest, AVG_VALID) {
@@ -400,7 +400,7 @@ void SimpleAvgPoolingTest() {
 }  // namespace

 TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
-  SimpleAvgPoolingTest<OPENCL>();
+  SimpleAvgPoolingTest<GPU>();
 }

 namespace {
@@ -468,43 +468,43 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
 }  // namespace

 TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) {
-  AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
+  AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
                                Padding::VALID);
-  AvgPoolingTest<OPENCL, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
+  AvgPoolingTest<GPU, float>({3, 15, 15, 128}, {4, 4}, {4, 4},
                                Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) {
-  AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4},
+  AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4},
                               Padding::VALID);
-  AvgPoolingTest<OPENCL, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
+  AvgPoolingTest<GPU, half>({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) {
-  AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
+  AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
                                Padding::VALID);
-  AvgPoolingTest<OPENCL, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
+  AvgPoolingTest<GPU, float>({3, 64, 64, 128}, {16, 16}, {16, 16},
                                Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) {
-  AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
+  AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
                               Padding::VALID);
-  AvgPoolingTest<OPENCL, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
+  AvgPoolingTest<GPU, half>({3, 64, 64, 128}, {16, 16}, {16, 16},
                               Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) {
-  AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
+  AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
                                Padding::VALID);
-  AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
+  AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {2, 2}, {2, 2},
                                Padding::SAME);
 }

 TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
-  AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
+  AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
                                Padding::VALID);
-  AvgPoolingTest<OPENCL, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
+  AvgPoolingTest<GPU, float>({3, 31, 37, 128}, {8, 8}, {8, 8},
                                Padding::SAME);
 }


--- a/mace/ops/reorganize.cc
+++ b/mace/ops/reorganize.cc
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/ops/reorganize.h"
-
-namespace mace {
-namespace ops {
-
-void Register_ReOrganize(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReOrganize")
-                                     .Device(DeviceType::CPU)
-                                     .TypeConstraint<float>("T")
-                                     .Build(),
-                    ReOrganizeOp<DeviceType::CPU, float>);
-}
-
-}  // namespace ops
-}  // namespace mace
--- a/mace/ops/reorganize.h
+++ b/mace/ops/reorganize.h
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_OPS_REORGANIZE_H_
-#define MACE_OPS_REORGANIZE_H_
-
-#include <vector>
-
-#include "mace/core/operator.h"
-#include "mace/kernels/reorganize.h"
-
-namespace mace {
-namespace ops {
-
-template <DeviceType D, typename T>
-class ReOrganizeOp : public Operator<D, T> {
- public:
-  ReOrganizeOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws),
-        shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}
-
-  bool Run(StatsFuture *future) override {
-    const Tensor *input = this->Input(INPUT);
-    const index_t num_dims = shape_.size();
-    int unknown_idx = -1;
-    index_t product = 1;
-    std::vector<index_t> out_shape;
-
-    for (int i = 0; i < num_dims; ++i) {
-      if (shape_[i] == -1) {
-        MACE_CHECK(unknown_idx == -1) << "Only one input size may be -1";
-        unknown_idx = i;
-        out_shape.push_back(1);
-      } else {
-        MACE_CHECK(shape_[i] >= 0) << "Shape must be non-negative: "
-                                   << shape_[i];
-        out_shape.push_back(shape_[i]);
-        product *= shape_[i];
-      }
-    }
-
-    if (unknown_idx != -1) {
-      MACE_CHECK(product != 0)
-          << "Cannot infer shape if there is zero shape size.";
-      const index_t missing = input->size() / product;
-      MACE_CHECK(missing * product == input->size())
-          << "Input size not match reshaped tensor size";
-      out_shape[unknown_idx] = missing;
-    }
-
-    Tensor *output = this->Output(OUTPUT);
-    output->Resize(out_shape);
-
-    functor_(input, out_shape, output, future);
-    return true;
-  }
-
- private:
-  std::vector<int64_t> shape_;
-  kernels::ReOrganizeFunctor<D, T> functor_;
-
- protected:
-  OP_INPUT_TAGS(INPUT);
-  OP_OUTPUT_TAGS(OUTPUT);
-};
-
-}  // namespace ops
-}  // namespace mace
-
-#endif  // MACE_OPS_REORGANIZE_H_
--- a/mace/ops/reorganize_test.cc
+++ b/mace/ops/reorganize_test.cc
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmock/gmock.h"
-#include "mace/core/operator.h"
-#include "mace/ops/ops_test_util.h"
-
-namespace mace {
-namespace ops {
-namespace test {
-
-class ReOrganizeTest : public OpsTestBase {};
-
-namespace {
-void TestReOrganize(const std::vector<index_t> &input_shape,
-                    const std::vector<float> &input_data,
-                    const std::vector<index_t> &output_shape,
-                    const std::vector<float> &output_data) {
-  const std::vector<int> out_shape(output_shape.begin(), output_shape.end());
-
-  // Construct graph
-  OpsTestNet net;
-
-  OpDefBuilder("ReOrganize", "ReOrganizeTest")
-      .Input("Input")
-      .Output("Output")
-      .AddIntsArg("shape", out_shape)
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>("Input",
-                                                input_shape, input_data);
-
-  // Run
-  net.RunOp();
-
-  auto output = net.GetTensor("Output");
-
-  EXPECT_THAT(output->shape(), ::testing::ContainerEq(output_shape));
-
-  const float *output_ptr = output->data<float>();
-  int size = output->size();
-  for (int i = 0; i < size; ++i) {
-    ASSERT_EQ(output_data[i], output_ptr[i]) << "With Index " << i;
-  }
-
-  // Reverse reorganzie
-  const std::vector<int> in_shape(input_shape.begin(), input_shape.end());
-  OpDefBuilder("ReOrganize", "ReOrganizeTest")
-      .Input("Input")
-      .Output("Output")
-      .AddIntsArg("shape", in_shape)
-      .Finalize(net.NewOperatorDef());
-
-  // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>("Input",
-                                                output_shape, output_data);
-
-  // Run
-  net.RunOp();
-
-  output = net.GetTensor("Output");
-
-  EXPECT_THAT(output->shape(), ::testing::ContainerEq(input_shape));
-
-  output_ptr = output->data<float>();
-  size = output->size();
-  for (int i = 0; i < size; ++i) {
-    ASSERT_EQ(input_data[i], output_ptr[i]) << "With Index " << i;
-  }
-}
-}  // namespace
-
-TEST_F(ReOrganizeTest, Simple) {
-  TestReOrganize({1, 1, 4, 6},
-                 {0, 4, 8, 12, 16, 20,
-                  1, 5, 9, 13, 17, 21,
-                  2, 6, 10, 14, 18, 22,
-                  3, 7, 11, 15, 19, 23},
-                 {1, 1, 8, 3},
-                 {0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19,
-                  4, 12, 20, 5, 13, 21, 6, 14, 22, 7, 15, 23});
-  TestReOrganize({1, 1, 5, 6},
-                 {0, 5, 10, 15, 20, 25,
-                  1, 6, 11, 16, 21, 26,
-                  2, 7, 12, 17, 22, 27,
-                  3, 8, 13, 18, 23, 28,
-                  4, 9, 14, 19, 24, 29},
-                 {1, 1, 10, 3},
-                 {0, 10, 20, 1, 11, 21, 2, 12, 22, 3, 13, 23,
-                  4, 14, 24, 5, 15, 25, 6, 16, 26, 7, 17, 27,
-                  8, 18, 28, 9, 19, 29});
-}
-
-TEST_F(ReOrganizeTest, Complex) {
-  TestReOrganize({1, 2, 2, 6},
-                 {0, 4, 8, 12, 16, 20,
-                  1, 5, 9, 13, 17, 21,
-                  2, 6, 10, 14, 18, 22,
-                  3, 7, 11, 15, 19, 23},
-                 {1, 2, 6, 2},
-                 {0, 12, 1, 13, 4, 16, 5, 17, 8, 20, 9, 21,
-                  2, 14, 3, 15, 6, 18, 7, 19, 10, 22, 11, 23});
-}
-
-}  // namespace test
-}  // namespace ops
-}  // namespace mace
--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -26,16 +26,16 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    ResizeBilinearOp<DeviceType::OPENCL, float>);
+                    ResizeBilinearOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    ResizeBilinearOp<DeviceType::OPENCL, half>);
+                    ResizeBilinearOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/resize_bilinear_benchmark.cc
+++ b/mace/ops/resize_bilinear_benchmark.cc
@@ -38,7 +38,7 @@ void ResizeBilinearBenchmark(int iters,
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input",
                                 {batch, channels, input_height, input_width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input",
                                 {batch, input_height, input_width, channels});
  } else {
@@ -55,7 +55,7 @@ void ResizeBilinearBenchmark(int iters,
      .AddIntsArg("size", {output_height, output_width})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
@@ -99,8 +99,8 @@ void ResizeBilinearBenchmark(int iters,

 #define BM_RESIZE_BILINEAR(N, C, H0, W0, H1, W1)                 \
  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, CPU);    \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, OPENCL); \
-  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, OPENCL);
+  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, float, GPU); \
+  BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, half, GPU);

 BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480);


--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -132,7 +132,7 @@ void TestRandomResizeBilinear() {
    Tensor expected;
    expected.Copy(*net.GetOutput("Output"));

-    if (D == DeviceType::OPENCL) {
+    if (D == DeviceType::GPU) {
      BufferToImage<D, float>(&net, "Input", "InputImage",
                              kernels::BufferType::IN_OUT_CHANNEL);

@@ -147,8 +147,6 @@ void TestRandomResizeBilinear() {

      ImageToBuffer<D, float>(&net, "OutputImage", "DeviceOutput",
                              kernels::BufferType::IN_OUT_CHANNEL);
-    } else {
-      // TODO(someone): support NEON
    }
    // Check
    ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"),
@@ -158,7 +156,7 @@ void TestRandomResizeBilinear() {
 }  // namespace

 TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
-  TestRandomResizeBilinear<DeviceType::OPENCL>();
+  TestRandomResizeBilinear<DeviceType::GPU>();
 }

 }  // namespace test

--- a/mace/ops/slice.cc
+++ b/mace/ops/slice.cc
@@ -26,16 +26,16 @@ void Register_Slice(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    SliceOp<DeviceType::OPENCL, float>);
+                    SliceOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    SliceOp<DeviceType::OPENCL, half>);
+                    SliceOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/slice_benchmark.cc
+++ b/mace/ops/slice_benchmark.cc
@@ -38,7 +38,7 @@ void BMSliceHelper(int iters,
  GenerateRandomRealTypeData(input_shape, &input_data);
  net.AddInputFromArray<D, float>("Input", input_shape, input_data);

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);

@@ -85,8 +85,8 @@ void BMSliceHelper(int iters,

 #define BM_SLICE(N, H, W, C, NO)                 \
  BM_SLICE_MACRO(N, H, W, C, NO, float, CPU);    \
-  BM_SLICE_MACRO(N, H, W, C, NO, float, OPENCL); \
-  BM_SLICE_MACRO(N, H, W, C, NO, half, OPENCL);
+  BM_SLICE_MACRO(N, H, W, C, NO, float, GPU); \
+  BM_SLICE_MACRO(N, H, W, C, NO, half, GPU);

 BM_SLICE(1, 32, 32, 32, 2);
 BM_SLICE(1, 32, 32, 128, 2);

--- a/mace/ops/slice_test.cc
+++ b/mace/ops/slice_test.cc
@@ -51,7 +51,7 @@ void RandomTest(const int num_outputs, const int axis) {
  GenerateRandomRealTypeData(input_shape, &input_data);
  net.AddInputFromArray<D, float>("Input", input_shape, input_data);

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    BufferToImage<D, T>(&net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);

@@ -75,7 +75,7 @@ void RandomTest(const int num_outputs, const int axis) {
  // Run
  net.RunOp(D);

-  if (D == DeviceType::OPENCL) {
+  if (D == DeviceType::GPU) {
    for (int i = 0; i < num_outputs; ++i) {
      ImageToBuffer<D, float>(&net,
                              MakeString("OutputImage", i),
@@ -130,15 +130,15 @@ TEST_F(SliceOpTest, CPUAxis1) {
 }

 TEST_F(SliceOpTest, OPENCLFloat) {
-  RandomTest<DeviceType::OPENCL, float>(2, 3);
-  RandomTest<DeviceType::OPENCL, float>(4, 3);
-  RandomTest<DeviceType::OPENCL, float>(11, 3);
+  RandomTest<DeviceType::GPU, float>(2, 3);
+  RandomTest<DeviceType::GPU, float>(4, 3);
+  RandomTest<DeviceType::GPU, float>(11, 3);
 }

 TEST_F(SliceOpTest, OPENCLHalf) {
-  RandomTest<DeviceType::OPENCL, half>(2, 3);
-  RandomTest<DeviceType::OPENCL, half>(4, 3);
-  RandomTest<DeviceType::OPENCL, half>(11, 3);
+  RandomTest<DeviceType::GPU, half>(2, 3);
+  RandomTest<DeviceType::GPU, half>(4, 3);
+  RandomTest<DeviceType::GPU, half>(11, 3);
 }

 }  // namespace test

--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -26,16 +26,16 @@ void Register_Softmax(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    SoftmaxOp<DeviceType::OPENCL, float>);
+                    SoftmaxOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    SoftmaxOp<DeviceType::OPENCL, half>);
+                    SoftmaxOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/softmax_benchmark.cc
+++ b/mace/ops/softmax_benchmark.cc
@@ -33,7 +33,7 @@ void SoftmaxBenchmark(
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -44,7 +44,7 @@ void SoftmaxBenchmark(
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -82,8 +82,8 @@ void SoftmaxBenchmark(

 #define BM_SOFTMAX(N, C, H, W)                 \
  BM_SOFTMAX_MACRO(N, C, H, W, float, CPU);    \
-  BM_SOFTMAX_MACRO(N, C, H, W, float, OPENCL); \
-  BM_SOFTMAX_MACRO(N, C, H, W, half, OPENCL);
+  BM_SOFTMAX_MACRO(N, C, H, W, float, GPU); \
+  BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);

 BM_SOFTMAX(1, 2, 512, 512);
 BM_SOFTMAX(1, 3, 512, 512);

--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -40,7 +40,7 @@ void Simple() {
    // Run
    net.RunOp(D);
    net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -68,7 +68,7 @@ void Simple() {
 }  // namespace

 TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
-TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
+TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::GPU>(); }

 namespace {
 template<DeviceType D>
@@ -114,18 +114,18 @@ void Complex(const std::vector<index_t> &logits_shape) {
 }  // namespace

 TEST_F(SoftmaxOpTest, OPENCLAligned) {
-  Complex<DeviceType::OPENCL>({1, 256, 256, 3});
-  Complex<DeviceType::OPENCL>({1, 128, 128, 16});
+  Complex<DeviceType::GPU>({1, 256, 256, 3});
+  Complex<DeviceType::GPU>({1, 128, 128, 16});
 }

 TEST_F(SoftmaxOpTest, OPENCLMulBatchAligned) {
-  Complex<DeviceType::OPENCL>({5, 64, 64, 3});
-  Complex<DeviceType::OPENCL>({8, 128, 128, 8});
+  Complex<DeviceType::GPU>({5, 64, 64, 3});
+  Complex<DeviceType::GPU>({8, 128, 128, 8});
 }

 TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
-  Complex<DeviceType::OPENCL>({1, 113, 107, 13});
-  Complex<DeviceType::OPENCL>({5, 211, 107, 1});
+  Complex<DeviceType::GPU>({1, 113, 107, 13});
+  Complex<DeviceType::GPU>({5, 211, 107, 1});
 }

 }  // namespace test

--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -20,16 +20,16 @@ namespace ops {
 void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    SpaceToBatchNDOp<DeviceType::OPENCL, float>);
+                    SpaceToBatchNDOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    SpaceToBatchNDOp<DeviceType::OPENCL, half>);
+                    SpaceToBatchNDOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -65,7 +65,7 @@ void BMSpaceToBatch(
      BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)

 #define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
-  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, OPENCL);
+  BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);

 BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
 BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);

--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -85,7 +85,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
                                const std::vector<index_t> &batch_shape,
                                const std::vector<float> &batch_data) {
  auto space_tensor = std::unique_ptr<Tensor>(new Tensor(
-      GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<T>::v()));
+      GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
  space_tensor->Resize(space_shape);
  {
    Tensor::MappingGuard space_mapper(space_tensor.get());
@@ -97,7 +97,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
  }

  auto batch_tensor = std::unique_ptr<Tensor>(new Tensor(
-      GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<T>::v()));
+      GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
  batch_tensor->Resize(batch_shape);
  {
    Tensor::MappingGuard batch_mapper(batch_tensor.get());
@@ -106,10 +106,10 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
    memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T));
  }

-  RunSpaceToBatch<DeviceType::OPENCL>(space_shape, space_data, block_data,
+  RunSpaceToBatch<DeviceType::GPU>(space_shape, space_data, block_data,
                                      padding_data, batch_tensor.get());

-  RunBatchToSpace<DeviceType::OPENCL>(batch_shape, batch_data, block_data,
+  RunBatchToSpace<DeviceType::GPU>(batch_shape, batch_data, block_data,
                                      padding_data, space_tensor.get());
 }
 }  // namespace

--- a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -26,16 +26,16 @@ void Register_SpaceToDepth(OperatorRegistry *op_registry) {

 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    SpaceToDepthOp<DeviceType::OPENCL, float>);
+                    SpaceToDepthOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    SpaceToDepthOp<DeviceType::OPENCL, half>);
+                    SpaceToDepthOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/space_to_depth.h
+++ b/mace/ops/space_to_depth.h
@@ -45,7 +45,7 @@ class SpaceToDepthOp : public Operator<D, T> {
      input_height = input->dim(2);
      input_width = input->dim(3);
      input_depth = input->dim(1);
-    } else if (D == OPENCL) {
+    } else if (D == GPU) {
      input_height = input->dim(1);
      input_width = input->dim(2);
      input_depth = input->dim(3);

--- a/mace/ops/space_to_depth_benchmark.cc
+++ b/mace/ops/space_to_depth_benchmark.cc
@@ -31,7 +31,7 @@ void SpaceToDepth(
  // Add input data
  if (D == DeviceType::CPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
  } else {
    MACE_NOT_IMPLEMENTED;
@@ -42,7 +42,7 @@ void SpaceToDepth(
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
-  } else if (D == DeviceType::OPENCL) {
+  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);

@@ -82,8 +82,8 @@ void SpaceToDepth(

 #define BM_SPACE_TO_DEPTH(N, C, H, W, G)                 \
  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, CPU);    \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, OPENCL); \
-  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, OPENCL);
+  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, float, GPU); \
+  BM_SPACE_TO_DEPTH_MACRO(N, C, H, W, G, half, GPU);

 BM_SPACE_TO_DEPTH(1, 64, 64, 64, 4);
 BM_SPACE_TO_DEPTH(1, 64, 128, 128, 4);

--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -147,23 +147,23 @@ void WinogradConvolution(const index_t batch,
 }  // namespace

 TEST_F(WinogradConvlutionTest, AlignedConvolution) {
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
+  WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
                                                 Padding::VALID);
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
+  WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 32, 16,
                                                 Padding::SAME);
 }

 TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37,
+  WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
                                                 Padding::VALID);
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31,
+  WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
                                                 Padding::SAME);
 }

 TEST_F(WinogradConvlutionTest, BatchConvolution) {
-  WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32,
+  WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
                                                 Padding::VALID);
-  WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31,
+  WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
                                                 Padding::SAME);
 }


--- a/mace/ops/winograd_inverse_transform.cc
+++ b/mace/ops/winograd_inverse_transform.cc
@@ -20,16 +20,16 @@ namespace ops {
 void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    WinogradInverseTransformOp<DeviceType::OPENCL, float>);
+                    WinogradInverseTransformOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    WinogradInverseTransformOp<DeviceType::OPENCL, half>);
+                    WinogradInverseTransformOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/winograd_transform.cc
+++ b/mace/ops/winograd_transform.cc
@@ -20,16 +20,16 @@ namespace ops {
 void Register_WinogradTransform(OperatorRegistry *op_registry) {
 #ifdef MACE_ENABLE_OPENCL
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<float>("T")
                                     .Build(),
-                    WinogradTransformOp<DeviceType::OPENCL, float>);
+                    WinogradTransformOp<DeviceType::GPU, float>);

  REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
-                                     .Device(DeviceType::OPENCL)
+                                     .Device(DeviceType::GPU)
                                     .TypeConstraint<half>("T")
                                     .Build(),
-                    WinogradTransformOp<DeviceType::OPENCL, half>);
+                    WinogradTransformOp<DeviceType::GPU, half>);
 #endif  // MACE_ENABLE_OPENCL
 }


--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -62,7 +62,7 @@ void BMWinogradTransform(
  BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

 #define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
+  BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU);

 BM_WINOGRAD_TRANSFORM(1, 16, 16, 128);
 BM_WINOGRAD_TRANSFORM(1, 64, 64, 128);
@@ -116,7 +116,7 @@ void BMWinogradInverseTransform(
      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)

 #define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
-  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
+  BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU);

 BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
 BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);

--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -9,8 +9,7 @@ enum NetMode {

 enum DeviceType {
  CPU    = 0;                    // In default, we will use CPU.
-  NEON   = 1;
-  OPENCL = 2;
+  GPU    = 2;
 }

 enum DataType {

--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -28,7 +28,7 @@ namespace mace {

 const char *MaceVersion();

-enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };
+enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };

 enum MaceStatus { MACE_SUCCESS = 0, MACE_INVALID_ARGS = 1 };


--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -475,8 +475,6 @@ class CaffeConverter(object):
                self.ops_map[final_op.name].children[0].type \
                in activation_name_map:
            activation_op = self.ops_map[final_op.name].children[0]
-            if not is_depthwise:
-                op_def.type = "FusedConv2D"
            fused_act_arg = op_def.arg.add()
            fused_act_arg.name = 'activation'
            fused_act_arg.s = activation_name_map[activation_op.type]
@@ -984,15 +982,10 @@ class CaffeConverter(object):
        self.resolved_ops.add(op.name)

    def convert_reshape(self, op):
-        if self.device == 'cpu':
-            op_def = self.CommonConvert(op, 'Reshape')
-        else:
-            op_def = self.CommonConvert(op, 'ReOrganize')
+        op_def = self.CommonConvert(op, 'Reshape')
        input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
        output_shape = input_shape
        shape_param = np.asarray(op.layer.reshape_param.shape.dim)
-        if self.device != 'cpu':
-            shape_param = shape_param[[0, 3, 1, 2]]
        for i in range(len(shape_param)):
            if shape_param[i] != 0:
                output_shape[i] = shape_param[i]

--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -508,8 +508,6 @@ class TFConverter(object):
        if len(self.tf_graph.get(final_op.name, [])) == 1 and \
           self.tf_graph[final_op.name][0].type in activation_name_map:
            activation_op = self.tf_graph[final_op.name][0]
-            if op_def.type == "Conv2D":
-                op_def.type = "FusedConv2D"
            fused_act_arg = op_def.arg.add()
            fused_act_arg.name = 'activation'
            fused_act_arg.s = activation_name_map[activation_op.type]
@@ -958,14 +956,17 @@ class TFConverter(object):
        conv_op = self.tf_graph[op.name][0]
        op_def.name = conv_op.name
        op_def.type = conv_op.type
-        self.transpose_filter_tensor[get_input_tensor(conv_op,
-                                                      1).name] = (0, 1, 3, 2)
+
        if self.device == 'gpu':
+            self.transpose_filter_tensor[
+                get_input_tensor(conv_op, 1).name] = (0, 1, 3, 2)
            op_def.input.extend([op.inputs[0].name])
            output_name = self.add_buffer_to_image(
                get_input_tensor(conv_op, 1).name, "CONV2D_FILTER")
            op_def.input.extend([output_name])
        else:
+            self.transpose_filter_tensor[
+                get_input_tensor(conv_op, 1).name] = (3, 2, 0, 1)
            op_def.input.extend([get_input_tensor(op, 0).name])
            op_def.input.extend([get_input_tensor(conv_op, 1).name])

@@ -1020,7 +1021,6 @@ class TFConverter(object):
        if len(self.tf_graph[final_op.name]) == 1 and \
                self.tf_graph[final_op.name][0].type == 'Relu':
            relu_op = self.tf_graph[final_op.name][0]
-            op_def.type = "FusedConv2D"
            fused_relu_arg = op_def.arg.add()
            fused_relu_arg.name = 'activation'
            fused_relu_arg.s = "RELU"
@@ -1092,8 +1092,12 @@ class TFConverter(object):
        op_def.output.extend([output.name for output in op.outputs])
        paddings_arg = op_def.arg.add()
        paddings_arg.name = 'paddings'
-        paddings_arg.ints.extend(
-            get_input_tensor(op, 1).eval().astype(np.int32).flat)
+        if self.device == 'gpu':
+            paddings_value = get_input_tensor(op, 1).eval().astype(np.int32)
+        else:
+            paddings_value = get_input_tensor(op, 1).eval().astype(np.int32)
+            paddings_value = paddings_value[[0, 3, 1, 2]]
+        paddings_arg.ints.extend(paddings_value.flat)
        self.unused_tensor.add(get_input_tensor(op, 1).name)
        if len(op.inputs) == 3:
            constant_value_arg = op_def.arg.add()

--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -248,7 +248,7 @@ void MaceRun(const int in_out_size,
  std::string filter_tensor_name = "filter";
  std::string filter_tensor_img_name = filter_tensor_name + "_image";

-  const DeviceType device = DeviceType::OPENCL;
+  const DeviceType device = DeviceType::GPU;

  NetDef net_def;

@@ -300,7 +300,7 @@ void MaceRun(const int in_out_size,
    }
  }

-  CheckOutputs<DeviceType::OPENCL, T>(net_def, inputs, outputs);
+  CheckOutputs<DeviceType::GPU, T>(net_def, inputs, outputs);
 }

 }  // namespace

--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -22,7 +22,7 @@
 *          --input_file=input_data \
 *          --output_file=mace.out  \
 *          --model_data_file=model_data.data \
- *          --device=OPENCL
+ *          --device=GPU
 */
 #include <malloc.h>
 #include <stdint.h>
@@ -108,10 +108,8 @@ std::string FormatName(const std::string input) {
 DeviceType ParseDeviceType(const std::string &device_str) {
  if (device_str.compare("CPU") == 0) {
    return DeviceType::CPU;
-  } else if (device_str.compare("NEON") == 0) {
-    return DeviceType::NEON;
-  } else if (device_str.compare("OPENCL") == 0) {
-    return DeviceType::OPENCL;
+  } else if (device_str.compare("GPU") == 0) {
+    return DeviceType::GPU;
  } else if (device_str.compare("HEXAGON") == 0) {
    return DeviceType::HEXAGON;
  } else {
@@ -203,7 +201,7 @@ DEFINE_string(output_file,
 DEFINE_string(model_data_file,
              "",
              "model data file name, used when EMBED_MODEL_DATA set to 0");
-DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
+DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
 DEFINE_int32(round, 1, "round");
 DEFINE_int32(restart_round, 1, "restart round");
 DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
@@ -234,7 +232,7 @@ bool RunModel(const std::vector<std::string> &input_names,
      FLAGS_omp_num_threads,
      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
 #ifdef MACE_ENABLE_OPENCL
-  if (device_type == DeviceType::OPENCL) {
+  if (device_type == DeviceType::GPU) {
    mace::SetGPUHints(
        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
@@ -252,7 +250,7 @@ bool RunModel(const std::vector<std::string> &input_names,
      new FileStorageFactory(kernel_file_path));
  SetKVStorageFactory(storage_factory);
  mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
-  if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
+  if (device_type == DeviceType::GPU || device_type == DeviceType::HEXAGON) {
    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
  }
  int64_t t2 = NowMicros();
@@ -329,7 +327,7 @@ bool RunModel(const std::vector<std::string> &input_names,
         mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis);

 #ifdef MACE_ENABLE_OPENCL
-  if (device_type == DeviceType::OPENCL) {
+  if (device_type == DeviceType::GPU) {
    WriteOpenCLPlatformInfo(kernel_file_path);
  }
 #endif  // MACE_ENABLE_OPENCL

--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -69,7 +69,7 @@ def get_data_and_device_type(runtime):
        device_type = "HEXAGON"
    elif runtime == "gpu":
        data_type = "DT_HALF"
-        device_type = "OPENCL"
+        device_type = "GPU"
    elif runtime == "cpu":
        data_type = "DT_FLOAT"
        device_type = "CPU"