Feature: Remove the BufferTransform ops of GPU initialization phase.

1. Remove the BufferTransform ops of GPU initialization phase. 2. Add tranformer to tranform the memory type between CPU and GPU.

Feature: Remove the BufferTransform ops of GPU initialization phase.
1. Remove the BufferTransform ops of GPU initialization phase. 2. Add tranformer to tranform the memory type between CPU and GPU.
19dcf2c3 · liuqi · 4983fcb2 · 19dcf2c3 · 19dcf2c3 · 19dcf2c3
26 changed file
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -77,9 +77,12 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
 }

 MaceStatus SerialNet::Init() {
-  // TODO(liuqi): where to do memory reuse.
  MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
  OpInitContext init_context(ws_);
+  // TODO(liuqi): where to do memory reuse.
+  if (target_device_->device_type() == DeviceType::GPU) {
+
+  }
  for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
    auto &op = *iter;
    DeviceType device_type = op->device_type();

--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -131,14 +131,14 @@ class Operation {
  }

  inline void set_debug_def(
-      const std::shared_ptr<const OperatorDef> &operator_def) {
+      const std::shared_ptr<OperatorDef> &operator_def) {
    operator_def_ = operator_def;
  }

  inline bool has_debug_def() const { return operator_def_ != nullptr; }

 protected:
-  std::shared_ptr<const OperatorDef> operator_def_;
+  std::shared_ptr<OperatorDef> operator_def_;
  std::vector<const Tensor *> inputs_;
  std::vector<Tensor *> outputs_;


--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -222,6 +222,17 @@ class Tensor {
    return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
  }

+  inline MemoryType memory_type() const {
+    MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty" );
+    if (buffer_->OnHost()) {
+      return MemoryType::CPU_BUFFER;
+    } else if (typeid(*buffer_) == typeid(Image)) {
+      return MemoryType::GPU_IMAGE;
+    } else {
+      return MemoryType::GPU_BUFFER;
+    }
+  }
+
 #ifdef MACE_ENABLE_OPENCL
  inline cl::Image *opencl_image() const {
    MACE_CHECK(has_opencl_image(), name_, " do not have image");

--- a/mace/core/transformer.h
+++ b/mace/core/transformer.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_TRANSFORMER_H_
+#define MACE_CORE_TRANSFORMER_H_
+
+#include "mace/proto/mace.pb.h"
+
+namespace mace {
+
+class TransformerBase {
+ public:
+  // Construct transform operation.
+  virtual std::vector<std::unique_ptr<OperatorDef>> ConstructTranformOp(
+      OperatorDef *op_def,
+      bool transform_filter = true) = 0;
+};
+
+}  // namespace mace
+
+#endif  // MACE_CORE_TRANSFORMER_H_
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -31,7 +31,7 @@ cc_library(
            "ops_registry.cc",
            "ops_test_util.cc",
            "buffer_inverse_transform.cc",
-            "buffer_transform.cc",
+            "buffer_transformer.cc",
            "lstm_cell.cc",
            "winograd_transform.cc",
            "quantize.cc",
@@ -42,7 +42,7 @@ cc_library(
            "opencl/image/*.cc",
            "opencl/buffer/*.cc",
            "buffer_inverse_transform.cc",
-            "buffer_transform.cc",
+            "buffer_transformer.cc",
            "lstm_cell.cc",
            "winograd_transform.cc",
        ],

--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -19,6 +19,7 @@
 #include "mace/core/operator.h"

 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/activation.h"
 #endif  // MACE_ENABLE_OPENCL

@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
                                              "NOOP"));
    auto relux_max_limit = static_cast<T>(
        Operation::GetOptionalArg<float>("max_limit", 0.0f));
+    MemoryType mem_type;
    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
      kernel_.reset(
          new opencl::image::ActivationKernel<T>(type, relux_max_limit));
    } else {
      MACE_NOT_IMPLEMENTED;
    }
+    if (type == ActivationType::PRELU) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
  }
  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(0);

--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -237,12 +237,10 @@ void TestSimplePrelu() {
  if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            ops::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
-                            ops::BufferType::ARGUMENT);

    OpDefBuilder("Activation", "PreluTest")
        .Input("InputImage")
-        .Input("AlphaImage")
+        .Input("Alpha")
        .Output("OutputImage")
        .AddStringArg("activation", "PRELU")
        .Finalize(net.NewOperatorDef());

--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -19,6 +19,7 @@
 #include "mace/core/operator.h"
 #include "mace/ops/activation.h"
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/batch_norm.h"
 #endif  // MACE_ENABLE_OPENCL

@@ -147,12 +148,25 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
    ActivationType activation = ops::StringToActivationType(
        Operation::GetOptionalArg<std::string>("activation", "NOOP"));
    float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
+    MemoryType mem_type;
    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
      kernel_.reset(new opencl::image::BatchNormKernel<T>(
          epsilon, activation, relux_max_limit));
    } else {
      MACE_NOT_IMPLEMENTED;
    }
+    // Transform filters
+    int input_size = operator_def_->input_size();
+    for (int i = 0; i < input_size; ++i) {
+      const Tensor *input_tensor = context->workspace()->GetTensor(
+          operator_def_->input(i));
+      if (input_tensor != nullptr && input_tensor->is_weight()) {
+        MACE_CHECK(TransformFilter<T>(
+            context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type)
+                       == MaceStatus::MACE_SUCCESS);
+      }
+    }
  }
  MaceStatus Run(OpContext *context) override {
    bool not_folded = this->InputSize() == 5;

--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -19,6 +19,7 @@
 #include "mace/core/operator.h"
 #include "mace/ops/activation.h"
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/bias_add.h"
 #endif  // MACE_ENABLE_OPENCL

@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
      : Operation(context),
        data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
            "data_format", NHWC))) {
+    MemoryType mem_type;
    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
      kernel_.reset(new opencl::image::BiasAddKernel<T>);
    } else {
      MACE_NOT_IMPLEMENTED;
    }
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
  }
  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(0);

--- a/mace/ops/buffer_inverse_transform.cc
+++ b/mace/ops/buffer_inverse_transform.cc
@@ -51,7 +51,7 @@ class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {

 private:
  const int wino_blk_size_;
-  std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
+  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
 };



--- a/mace/ops/buffer_transform.cc
+++ b/mace/ops/buffer_transform.cc
@@ -15,8 +15,7 @@
 #include <memory>

 #include "mace/core/operator.h"
-#include "mace/ops/opencl/buffer/buffer_transform.h"
-#include "mace/ops/opencl/image/buffer_to_image.h"
+#include "mace/ops/opencl/buffer_transformer.h"

 namespace mace {
 namespace ops {
@@ -29,12 +28,16 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
 public:
  explicit BufferTransformOp(OpConstructContext *context)
      : Operation(context),
-        wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
+        wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)),
+        out_mem_type_(MemoryType::GPU_BUFFER),
+        transformer_(nullptr) {
+    MemoryType in_mem_type = context->workspace()->GetTensor(
+        operator_def_->input(0))->memory_type();
    if (context->device()->opencl_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::BufferToImage<T>);
-    } else {
-      kernel_.reset(new opencl::buffer::BufferTransform<T>);
+      out_mem_type_ = MemoryType::GPU_IMAGE;
    }
+    transformer_.reset(new OpenCLBufferTransformer<T>(in_mem_type,
+                                                      out_mem_type_));
  }

  MaceStatus Run(OpContext *context) override {
@@ -45,13 +48,14 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
        static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
            "buffer_type", static_cast<int>(ops::CONV2D_FILTER)));

-    return kernel_->Compute(context, input, type,
-                            wino_blk_size_, output);
+    return transformer_->Transform(
+        context, input, type, wino_blk_size_, out_mem_type_, output);
  }

 private:
  const int wino_blk_size_;
-  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
+  MemoryType out_mem_type_;
+  std::unique_ptr<OpenCLBufferTransformer<T>> transformer_;
 };



--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -38,8 +38,9 @@
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
-#include "mace/ops/opencl/image/conv_2d.h"
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/buffer/conv_2d.h"
+#include "mace/ops/opencl/image/conv_2d.h"
 #endif  // MACE_ENABLE_OPENCL

 namespace mace {
@@ -960,11 +961,23 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
            Operation::GetOptionalArg<std::string>("activation",
                                                  "NOOP"))),
        relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {
+    MemoryType mem_type;
    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
      kernel_.reset(new opencl::image::Conv2dKernel<T>);
    } else {
+      mem_type = MemoryType::GPU_BUFFER;
      kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
    }
+    // Transform filter tensor to target format
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::CONV2D_FILTER, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
+    if (operator_def_->input_size() > 2) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
  }
  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -34,8 +34,9 @@
 #include "mace/public/mace.h"
 #include "mace/utils/quantize.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/ops/opencl/image/depthwise_conv2d.h"
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/buffer/depthwise_conv2d.h"
+#include "mace/ops/opencl/image/depthwise_conv2d.h"
 #endif  // MACE_ENABLE_OPENCL

 namespace mace {
@@ -490,11 +491,23 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
 public:
  explicit DepthwiseConv2dOp(OpConstructContext *context)
      : DepthwiseConv2dOpBase(context) {
+    MemoryType mem_type;
    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
      kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
    } else {
+      mem_type = MemoryType::GPU_BUFFER;
      kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
    }
+    // Transform filter tensor to target format
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::DW_CONV2D_FILTER, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
+    if (operator_def_->input_size() > 2) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
  }
  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -26,6 +26,7 @@
 #include "mace/core/tensor.h"
 #include "mace/utils/quantize.h"
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/eltwise.h"
 #endif  // MACE_ENABLE_OPENCL

@@ -1086,12 +1087,25 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
    float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0);
    int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
            "scalar_input_index", 1);
+    MemoryType mem_type;
    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
      kernel_.reset(new opencl::image::EltwiseKernel<T>(
          type, coeff, scalar_input, scalar_input_index));
    } else {
      MACE_NOT_IMPLEMENTED;
    }
+    // Transform filters
+    int input_size = operator_def_->input_size();
+    for (int i = 0; i < input_size; ++i) {
+      const Tensor *input_tensor = context->workspace()->GetTensor(
+          operator_def_->input(i));
+      if (input_tensor != nullptr && input_tensor->is_weight()) {
+        MACE_CHECK(TransformFilter<T>(
+            context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type)
+                       == MaceStatus::MACE_SUCCESS);
+      }
+    }
  }
  MaceStatus Run(OpContext *context) override {
    const Tensor *input0 = this->Input(0);

--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -27,6 +27,7 @@
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/fully_connected.h"
 #endif  // MACE_ENABLE_OPENCL

@@ -192,11 +193,22 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
 public:
  explicit FullyConnectedOp(OpConstructContext *context)
      : FullyConnectedOpBase(context) {
+    MemoryType mem_type;
    if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
      kernel_.reset(new opencl::image::FullyConnectedKernel<T>);
    } else {
      MACE_NOT_IMPLEMENTED;
    }
+    // Transform filter tensor to target format
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::WEIGHT_WIDTH, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
+    if (operator_def_->input_size() > 2) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
  }
  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/opencl/buffer/buffer_inverse_transform.h
+++ b/mace/ops/opencl/buffer/buffer_inverse_transform.h
@@ -15,7 +15,7 @@
 #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
 #define MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_

-#include "mace/ops/opencl/buffer_inverse_transform.h"
+#include "mace/ops/opencl/buffer_transformer.h"

 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
@@ -34,7 +34,7 @@ MaceStatus BufferTypeTransform(
    Tensor *output);

 template <typename T>
-class BufferInverseTransform: public OpenCLBufferInverseTransformKernel {
+class BufferInverseTransform: public OpenCLBufferTransformKernel {
 public:
  MaceStatus Compute(OpContext *context,
                     const Tensor *input,

--- a/mace/ops/opencl/buffer/buffer_transform.h
+++ b/mace/ops/opencl/buffer/buffer_transform.h
@@ -15,7 +15,7 @@
 #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
 #define MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_

-#include "mace/ops/opencl/buffer_transform.h"
+#include "mace/ops/opencl/buffer_transform_kernel.h"

 #include <vector>


--- a/mace/ops/opencl/buffer_transform.h
+++ b/mace/ops/opencl/buffer_transform.h
@@ -12,18 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
-#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_
+#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_

 #include "mace/ops/opencl/common.h"
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"

 namespace mace {
-
 class OpContext;
 class Tensor;
-
 namespace ops {
 class OpenCLBufferTransformKernel {
 public:
@@ -32,10 +30,9 @@ class OpenCLBufferTransformKernel {
                             const BufferType type,
                             const int wino_blk_size,
                             Tensor *output) = 0;
-  MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel)
+ MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel)
 };
-
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_
--- a/mace/ops/opencl/buffer_transformer.cc
+++ b/mace/ops/opencl/buffer_transformer.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/opencl/buffer_transformer.h"
+
+namespace mace {
+namespace ops {
+
+std::string TransformedName(const std::string &name) {
+  // TODO(liuqi): This may create a conflict.
+  const char *postfix = "_mace_identity_transformed";
+  return name + postfix;
+}
+
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/opencl/buffer_transformer.h
+++ b/mace/ops/opencl/buffer_transformer.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_
+#define MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_
+
+#include "mace/core/operator.h"
+#include "mace/ops/opencl/common.h"
+#include "mace/ops/opencl/image/buffer_to_image.h"
+#include "mace/ops/opencl/image/image_to_buffer.h"
+#include "mace/ops/opencl/buffer/buffer_transform.h"
+
+namespace mace {
+namespace ops {
+// Only used for GPU Operation(BufferTransform)
+template <typename T>
+class OpenCLBufferTransformer {
+ public:
+  OpenCLBufferTransformer(const MemoryType in_mem_type,
+                         const MemoryType out_mem_type) {
+    if (out_mem_type == MemoryType::GPU_IMAGE) {
+      kernel_.reset(new opencl::image::BufferToImage<T>);
+    } else if (in_mem_type == MemoryType::GPU_IMAGE){
+      kernel_.reset(new opencl::image::ImageToBuffer<T>);
+    } else {
+      kernel_.reset(new opencl::buffer::BufferTransform<T>);
+    }
+  }
+
+  MaceStatus Transform(OpContext *context,
+                       const Tensor *input,
+                       const BufferType type,
+                       const int wino_blk_size,
+                       const MemoryType out_mem_type,
+                       Tensor *output) {
+    Workspace *ws = context->workspace();
+    DataType dt = DataTypeToEnum<T>::value;
+    MemoryType in_mem_type = input->memory_type();
+    if (out_mem_type == MemoryType::GPU_IMAGE ||
+        out_mem_type == MemoryType::GPU_BUFFER) {
+      if (in_mem_type != MemoryType::CPU_BUFFER) {
+        return kernel_->Compute(
+            context, input, type, wino_blk_size, output);
+      } else {
+        // convert to the GPU Buffer with the input's data type.
+        Tensor *internal_tensor = ws->CreateTensor(
+            InternalTransformedName(input->name()),
+            context->device()->allocator(), input->dtype());
+        output->Resize(input->shape());
+        const uint8_t *input_ptr = input->data<uint8_t>();
+        Tensor::MappingGuard guard(internal_tensor);
+        uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
+        memcpy(internal_ptr, input_ptr, input->raw_size());
+        // convert the internal GPU Buffer to output.
+        return kernel_->Compute(
+            context, internal_tensor, type, wino_blk_size, output);
+      }
+    } else {  // out_mem_type == MemoryType::CPU_BUFFER
+      // convert to the GPU Buffer with the output's data type.
+      Tensor internal_tensor(context->device()->allocator(),
+                             dt,
+                             false,
+                             InternalTransformedName(input->name()));
+      MACE_RETURN_IF_ERROR(kernel_->Compute(
+          context, input, type, wino_blk_size, &internal_tensor));
+      // convert the internal GPU Buffer to output.
+      Tensor::MappingGuard guard(&internal_tensor);
+      const T *internal_ptr = internal_tensor.data<T>();
+      output->Resize(internal_tensor.shape());
+      T *output_ptr = output->mutable_data<T>();
+      memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
+      return MaceStatus::MACE_SUCCESS;
+    }
+  }
+
+ private:
+  std::string InternalTransformedName(const std::string &name) {
+    // TODO(liuqi): This may create a conflict.
+    const char *postfix = "_mace_identity_internal";
+    return name + postfix;
+  }
+
+ private:
+  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
+};
+
+std::string TransformedName(const std::string &name);
+
+template <typename T>
+MaceStatus TransformFilter(
+    mace::OpConstructContext *context,
+    OperatorDef *op_def,
+    const int input_idx,
+    const BufferType buffer_type,
+    const MemoryType mem_type) {
+  const DataType dt = DataTypeToEnum<T>::value;
+  OpContext op_context(context->workspace(), context->device());
+  Workspace *ws = context->workspace();
+  std::string input_name = op_def->input(input_idx);
+  Tensor *input = ws->GetTensor(input_name);
+  std::string output_name = TransformedName(input_name);
+  Tensor *output =
+      ws->CreateTensor(output_name, context->device()->allocator(), dt);
+
+  // update the information
+  op_def->set_input(input_idx, output_name);
+  input->MarkUnused();
+  return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
+      Transform(&op_context, input, buffer_type, 0, mem_type, output);
+}
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_
--- a/mace/ops/opencl/cl/eltwise.cl
+++ b/mace/ops/opencl/cl/eltwise.cl
@@ -45,7 +45,7 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
  DATA_TYPE4 out;
 #if ELTWISE_TYPE == 0
  #ifdef COEFF_SUM
-    out = mad(coeff1, in0, mad(coeff0, in1, 0));
+    out = mad(coeff0, in0, mad(coeff1, in1, 0));
  #else
    out = in0 + in1;
  #endif

--- a/mace/ops/opencl/common.h
+++ b/mace/ops/opencl/common.h
@@ -29,7 +29,6 @@ enum BufferType {
  WEIGHT_HEIGHT = 7,
  WEIGHT_WIDTH = 8,
 };
-
 }  // namespace ops
 }  // namespace mace
 #endif  // MACE_OPS_OPENCL_COMMON_H_
--- a/mace/ops/opencl/image/buffer_to_image.h
+++ b/mace/ops/opencl/image/buffer_to_image.h
@@ -15,7 +15,7 @@
 #ifndef MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
 #define MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_

-#include "mace/ops/opencl/buffer_transform.h"
+#include "mace/ops/opencl/buffer_transform_kernel.h"

 #include <set>
 #include <string>

--- a/mace/ops/opencl/image/image_to_buffer.h
+++ b/mace/ops/opencl/image/image_to_buffer.h
@@ -20,7 +20,7 @@
 #include <vector>

 #include "mace/core/op_context.h"
-#include "mace/ops/opencl/buffer_inverse_transform.h"
+#include "mace/ops/opencl/buffer_transform_kernel.h"
 #include "mace/ops/opencl/helper.h"

 namespace mace {
@@ -29,7 +29,7 @@ namespace opencl {
 namespace image {

 template <typename T>
-class ImageToBuffer : public OpenCLBufferInverseTransformKernel {
+class ImageToBuffer : public OpenCLBufferTransformKernel {
 public:
  MaceStatus Compute(OpContext *context,
                     const Tensor *input,

--- a/mace/ops/transformer.cc
+++ b/mace/ops/transformer.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/transformer.h"
+
+#include <string>
+#include <memory>
+
+namespace mace {
+namespace ops {
+
+std::unique_ptr<OperatorDef> Transformer::DoTransform(
+    mace::OperatorDef *op_def,
+    const int input_idx,
+    const mace::DataType dt,
+    const BufferType buffer_type,
+    const MemoryType mem_type) {
+  int32_t device = op_def->device_type();
+  std::string input_name = op_def->input(input_idx);
+  std::string output_name = input_name + "_transformed";
+
+  op_def->set_input(input_idx, output_name);
+  std::unique_ptr<OperatorDef> op(new OperatorDef);
+  op->set_name(output_name);
+  op->set_type("BufferTransform");
+  op->add_input(input_name);
+  op->add_output(output_name);
+  Argument *arg = op->add_arg();
+  arg->set_name("buffer_type");
+  arg->set_i(static_cast<int32_t>(buffer_type));
+  arg = op->add_arg();
+  arg->set_name("mem_type");
+  arg->set_i(static_cast<int32_t>(mem_type));
+  arg = op->add_arg();
+  arg->set_name("T");
+  arg->set_i(static_cast<int32_t>(dt));
+  arg = op->add_arg();
+  arg->set_name("device");
+  arg->set_i(device);
+
+  return std::move(op);
+}
+
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/opencl/buffer_inverse_transform.h
+++ b/mace/ops/opencl/buffer_inverse_transform.h
@@ -12,30 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
-#define MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
+#ifndef MACE_KERNELS_TRANSFORMER_H_
+#define MACE_KERNELS_TRANSFORMER_H_

+#include "mace/core/transformer.h"
 #include "mace/ops/opencl/common.h"
-#include "mace/public/mace.h"
-#include "mace/utils/utils.h"

 namespace mace {
-
 class OpContext;
-class Tensor;
-
 namespace ops {
-class OpenCLBufferInverseTransformKernel {
+
+class Transformer : public TransformerBase {
 public:
-  virtual MaceStatus Compute(OpContext *context,
-                             const Tensor *input,
-                             const BufferType type,
-                             const int wino_blk_size,
-                             Tensor *output) = 0;
-  MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
+  // Transform source tensor to target.
+  std::vector<std::unique_ptr<OperatorDef>> ConstructTranformOp(
+      OperatorDef *op_def,
+      bool transform_filter = true) override;
+ private:
+  std::unique_ptr<OperatorDef> DoTransform(
+      mace::OperatorDef *op_def,
+      const int input_idx,
+      const mace::DataType dt,
+      const BufferType buffer_type,
+      const MemoryType mem_type);
 };

+
 }  // namespace ops
 }  // namespace mace

-#endif  // MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
+#endif  // MACE_KERNELS_TENSOR_TRANSFORMER_H_