From 19dcf2c3678a73cbbb21e34bf1895ca49447c81c Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Wed, 7 Nov 2018 11:02:13 +0800
Subject: [PATCH] Feature: Remove the BufferTransform ops of GPU initialization
 phase.

1. Remove the BufferTransform ops of GPU initialization phase.
2. Add tranformer to tranform the memory type between CPU and GPU.
---
 mace/core/net.cc                              |   5 +-
 mace/core/operator.h                          |   4 +-
 mace/core/tensor.h                            |  11 ++
 mace/core/transformer.h                       |  32 +++++
 mace/ops/BUILD                                |   4 +-
 mace/ops/activation.cc                        |   8 ++
 mace/ops/activation_test.cc                   |   4 +-
 mace/ops/batch_norm.cc                        |  14 ++
 mace/ops/bias_add.cc                          |   6 +
 mace/ops/buffer_inverse_transform.cc          |   2 +-
 mace/ops/buffer_transform.cc                  |  22 +--
 mace/ops/conv_2d.cc                           |  15 ++-
 mace/ops/depthwise_conv2d.cc                  |  15 ++-
 mace/ops/eltwise.cc                           |  14 ++
 mace/ops/fully_connected.cc                   |  12 ++
 .../opencl/buffer/buffer_inverse_transform.h  |   4 +-
 mace/ops/opencl/buffer/buffer_transform.h     |   2 +-
 ..._transform.h => buffer_transform_kernel.h} |  11 +-
 mace/ops/opencl/buffer_transformer.cc         |  27 ++++
 mace/ops/opencl/buffer_transformer.h          | 126 ++++++++++++++++++
 mace/ops/opencl/cl/eltwise.cl                 |   2 +-
 mace/ops/opencl/common.h                      |   1 -
 mace/ops/opencl/image/buffer_to_image.h       |   2 +-
 mace/ops/opencl/image/image_to_buffer.h       |   4 +-
 mace/ops/transformer.cc                       |  56 ++++++++
 ...ffer_inverse_transform.h => transformer.h} |  33 ++---
 26 files changed, 386 insertions(+), 50 deletions(-)
 create mode 100644 mace/core/transformer.h
 rename mace/ops/opencl/{buffer_transform.h => buffer_transform_kernel.h} (83%)
 create mode 100644 mace/ops/opencl/buffer_transformer.cc
 create mode 100644 mace/ops/opencl/buffer_transformer.h
 create mode 100644 mace/ops/transformer.cc
 rename mace/ops/{opencl/buffer_inverse_transform.h => transformer.h} (55%)
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 63ca5792..1fe5b0e9 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -77,9 +77,12 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
 }
 
 MaceStatus SerialNet::Init() {
-  // TODO(liuqi): where to do memory reuse.
   MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
   OpInitContext init_context(ws_);
+  // TODO(liuqi): where to do memory reuse.
+  if (target_device_->device_type() == DeviceType::GPU) {
+
+  }
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
     auto &op = *iter;
     DeviceType device_type = op->device_type();
diff --git a/mace/core/operator.h b/mace/core/operator.h
index c354afbd..7017240c 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -131,14 +131,14 @@ class Operation {
   }
 
   inline void set_debug_def(
-      const std::shared_ptr<const OperatorDef> &operator_def) {
+      const std::shared_ptr<OperatorDef> &operator_def) {
     operator_def_ = operator_def;
   }
 
   inline bool has_debug_def() const { return operator_def_ != nullptr; }
 
  protected:
-  std::shared_ptr<const OperatorDef> operator_def_;
+  std::shared_ptr<OperatorDef> operator_def_;
   std::vector<const Tensor *> inputs_;
   std::vector<Tensor *> outputs_;
 
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 7cf01043..f217bee4 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -222,6 +222,17 @@ class Tensor {
     return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
   }
 
+  inline MemoryType memory_type() const {
+    MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty" );
+    if (buffer_->OnHost()) {
+      return MemoryType::CPU_BUFFER;
+    } else if (typeid(*buffer_) == typeid(Image)) {
+      return MemoryType::GPU_IMAGE;
+    } else {
+      return MemoryType::GPU_BUFFER;
+    }
+  }
+
 #ifdef MACE_ENABLE_OPENCL
   inline cl::Image *opencl_image() const {
     MACE_CHECK(has_opencl_image(), name_, " do not have image");
diff --git a/mace/core/transformer.h b/mace/core/transformer.h
new file mode 100644
index 00000000..09f56009
--- /dev/null
+++ b/mace/core/transformer.h
@@ -0,0 +1,32 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_TRANSFORMER_H_
+#define MACE_CORE_TRANSFORMER_H_
+
+#include "mace/proto/mace.pb.h"
+
+namespace mace {
+
+class TransformerBase {
+ public:
+  // Construct transform operation.
+  virtual std::vector<std::unique_ptr<OperatorDef>> ConstructTranformOp(
+      OperatorDef *op_def,
+      bool transform_filter = true) = 0;
+};
+
+}  // namespace mace
+
+#endif  // MACE_CORE_TRANSFORMER_H_
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index 1e1efc89..5fb0683c 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -31,7 +31,7 @@ cc_library(
             "ops_registry.cc",
             "ops_test_util.cc",
             "buffer_inverse_transform.cc",
-            "buffer_transform.cc",
+            "buffer_transformer.cc",
             "lstm_cell.cc",
             "winograd_transform.cc",
             "quantize.cc",
@@ -42,7 +42,7 @@ cc_library(
             "opencl/image/*.cc",
             "opencl/buffer/*.cc",
             "buffer_inverse_transform.cc",
-            "buffer_transform.cc",
+            "buffer_transformer.cc",
             "lstm_cell.cc",
             "winograd_transform.cc",
         ],
diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc
index 7c733177..19b3289f 100644
--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -19,6 +19,7 @@
 #include "mace/core/operator.h"
 
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/activation.h"
 #endif  // MACE_ENABLE_OPENCL
 
@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
                                               "NOOP"));
     auto relux_max_limit = static_cast<T>(
         Operation::GetOptionalArg<float>("max_limit", 0.0f));
+    MemoryType mem_type;
     if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(
           new opencl::image::ActivationKernel<T>(type, relux_max_limit));
     } else {
       MACE_NOT_IMPLEMENTED;
     }
+    if (type == ActivationType::PRELU) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
   }
   MaceStatus Run(OpContext *context) override {
     const Tensor *input = this->Input(0);
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index f56a3a17..f127be42 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -237,12 +237,10 @@ void TestSimplePrelu() {
   if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
                             ops::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
-                            ops::BufferType::ARGUMENT);
 
     OpDefBuilder("Activation", "PreluTest")
         .Input("InputImage")
-        .Input("AlphaImage")
+        .Input("Alpha")
         .Output("OutputImage")
         .AddStringArg("activation", "PRELU")
         .Finalize(net.NewOperatorDef());
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index 07c00189..cf022d6a 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -19,6 +19,7 @@
 #include "mace/core/operator.h"
 #include "mace/ops/activation.h"
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/batch_norm.h"
 #endif  // MACE_ENABLE_OPENCL
 
@@ -147,12 +148,25 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
     ActivationType activation = ops::StringToActivationType(
         Operation::GetOptionalArg<std::string>("activation", "NOOP"));
     float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
+    MemoryType mem_type;
     if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::BatchNormKernel<T>(
           epsilon, activation, relux_max_limit));
     } else {
       MACE_NOT_IMPLEMENTED;
     }
+    // Transform filters
+    int input_size = operator_def_->input_size();
+    for (int i = 0; i < input_size; ++i) {
+      const Tensor *input_tensor = context->workspace()->GetTensor(
+          operator_def_->input(i));
+      if (input_tensor != nullptr && input_tensor->is_weight()) {
+        MACE_CHECK(TransformFilter<T>(
+            context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type)
+                       == MaceStatus::MACE_SUCCESS);
+      }
+    }
   }
   MaceStatus Run(OpContext *context) override {
     bool not_folded = this->InputSize() == 5;
diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc
index 9b528fa9..0b406dd1 100644
--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -19,6 +19,7 @@
 #include "mace/core/operator.h"
 #include "mace/ops/activation.h"
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/bias_add.h"
 #endif  // MACE_ENABLE_OPENCL
 
@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
       : Operation(context),
         data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
             "data_format", NHWC))) {
+    MemoryType mem_type;
     if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::BiasAddKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
   }
   MaceStatus Run(OpContext *context) override {
     const Tensor *input = this->Input(0);
diff --git a/mace/ops/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc
index 8cfd72b5..8482e255 100644
--- a/mace/ops/buffer_inverse_transform.cc
+++ b/mace/ops/buffer_inverse_transform.cc
@@ -51,7 +51,7 @@ class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
 
  private:
   const int wino_blk_size_;
-  std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
+  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
 };
 
 
diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc
index cb127880..1accbe21 100644
--- a/mace/ops/buffer_transform.cc
+++ b/mace/ops/buffer_transform.cc
@@ -15,8 +15,7 @@
 #include <memory>
 
 #include "mace/core/operator.h"
-#include "mace/ops/opencl/buffer/buffer_transform.h"
-#include "mace/ops/opencl/image/buffer_to_image.h"
+#include "mace/ops/opencl/buffer_transformer.h"
 
 namespace mace {
 namespace ops {
@@ -29,12 +28,16 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
  public:
   explicit BufferTransformOp(OpConstructContext *context)
       : Operation(context),
-        wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
+        wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)),
+        out_mem_type_(MemoryType::GPU_BUFFER),
+        transformer_(nullptr) {
+    MemoryType in_mem_type = context->workspace()->GetTensor(
+        operator_def_->input(0))->memory_type();
     if (context->device()->opencl_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::BufferToImage<T>);
-    } else {
-      kernel_.reset(new opencl::buffer::BufferTransform<T>);
+      out_mem_type_ = MemoryType::GPU_IMAGE;
     }
+    transformer_.reset(new OpenCLBufferTransformer<T>(in_mem_type,
+                                                      out_mem_type_));
   }
 
   MaceStatus Run(OpContext *context) override {
@@ -45,13 +48,14 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
         static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
             "buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
 
-    return kernel_->Compute(context, input, type,
-                            wino_blk_size_, output);
+    return transformer_->Transform(
+        context, input, type, wino_blk_size_, out_mem_type_, output);
   }
 
  private:
   const int wino_blk_size_;
-  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
+  MemoryType out_mem_type_;
+  std::unique_ptr<OpenCLBufferTransformer<T>> transformer_;
 };
 
 
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index 7bb213c0..bf5ebaa0 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -38,8 +38,9 @@
 #endif  // MACE_ENABLE_QUANTIZE
 
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/ops/opencl/image/conv_2d.h"
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/buffer/conv_2d.h"
+#include "mace/ops/opencl/image/conv_2d.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
@@ -960,11 +961,23 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
             Operation::GetOptionalArg<std::string>("activation",
                                                   "NOOP"))),
         relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {
+    MemoryType mem_type;
     if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::Conv2dKernel<T>);
     } else {
+      mem_type = MemoryType::GPU_BUFFER;
       kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
     }
+    // Transform filter tensor to target format
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::CONV2D_FILTER, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
+    if (operator_def_->input_size() > 2) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
   }
   MaceStatus Run(OpContext *context) override {
     const Tensor *input = this->Input(INPUT);
diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
index 1f1dd136..47f45e4a 100644
--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -34,8 +34,9 @@
 #include "mace/public/mace.h"
 #include "mace/utils/quantize.h"
 #ifdef MACE_ENABLE_OPENCL
-#include "mace/ops/opencl/image/depthwise_conv2d.h"
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/buffer/depthwise_conv2d.h"
+#include "mace/ops/opencl/image/depthwise_conv2d.h"
 #endif  // MACE_ENABLE_OPENCL
 
 namespace mace {
@@ -490,11 +491,23 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
  public:
   explicit DepthwiseConv2dOp(OpConstructContext *context)
       : DepthwiseConv2dOpBase(context) {
+    MemoryType mem_type;
     if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
     } else {
+      mem_type = MemoryType::GPU_BUFFER;
       kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
     }
+    // Transform filter tensor to target format
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::DW_CONV2D_FILTER, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
+    if (operator_def_->input_size() > 2) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
   }
   MaceStatus Run(OpContext *context) override {
     const Tensor *input = this->Input(INPUT);
diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc
index 29f0c5a7..96384cde 100644
--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -26,6 +26,7 @@
 #include "mace/core/tensor.h"
 #include "mace/utils/quantize.h"
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/eltwise.h"
 #endif  // MACE_ENABLE_OPENCL
 
@@ -1086,12 +1087,25 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
     float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0);
     int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
             "scalar_input_index", 1);
+    MemoryType mem_type;
     if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::EltwiseKernel<T>(
           type, coeff, scalar_input, scalar_input_index));
     } else {
       MACE_NOT_IMPLEMENTED;
     }
+    // Transform filters
+    int input_size = operator_def_->input_size();
+    for (int i = 0; i < input_size; ++i) {
+      const Tensor *input_tensor = context->workspace()->GetTensor(
+          operator_def_->input(i));
+      if (input_tensor != nullptr && input_tensor->is_weight()) {
+        MACE_CHECK(TransformFilter<T>(
+            context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type)
+                       == MaceStatus::MACE_SUCCESS);
+      }
+    }
   }
   MaceStatus Run(OpContext *context) override {
     const Tensor *input0 = this->Input(0);
diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc
index 3705e415..cea80bf5 100644
--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -27,6 +27,7 @@
 #endif  // MACE_ENABLE_QUANTIZE
 
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/fully_connected.h"
 #endif  // MACE_ENABLE_OPENCL
 
@@ -192,11 +193,22 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
  public:
   explicit FullyConnectedOp(OpConstructContext *context)
       : FullyConnectedOpBase(context) {
+    MemoryType mem_type;
     if (context->device()->opencl_runtime()->UseImageMemory()) {
+      mem_type = MemoryType::GPU_IMAGE;
       kernel_.reset(new opencl::image::FullyConnectedKernel<T>);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
+    // Transform filter tensor to target format
+    MACE_CHECK(TransformFilter<T>(
+        context, operator_def_.get(), 1, BufferType::WEIGHT_WIDTH, mem_type)
+                   == MaceStatus::MACE_SUCCESS);
+    if (operator_def_->input_size() > 2) {
+      MACE_CHECK(TransformFilter<T>(
+          context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
+                     == MaceStatus::MACE_SUCCESS);
+    }
   }
   MaceStatus Run(OpContext *context) override {
     const Tensor *input = this->Input(INPUT);
diff --git a/mace/ops/opencl/buffer/buffer_inverse_transform.h b/mace/ops/opencl/buffer/buffer_inverse_transform.h
index 647f2514..8b05bf5f 100644
--- a/mace/ops/opencl/buffer/buffer_inverse_transform.h
+++ b/mace/ops/opencl/buffer/buffer_inverse_transform.h
@@ -15,7 +15,7 @@
 #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
 #define MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
 
-#include "mace/ops/opencl/buffer_inverse_transform.h"
+#include "mace/ops/opencl/buffer_transformer.h"
 
 #include "mace/core/op_context.h"
 #include "mace/core/tensor.h"
@@ -34,7 +34,7 @@ MaceStatus BufferTypeTransform(
     Tensor *output);
 
 template <typename T>
-class BufferInverseTransform: public OpenCLBufferInverseTransformKernel {
+class BufferInverseTransform: public OpenCLBufferTransformKernel {
  public:
   MaceStatus Compute(OpContext *context,
                      const Tensor *input,
diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h
index 4919bb09..c9e31cfa 100644
--- a/mace/ops/opencl/buffer/buffer_transform.h
+++ b/mace/ops/opencl/buffer/buffer_transform.h
@@ -15,7 +15,7 @@
 #ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
 #define MACE_OPS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
 
-#include "mace/ops/opencl/buffer_transform.h"
+#include "mace/ops/opencl/buffer_transform_kernel.h"
 
 #include <vector>
 
diff --git a/mace/ops/opencl/buffer_transform.h b/mace/ops/opencl/buffer_transform_kernel.h
similarity index 83%
rename from mace/ops/opencl/buffer_transform.h
rename to mace/ops/opencl/buffer_transform_kernel.h
index 45808d40..5d4ff094 100644
--- a/mace/ops/opencl/buffer_transform.h
+++ b/mace/ops/opencl/buffer_transform_kernel.h
@@ -12,18 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
-#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
+#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_
+#define MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_
 
 #include "mace/ops/opencl/common.h"
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
-
 class OpContext;
 class Tensor;
-
 namespace ops {
 class OpenCLBufferTransformKernel {
  public:
@@ -32,10 +30,9 @@ class OpenCLBufferTransformKernel {
                              const BufferType type,
                              const int wino_blk_size,
                              Tensor *output) = 0;
-  MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel)
+ MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel)
 };
-
 }  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_OPS_OPENCL_BUFFER_TRANSFORM_H_
+#endif  // MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_
diff --git a/mace/ops/opencl/buffer_transformer.cc b/mace/ops/opencl/buffer_transformer.cc
new file mode 100644
index 00000000..e3b1b67b
--- /dev/null
+++ b/mace/ops/opencl/buffer_transformer.cc
@@ -0,0 +1,27 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/opencl/buffer_transformer.h"
+
+namespace mace {
+namespace ops {
+
+std::string TransformedName(const std::string &name) {
+  // TODO(liuqi): This may create a conflict.
+  const char *postfix = "_mace_identity_transformed";
+  return name + postfix;
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h
new file mode 100644
index 00000000..78f82177
--- /dev/null
+++ b/mace/ops/opencl/buffer_transformer.h
@@ -0,0 +1,126 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_
+#define MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_
+
+#include "mace/core/operator.h"
+#include "mace/ops/opencl/common.h"
+#include "mace/ops/opencl/image/buffer_to_image.h"
+#include "mace/ops/opencl/image/image_to_buffer.h"
+#include "mace/ops/opencl/buffer/buffer_transform.h"
+
+namespace mace {
+namespace ops {
+// Only used for GPU Operation(BufferTransform)
+template <typename T>
+class OpenCLBufferTransformer {
+ public:
+  OpenCLBufferTransformer(const MemoryType in_mem_type,
+                         const MemoryType out_mem_type) {
+    if (out_mem_type == MemoryType::GPU_IMAGE) {
+      kernel_.reset(new opencl::image::BufferToImage<T>);
+    } else if (in_mem_type == MemoryType::GPU_IMAGE){
+      kernel_.reset(new opencl::image::ImageToBuffer<T>);
+    } else {
+      kernel_.reset(new opencl::buffer::BufferTransform<T>);
+    }
+  }
+
+  MaceStatus Transform(OpContext *context,
+                       const Tensor *input,
+                       const BufferType type,
+                       const int wino_blk_size,
+                       const MemoryType out_mem_type,
+                       Tensor *output) {
+    Workspace *ws = context->workspace();
+    DataType dt = DataTypeToEnum<T>::value;
+    MemoryType in_mem_type = input->memory_type();
+    if (out_mem_type == MemoryType::GPU_IMAGE ||
+        out_mem_type == MemoryType::GPU_BUFFER) {
+      if (in_mem_type != MemoryType::CPU_BUFFER) {
+        return kernel_->Compute(
+            context, input, type, wino_blk_size, output);
+      } else {
+        // convert to the GPU Buffer with the input's data type.
+        Tensor *internal_tensor = ws->CreateTensor(
+            InternalTransformedName(input->name()),
+            context->device()->allocator(), input->dtype());
+        output->Resize(input->shape());
+        const uint8_t *input_ptr = input->data<uint8_t>();
+        Tensor::MappingGuard guard(internal_tensor);
+        uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
+        memcpy(internal_ptr, input_ptr, input->raw_size());
+        // convert the internal GPU Buffer to output.
+        return kernel_->Compute(
+            context, internal_tensor, type, wino_blk_size, output);
+      }
+    } else {  // out_mem_type == MemoryType::CPU_BUFFER
+      // convert to the GPU Buffer with the output's data type.
+      Tensor internal_tensor(context->device()->allocator(),
+                             dt,
+                             false,
+                             InternalTransformedName(input->name()));
+      MACE_RETURN_IF_ERROR(kernel_->Compute(
+          context, input, type, wino_blk_size, &internal_tensor));
+      // convert the internal GPU Buffer to output.
+      Tensor::MappingGuard guard(&internal_tensor);
+      const T *internal_ptr = internal_tensor.data<T>();
+      output->Resize(internal_tensor.shape());
+      T *output_ptr = output->mutable_data<T>();
+      memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
+      return MaceStatus::MACE_SUCCESS;
+    }
+  }
+
+ private:
+  std::string InternalTransformedName(const std::string &name) {
+    // TODO(liuqi): This may create a conflict.
+    const char *postfix = "_mace_identity_internal";
+    return name + postfix;
+  }
+
+ private:
+  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
+};
+
+std::string TransformedName(const std::string &name);
+
+template <typename T>
+MaceStatus TransformFilter(
+    mace::OpConstructContext *context,
+    OperatorDef *op_def,
+    const int input_idx,
+    const BufferType buffer_type,
+    const MemoryType mem_type) {
+  const DataType dt = DataTypeToEnum<T>::value;
+  OpContext op_context(context->workspace(), context->device());
+  Workspace *ws = context->workspace();
+  std::string input_name = op_def->input(input_idx);
+  Tensor *input = ws->GetTensor(input_name);
+  std::string output_name = TransformedName(input_name);
+  Tensor *output =
+      ws->CreateTensor(output_name, context->device()->allocator(), dt);
+
+  // update the information
+  op_def->set_input(input_idx, output_name);
+  input->MarkUnused();
+  return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
+      Transform(&op_context, input, buffer_type, 0, mem_type, output);
+}
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_
diff --git a/mace/ops/opencl/cl/eltwise.cl b/mace/ops/opencl/cl/eltwise.cl
index 931d0eca..6f352d4f 100644
--- a/mace/ops/opencl/cl/eltwise.cl
+++ b/mace/ops/opencl/cl/eltwise.cl
@@ -45,7 +45,7 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
   DATA_TYPE4 out;
 #if ELTWISE_TYPE == 0
   #ifdef COEFF_SUM
-    out = mad(coeff1, in0, mad(coeff0, in1, 0));
+    out = mad(coeff0, in0, mad(coeff1, in1, 0));
   #else
     out = in0 + in1;
   #endif
diff --git a/mace/ops/opencl/common.h b/mace/ops/opencl/common.h
index 0a238960..f0bf872e 100644
--- a/mace/ops/opencl/common.h
+++ b/mace/ops/opencl/common.h
@@ -29,7 +29,6 @@ enum BufferType {
   WEIGHT_HEIGHT = 7,
   WEIGHT_WIDTH = 8,
 };
-
 }  // namespace ops
 }  // namespace mace
 #endif  // MACE_OPS_OPENCL_COMMON_H_
diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h
index 64319721..e84691f8 100644
--- a/mace/ops/opencl/image/buffer_to_image.h
+++ b/mace/ops/opencl/image/buffer_to_image.h
@@ -15,7 +15,7 @@
 #ifndef MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
 #define MACE_OPS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
 
-#include "mace/ops/opencl/buffer_transform.h"
+#include "mace/ops/opencl/buffer_transform_kernel.h"
 
 #include <set>
 #include <string>
diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h
index 9aa65f0e..4200087e 100644
--- a/mace/ops/opencl/image/image_to_buffer.h
+++ b/mace/ops/opencl/image/image_to_buffer.h
@@ -20,7 +20,7 @@
 #include <vector>
 
 #include "mace/core/op_context.h"
-#include "mace/ops/opencl/buffer_inverse_transform.h"
+#include "mace/ops/opencl/buffer_transform_kernel.h"
 #include "mace/ops/opencl/helper.h"
 
 namespace mace {
@@ -29,7 +29,7 @@ namespace opencl {
 namespace image {
 
 template <typename T>
-class ImageToBuffer : public OpenCLBufferInverseTransformKernel {
+class ImageToBuffer : public OpenCLBufferTransformKernel {
  public:
   MaceStatus Compute(OpContext *context,
                      const Tensor *input,
diff --git a/mace/ops/transformer.cc b/mace/ops/transformer.cc
new file mode 100644
index 00000000..7df66ffa
--- /dev/null
+++ b/mace/ops/transformer.cc
@@ -0,0 +1,56 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/transformer.h"
+
+#include <string>
+#include <memory>
+
+namespace mace {
+namespace ops {
+
+std::unique_ptr<OperatorDef> Transformer::DoTransform(
+    mace::OperatorDef *op_def,
+    const int input_idx,
+    const mace::DataType dt,
+    const BufferType buffer_type,
+    const MemoryType mem_type) {
+  int32_t device = op_def->device_type();
+  std::string input_name = op_def->input(input_idx);
+  std::string output_name = input_name + "_transformed";
+
+  op_def->set_input(input_idx, output_name);
+  std::unique_ptr<OperatorDef> op(new OperatorDef);
+  op->set_name(output_name);
+  op->set_type("BufferTransform");
+  op->add_input(input_name);
+  op->add_output(output_name);
+  Argument *arg = op->add_arg();
+  arg->set_name("buffer_type");
+  arg->set_i(static_cast<int32_t>(buffer_type));
+  arg = op->add_arg();
+  arg->set_name("mem_type");
+  arg->set_i(static_cast<int32_t>(mem_type));
+  arg = op->add_arg();
+  arg->set_name("T");
+  arg->set_i(static_cast<int32_t>(dt));
+  arg = op->add_arg();
+  arg->set_name("device");
+  arg->set_i(device);
+
+  return std::move(op);
+}
+
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/opencl/buffer_inverse_transform.h b/mace/ops/transformer.h
similarity index 55%
rename from mace/ops/opencl/buffer_inverse_transform.h
rename to mace/ops/transformer.h
index 7f52a64f..67ecd60f 100644
--- a/mace/ops/opencl/buffer_inverse_transform.h
+++ b/mace/ops/transformer.h
@@ -12,30 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
-#define MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
+#ifndef MACE_KERNELS_TRANSFORMER_H_
+#define MACE_KERNELS_TRANSFORMER_H_
 
+#include "mace/core/transformer.h"
 #include "mace/ops/opencl/common.h"
-#include "mace/public/mace.h"
-#include "mace/utils/utils.h"
 
 namespace mace {
-
 class OpContext;
-class Tensor;
-
 namespace ops {
-class OpenCLBufferInverseTransformKernel {
+
+class Transformer : public TransformerBase {
  public:
-  virtual MaceStatus Compute(OpContext *context,
-                             const Tensor *input,
-                             const BufferType type,
-                             const int wino_blk_size,
-                             Tensor *output) = 0;
-  MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
+  // Transform source tensor to target.
+  std::vector<std::unique_ptr<OperatorDef>> ConstructTranformOp(
+      OperatorDef *op_def,
+      bool transform_filter = true) override;
+ private:
+  std::unique_ptr<OperatorDef> DoTransform(
+      mace::OperatorDef *op_def,
+      const int input_idx,
+      const mace::DataType dt,
+      const BufferType buffer_type,
+      const MemoryType mem_type);
 };
 
+
 }  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_OPS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_
+#endif  // MACE_KERNELS_TENSOR_TRANSFORMER_H_
-- 
GitLab